Example #1
# Python 2 example: HTMLParser and urlparse are the Py2 module names.
from HTMLParser import HTMLParser
import urlparse


def _cleanlist(self, listvids):
    """Unescape HTML entities and absolutize the url/thumb of each video
    dict, keeping only entries whose thumbnail looks like an image file."""
    resultlist = []
    for vid in listvids:
        assert isinstance(vid, dict)
        url = HTMLParser().unescape(vid.get('url'))
        thumb = HTMLParser().unescape(vid.get('thumb'))
        label = HTMLParser().unescape(vid.get('label'))
        upr = urlparse.urlparse(self.url)
        vbase = upr.scheme + '://' + upr.netloc + '/'
        # Turn relative links into absolute ones based on self.url.
        if not url.startswith('http'):
            url = urlparse.urlparse(vbase + url.lstrip('/')).geturl()
        if not thumb.startswith('http'):
            thumb = urlparse.urlparse(vbase + thumb.lstrip('/')).geturl()
        if thumb.endswith(('.jpg', '.png', '.jpeg')):
            resultlist.append(dict(url=url, thumb=thumb, label=label))
    return resultlist
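
A minimal usage sketch (hedged: the host class, its url attribute, and the sample data are assumptions for illustration, not part of the original source):

class _Demo(object):
    # Hypothetical host class; it only provides the self.url the method reads.
    url = 'http://example.com/videos'
    _cleanlist = _cleanlist  # bind the function above as a method

vids = [{'url': '/watch?v=1', 'thumb': '/img/1.jpg', 'label': 'Clip &amp; Song'}]
print _Demo()._cleanlist(vids)
# -> [{'url': 'http://example.com/watch?v=1',
#      'thumb': 'http://example.com/img/1.jpg',
#      'label': u'Clip & Song'}]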
Example #2
# Python 2 / Plone example: BeautifulSoup 3, lxml's Cleaner and Plone's
# safe_unicode helper are assumed to be available.
from BeautifulSoup import BeautifulSoup
from HTMLParser import HTMLParser
from lxml.html.clean import Cleaner
from Products.CMFPlone.utils import safe_unicode


def clean_html(value):
    """Clean an HTML snippet with BeautifulSoup, HTMLParser and lxml's Cleaner."""
    if value:
        # we need a surrounding <p></p> or the content is not generated by appy.pod
        if not value.startswith(u'<p>') or not value.endswith(u'</p>'):
            value = u'<p>%s</p>' % value
        soup = BeautifulSoup(safe_unicode(value))
        soup_contents = soup.renderContents()
        if not isinstance(soup_contents, unicode):
            soup_contents = safe_unicode(soup_contents)
        # clean HTML with HTMLParser, it will resolve special entities like &#xa0;
        soup_contents = HTMLParser().unescape(soup_contents)
        # clean HTML with lxml Cleaner
        cleaner = Cleaner()
        soup_contents = cleaner.clean_html(soup_contents)
        # clean_html may surround the cleaned HTML with <div>...</div>: remove it!
        if soup_contents.startswith(u'<div>') and soup_contents.endswith(u'</div>'):
            soup_contents = soup_contents[5:-6]
        if soup_contents != value:
            value = soup_contents
    return value
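
A quick, hedged smoke test (the input string and the expected result are illustrative assumptions; the exact output depends on the BeautifulSoup and lxml versions installed):

raw = u'Caf\xe9&#xa0;menu <script>alert(1)</script>'
print repr(clean_html(raw))
# Expected along the lines of u'<p>Caf\xe9\xa0menu </p>': the value gets
# wrapped in <p>...</p>, the &#xa0; entity is resolved to a real character,
# and lxml's Cleaner strips the <script> tag.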
Example #3
# Python 2 example. base64, urllib2, urlparse, StringIO and gzip come from
# the standard library; HtmlResponse, Request and log come from (old-style)
# Scrapy. torrent_info is assumed to be a project-local helper that decodes
# a .torrent file into a dict of metadata.
import base64
import urllib2
from gzip import GzipFile
from HTMLParser import HTMLParser
from StringIO import StringIO
from urllib import unquote
from urlparse import parse_qs

from scrapy import log
from scrapy.http import HtmlResponse, Request


def magnet2resp(magnet_u,
                url_discovery='unknown',
                info=None,
                webcache=True,
                allow_missing=True):
    "Return a Response and known_data with all the info from the magnet link"
    info = info or {}  # avoid a mutable default argument

    for x, y in [('&amp;', '&'), ('&lt;', '<'), ('&gt;', '>')]:
        magnet_u = magnet_u.replace(x, y)

    parts = parse_qs(magnet_u[len('magnet:?'):])  # extract the sections
    try:
        if "&" in parts['dn'][0]:
            # the display name still carries HTML entities: fully unescape
            magnet = HTMLParser().unescape(unquote(magnet_u)).encode('utf-8')
        else:
            magnet = magnet_u.encode('utf-8')  # parse_qs doesn't work with unicode
            magnet = unquote(magnet)
    except Exception:
        log.msg('Could not inspect the "dn" field of %s' % magnet_u, log.WARNING)
        magnet = magnet_u.encode('utf-8')  # parse_qs doesn't work with unicode
        magnet = unquote(magnet)

    if not magnet.startswith('magnet:?'):
        raise RuntimeError('Does not look like a magnet link: %s' % magnet)
    parts = parse_qs(magnet[len('magnet:?'):])  # extract the sections

    xt = parts['xt'][0]
    if not xt.startswith('urn:btih:'):
        raise RuntimeError('Magnet link in unexpected format: %s' % xt)

    # The BitTorrent info-hash (urn:btih) may come Base16- or Base32-encoded;
    # compute both representations of the SHA1 hash.
    btih = xt.split(':')[-1].upper()
    try:
        bth_32 = base64.b32encode(base64.b16decode(btih))
        bth_16 = btih
    except TypeError:
        # backwards compatibility with clients that use a Base32 hash
        bth_32 = btih
        bth_16 = base64.b16encode(base64.b32decode(btih))

    # Get extra info from torcache
    if webcache:
        for cache_site in ['torcache.net', 'zoink.it']:
            # We could also use torra.ws, but the result is not gzipped
            try:
                url = 'http://%s/torrent/%s.torrent' % (cache_site, bth_16)
                data = StringIO(urllib2.urlopen(url, timeout=1).read())
                info_webcache = torrent_info(GzipFile(fileobj=data).read())
                info_webcache.pop('comment', None)  # it's useless
                log.msg('Got extra info from %s!' % cache_site)
                break
            except Exception as e:  # TODO: be less inclusive
                log.msg('Error when asking %s: %s' % (cache_site, e),
                        log.WARNING)
        else:  # none of the cache sites worked
            info_webcache = {}
    else:
        info_webcache = {}

    # Find its name
    if 'dn' in parts:
        #~ cdt = chardet.detect(parts['dn'][0])["encoding"]
        fname = parts['dn'][0].decode('utf-8')
    elif 'filedir' in info_webcache:
        fname = info_webcache['filedir']
    elif 'filepaths' in info_webcache:
        fname = info_webcache['filepaths']
    else:  # buuuh, a magnet with no name!
        message = 'Magnet link has no name ("dn"): %s' % magnet
        if allow_missing:
            log.msg(message, log.WARNING)
            fname = ''
        else:
            raise RuntimeError(message)

    # Get its size if possible
    if 'size' in info_webcache:
        size = info_webcache.pop('size')
    else:
        size = 0  # we don't know its size

    # Get all the trackers that make sense
    trackers = set()
    if 'trackers' in info_webcache:
        trackers |= set(info_webcache.pop('trackers').split())
    if 'tr' in parts:
        trackers |= set(parts['tr'])
    if not trackers:  # no trackers? what kind of a magnet is that, buddy?
        message = 'Magnet link has no trackers ("tr"): %s' % magnet
        if allow_missing:
            log.msg(message, log.WARNING)
        else:
            raise RuntimeError(message)

    # Store all the information and get ready to return it
    info_local = info.copy()
    info_local.update({'torrent:%s' % k: v for k, v in info_webcache.items()})
    info_local['torrent:trackers'] = ' '.join(trackers)

    known_data = [fname, size, info_local]

    # Hack. We put the BTH as the "url" and it will appear in the log:
    #   7:K7RBZRI5OXRIPBCWVMPSEEH4NJR6PG2V
    # or something like that
    meta = {'url_discovery': url_discovery, 'info': {}, 'url4mysql': magnet}
    fake_response = HtmlResponse(url=bth_32,
                                 request=Request('http://x.y', meta=meta))

    return fake_response, known_data
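
A hedged usage sketch; the magnet URI below is a made-up placeholder (the 40 hex characters are not a real info-hash), and webcache=False keeps the call offline:

magnet = ('magnet:?xt=urn:btih:0123456789ABCDEF0123456789ABCDEF01234567'
          '&dn=Some+File&tr=udp%3A%2F%2Ftracker.example.org%3A80')
response, (fname, size, info_local) = magnet2resp(magnet, webcache=False)
print fname, size, response.url
# fname comes from the "dn" field, size stays 0 without the web cache, and
# response.url holds the Base32 form of the info-hash.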