def _cleanlist(self, listvids):
    """Normalize a list of scraped video dicts.

    For each dict (expected keys: 'url', 'thumb', 'label'):
    unescape HTML entities, make relative 'url'/'thumb' absolute
    against the host of ``self.url``, and keep only entries whose
    thumbnail ends in .jpg/.png/.jpeg.

    Returns a new list of dicts with keys 'url', 'thumb', 'label'.
    """
    resultlist = []
    # Base URL is loop-invariant: compute it once, not per video.
    parsed = urlparse.urlparse(self.url)
    vbase = parsed.scheme + '://' + parsed.netloc + '/'
    for vid in listvids:
        assert isinstance(vid, dict)
        # NOTE: the original called vid.setdefault(vid.keys()[0]) here,
        # a no-op on a non-empty dict (and an IndexError on an empty
        # one) -- removed.
        url = HTMLParser().unescape(vid.get('url'))
        thumb = HTMLParser().unescape(vid.get('thumb'))
        label = HTMLParser().unescape(vid.get('label'))
        if not url.startswith('http'):
            url = urlparse.urlparse(vbase + url.lstrip('/')).geturl()
        if not thumb.startswith('http'):
            thumb = urlparse.urlparse(vbase + thumb.lstrip('/')).geturl()
        # endswith accepts a tuple: one call instead of an 'or' chain.
        if thumb.endswith(('.jpg', '.png', '.jpeg')):
            resultlist.append(dict(url=url, thumb=thumb, label=label))
    return resultlist
def clean_html(value):
    """Sanitize an HTML fragment for appy.pod.

    Wraps *value* in <p>...</p> when it is not already wrapped (appy.pod
    generates no content otherwise), parses it with BeautifulSoup,
    unescapes special entities (e.g. non-breaking spaces) with
    HTMLParser, then runs it through lxml's Cleaner, stripping the
    <div>...</div> wrapper that Cleaner.clean_html adds.

    Returns the cleaned markup; an empty/None *value* is returned
    unchanged.
    """
    # The original guard was ``if clean_html and value`` -- but inside
    # the function the name ``clean_html`` is the function object itself,
    # which is always truthy, so only ``value`` matters.
    if value:
        # we need a surrounding <p></p> or the content is not generated
        # by appy.pod
        if not value.startswith(u'<p>') or not value.endswith(u'</p>'):
            value = u'<p>%s</p>' % value
        soup = BeautifulSoup(safe_unicode(value))
        soup_contents = soup.renderContents()
        if not isinstance(soup_contents, unicode):
            soup_contents = safe_unicode(soup_contents)
        # clean HTML with HTMLParser, it will remove special entities
        # like non-breaking spaces
        soup_contents = HTMLParser().unescape(soup_contents)
        # clean HTML with lxml Cleaner
        cleaner = Cleaner()
        soup_contents = cleaner.clean_html(soup_contents)
        # clean_html surrounds the cleaned HTML with <div>...</div>:
        # remove it!
        if soup_contents.startswith(u'<div>') and soup_contents.endswith(
                u'</div>'):
            soup_contents = soup_contents[5:-6]
        if soup_contents != value:
            value = soup_contents
    return value
def magnet2resp(magnet_u, url_discovery='unknown', info={}, webcache=True, allow_missing=True): "Return a Response and known_data with all the info from the magnet link" for x, y in [('&', '&'), ('<', '<'), ('>', '>')]: magnet_u = magnet_u.replace(x, y) parts = parse_qs(magnet_u[len('magnet:?'):]) # extract the sections try: if "&" in parts['dn'][0]: magnet = HTMLParser().unescape(unquote(magnet_u)).encode('utf-8') else: magnet = magnet_u.encode( 'utf-8') # parse_q doesn't work with unicodes magnet = unquote(magnet) except: print "err" magnet = magnet_u.encode('utf-8') # parse_q doesn't work with unicodes magnet = unquote(magnet) if not magnet.startswith('magnet:?'): raise RuntimeError('Does not look like a magnet link: %s' % magnet) parts = parse_qs(magnet[len('magnet:?'):]) # extract the sections xt = parts['xt'][0] if not xt.startswith('urn:btih:'): raise RuntimeError('Magnet link in unexpected format: %s' % xt) # urn:btih -> urn:sha1 btih = xt.split(':')[-1].upper() try: bth_32 = base64.b32encode(base64.b16decode(btih)) bth_16 = btih except TypeError: # backwards compatibility with clients that use a Base32 hash bth_32 = btih bth_16 = base64.b16encode(base64.b32decode(btih)) # Get extra info from torcache if webcache: for cache_site in ['torcache.net', 'zoink.it']: # We could also use torra.ws, but the result is not gzipped try: url = 'http://%s/torrent/%s.torrent' % (cache_site, bth_16) data = StringIO(urllib2.urlopen(url, timeout=1).read()) info_webcache = torrent_info(GzipFile(fileobj=data).read()) info_webcache.pop('comment', None) # it's useless log.msg('Got extra info from %s!' 
% cache_site) break except Exception as e: # TODO: be less inclusive log.msg('Error when asking %s: %s' % (cache_site, e), log.WARNING) else: # none of the cache sites worked info_webcache = {} else: info_webcache = {} # Find its name if 'dn' in parts: #~ cdt = chardet.detect(parts['dn'][0])["encoding"] fname = parts['dn'][0].decode('utf-8') elif 'filedir' in info_webcache: fname = info_webcache['filedir'] elif 'filepaths' in info_webcache: fname = info_webcache['filepaths'] else: # buuuh, a magnet with no name! message = 'Magnet link has no name ("dn"): %s' % magnet if allow_missing: log.msg(message, log.WARNING) fname = '' else: raise RuntimeError(message) # Get its size if possible if 'size' in info_webcache: size = info_webcache.pop('size') else: size = 0 # we don't know its size # Get all the trackers that make sense trackers = set() if 'trackers' in info_webcache: trackers |= set(info_webcache.pop('trackers').split()) if 'tr' in parts: trackers |= set(parts['tr']) if not trackers: # no trackers? what kind of a magnet is that, buddy? message = 'Magnet link has no trackers ("tr"): %s' % magnet if allow_missing: log.msg(message, log.WARNING) else: raise RuntimeError(message) # Store all the information and get ready to return it info_local = info.copy() info_local.update({'torrent:%s' % k: v for k, v in info_webcache.items()}) info_local['torrent:trackers'] = ' '.join(trackers) known_data = [fname, size, info_local] # Hack. We put the BTH as the "url" and it will appear in the log: # 7:K7RBZRI5OXRIPBCWVMPSEEH4NJR6PG2V # or something like that meta = {'url_discovery': url_discovery, 'info': {}, 'url4mysql': magnet} fake_response = HtmlResponse(url=bth_32, request=Request('http://x.y', meta=meta)) return fake_response, known_data