def _Clean(url):
  """Clean the contents of a given URL to only the "readable part".

  Handle special cases like YouTube, PDF, images directly. Delegate out to
  either extract content from the site's feed, or parse and clean the HTML.

  Args:
    url: String, the URL to the interesting content.

  Returns:
    Tuple of strings: (final URL after redirects, HTML of the "readable part").
  """
  # Handle de-facto standard "hash bang" URLs ( http://goo.gl/LNmg )
  url = url.replace('#!', '?_escaped_fragment_=')
  # Otherwise ignore fragments.
  url = re.sub(r'#.*', '', url)
  # And strip common tracking noise.
  url = re.sub(r'[?&]utm_[^&]+', '', url)

  # Google Docs viewer wrapping a cached PDF: unwrap to the PDF itself.
  # (Dots escaped so the pattern matches only docs.google.com, not e.g.
  # docsXgoogleXcom.)
  match = re.search(r'^https?://docs\.google\.com.*cache:.*?:(.*?\.pdf)',
                    url, re.I)
  if match:
    url = match.group(1)
    if 'http' not in url:
      url = 'http://' + url

  # Native Google Docs documents get a dedicated embed template.
  match = re.search(r'^https?://docs\.google\.com.*docid=(.*?)(&|$)',
                    url, re.I)
  if match:
    _TrackClean('direct_google_docs')
    html = util.RenderTemplate('google-docs.html',
                               {'docid': match.group(1), 'url': url})
    return url, html

  if re.search(r'^https?://www\.youtube\.com/watch', url, re.I):
    _TrackClean('direct_youtube')
    video_id = re.search(r'v=([^&]+)', url).group(1)
    return url, util.RenderTemplate('youtube.html', {'video_id': video_id})
  elif re.search(r'\.pdf(\?|$)', url, re.I):
    _TrackClean('direct_pdf')
    return url, util.RenderTemplate('pdf.html', {'url': url})
  elif re.search(r'\.(gif|jpe?g|png)(\?|$)', url, re.I):
    _TrackClean('direct_image')
    return url, util.RenderTemplate('image.html', {'url': url})

  try:
    response, final_url = util.Fetch(url)
  # `as` syntax (Python 2.6+/3) for consistency with the rest of the file.
  except urlfetch.DownloadError as error:
    _TrackClean('error')
    logging.error(error)
    return url, u'Download error: %s' % error
def get(self):
  """Render the main page with per-category counts of cleaned URLs.

  Reads one memcache counter per cleaning category and passes the
  (category, count) pairs to the template; a missing counter renders
  as None.
  """
  categories = ('direct_google_docs', 'direct_youtube', 'direct_trutv',
                'direct_pdf', 'direct_image', 'error', 'feed', 'content')
  # `category` instead of `type`: avoid shadowing the builtin.
  stats = [(category, memcache.get('cleaned_%s' % category))
           for category in categories]
  self.response.headers['Content-Type'] = 'text/html'
  self.response.out.write(
      util.RenderTemplate('main.html', {'stats': stats}))
def PrintFeed(feed_entity, include_original=False):
  """Render a feed entity as feed.xml.

  If the feed has no entries, substitute a single placeholder entry so the
  output is still a valid feed.

  Args:
    feed_entity: Feed object with .title, .link and .entries attributes.
    include_original: Bool, whether the template should include the
        original content.

  Returns:
    String, the rendered feed XML.
  """
  if not feed_entity.entries:
    feed_entity = {
        'title': feed_entity.title,
        # Bug fix: previously set to feed_entity.title, which put the feed's
        # title where its URL belongs.
        'link': feed_entity.link,
        'entries': [_EMPTY_ENTRY],
    }
  return util.RenderTemplate(
      'feed.xml', {'feed': feed_entity, 'include_original': include_original})
# NOTE(review): mid-function fragment of _Clean (the tuple-returning version):
# it begins inside an if/elif chain whose opening `if` is not visible here, so
# it cannot be reconstructed as a standalone unit. Visible behavior: direct
# PDF/image dispatch by URL suffix, fetch via util.Fetch with DownloadError
# handling, a content-type PDF check, then feed extraction (forced to the
# HTML-content path for reddit.com) falling back to extract_content on
# RssError. Mixes `except E, e` and `except E as e` syntax — presumably a
# partial Python 2.6+ migration; confirm before normalizing.
_TrackClean('direct_pdf') return url, util.RenderTemplate('pdf.html', {'url': url}) elif re.search(r'\.(gif|jpe?g|png)(\?|$)', url, re.I): _TrackClean('direct_image') return url, util.RenderTemplate('image.html', {'url': url}) try: response, final_url = util.Fetch(url) except urlfetch.DownloadError, error: _TrackClean('error') logging.error(error) return url, u'Download error: %s' % error if 'application/pdf' == response.headers.get('content-type', None): _TrackClean('direct_pdf') return url, util.RenderTemplate('pdf.html', {'url': url}) note = '' try: if 'reddit.com/' in url: raise extract_feed.RssError extractor = extract_feed.FeedExtractor(url=url, final_url=final_url, html=response.content) note = 'cleaned feed' soup = extractor.soup tag = soup _TrackClean('feed') except extract_feed.RssError as e: note = 'cleaned content, %s, %s' % (e.__class__.__name__, e) soup, tag = extract_content.ExtractFromHtml(final_url, response.content)
def get(self):
  """Serve the main page as HTML."""
  body = util.RenderTemplate('main.html')
  self.response.headers['Content-Type'] = 'text/html'
  self.response.out.write(body)
def _Clean(url):
  """Clean the contents of a given URL to only the "readable part".

  Handle special cases like YouTube, PDF, images directly. Delegate out to
  either extract content from the site's feed, or parse and clean the HTML.

  Args:
    url: String, the URL to the interesting content.

  Returns:
    String: HTML representing the "readable part".
  """
  # Handle de-facto standard "hash bang" URLs ( http://goo.gl/LNmg )
  url = url.replace('#!', '?_escaped_fragment_=')
  # Otherwise ignore fragments.
  url = re.sub(r'#.*', '', url)
  # And strip common tracking noise.
  url = re.sub(r'[?&]utm_[^&]+', '', url)

  # Google Docs viewer wrapping a cached PDF: unwrap to the PDF itself.
  # (Dots escaped so the pattern matches only docs.google.com.)
  match = re.search(r'^https?://docs\.google\.com.*cache:.*?:(.*?\.pdf)',
                    url, re.I)
  if match:
    url = match.group(1)
    if 'http' not in url:
      url = 'http://' + url

  # Native Google Docs documents get a dedicated embed template.
  match = re.search(r'^https?://docs\.google\.com.*docid=(.*?)(&|$)',
                    url, re.I)
  if match:
    _TrackClean('direct_google_docs')
    return util.RenderTemplate('google-docs.html',
                               {'docid': match.group(1), 'url': url})

  if re.search(r'^http://www\.youtube\.com/watch', url, re.I):
    _TrackClean('direct_youtube')
    video_id = re.search(r'v=([^&]+)', url).group(1)
    return util.RenderTemplate('youtube.html', {'video_id': video_id})
  if re.search(r'^http://www\.trutv\.com/video', url, re.I):
    _TrackClean('direct_trutv')
    video_id = re.search(r'(/video[^?#]+).html', url).group(1)
    return util.RenderTemplate('trutv.html', {'video_id': video_id})
  elif re.search(r'\.pdf(\?|$)', url, re.I):
    _TrackClean('direct_pdf')
    return util.RenderTemplate('pdf.html', {'url': url})
  elif re.search(r'\.(gif|jpe?g|png)(\?|$)', url, re.I):
    _TrackClean('direct_image')
    return util.RenderTemplate('image.html', {'url': url})

  html, final_url, error = util.Fetch(url)
  if error:
    _TrackClean('error')
    logging.error(error)
    return u'Download error: %s' % error

  note = ''
  try:
    extractor = extract_feed.FeedExtractor(
        url=url, final_url=final_url, html=html)
    note = 'cleaned feed'
    soup = extractor.soup
    tag = soup
    _TrackClean('feed')
  # `as` syntax (Python 2.6+/3) for consistency with the rest of the file.
  # The string literal here was split across lines by extraction garbling;
  # restored to match the identical literal used elsewhere in this file.
  except extract_feed.RssError as e:
    note = 'cleaned content, %s, %s' % (e.__class__.__name__, e)
    soup, tag = extract_content.ExtractFromHtml(final_url, html)
    _TrackClean('content')