Example 1
import logging
import re

from google.appengine.api import urlfetch

# App-local module used below.
import util


def _Clean(url):
    """Clean the contents of a given URL to only the "readable part".

  Handle special cases like YouTube, PDF, images directly.  Delegate out to
  either extract content from the site's feed, or parse and clean the HTML.

  Args:
    url: String, the URL to the interesting content.

  Returns:
    Tuple of strings: (final URL after redirects, HTML of the "readable part").
  """
    # Handle de-facto standard "hash bang" URLs ( http://goo.gl/LNmg )
    url = url.replace('#!', '?_escaped_fragment_=')
    # Otherwise ignore fragments.
    url = re.sub(r'#.*', '', url)
    # And strip common tracking noise.
    url = re.sub(r'[?&]utm_[^&]+', '', url)

    match = re.search(r'^https?://docs\.google\.com.*cache:.*?:(.*?\.pdf)', url,
                      re.I)
    if match:
        url = match.group(1)
        if not url.startswith('http'):
            url = 'http://' + url

    match = re.search(r'^https?://docs\.google\.com.*docid=(.*?)(&|$)', url,
                      re.I)
    if match:
        _TrackClean('direct_google_docs')
        html = util.RenderTemplate('google-docs.html', {
            'docid': match.group(1),
            'url': url
        })
        return url, html

    if re.search(r'^https?://www\.youtube\.com/watch', url, re.I):
        _TrackClean('direct_youtube')
        video_id = re.search(r'v=([^&]+)', url).group(1)
        return url, util.RenderTemplate('youtube.html', {'video_id': video_id})
    elif re.search(r'\.pdf(\?|$)', url, re.I):
        _TrackClean('direct_pdf')
        return url, util.RenderTemplate('pdf.html', {'url': url})
    elif re.search(r'\.(gif|jpe?g|png)(\?|$)', url, re.I):
        _TrackClean('direct_image')
        return url, util.RenderTemplate('image.html', {'url': url})

    try:
        response, final_url = util.Fetch(url)
    except urlfetch.DownloadError as error:
        _TrackClean('error')
        logging.error(error)
        return url, u'Download error: %s' % error
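The three URL-normalization steps at the top of _Clean can be exercised in isolation. A minimal sketch; the standalone _NormalizeUrl helper is a hypothetical extraction for demonstration only:

import re

def _NormalizeUrl(url):
    # Hypothetical standalone copy of _Clean's normalization steps.
    url = url.replace('#!', '?_escaped_fragment_=')  # de-facto hash-bang convention
    url = re.sub(r'#.*', '', url)                    # otherwise drop fragments
    url = re.sub(r'[?&]utm_[^&]+', '', url)          # strip utm_* tracking noise
    return url

print(_NormalizeUrl('http://example.com/page#!/article/1'))
# http://example.com/page?_escaped_fragment_=/article/1
print(_NormalizeUrl('http://example.com/story?utm_source=feed#comments'))
# http://example.com/story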
Example 2
def get(self):
    types = ('direct_google_docs', 'direct_youtube', 'direct_trutv',
             'direct_pdf', 'direct_image', 'error', 'feed', 'content')
    stats = [(type, memcache.get('cleaned_%s' % type)) for type in types]
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write(
        util.RenderTemplate('main.html', {'stats': stats}))
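_TrackClean, which feeds the cleaned_%s counters this handler reads, does not appear in this listing. A plausible sketch using App Engine's memcache API; the body is an assumption, not the app's actual implementation:

from google.appengine.api import memcache

def _TrackClean(type):
    # Assumed implementation: bump the per-type counter that the stats
    # handler above reads; initial_value creates the key on first use.
    memcache.incr('cleaned_%s' % type, initial_value=0)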
Example 3
def PrintFeed(feed_entity, include_original=False):
  if not feed_entity.entries:
    feed_entity = {
        'title': feed_entity.title,
        'link': feed_entity.link,
        'entries': [_EMPTY_ENTRY],
        }
  return util.RenderTemplate(
      'feed.xml', {'feed': feed_entity, 'include_original': include_original})
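A hypothetical caller showing when the placeholder path triggers; FakeFeed is invented for the demo, while _EMPTY_ENTRY comes from the surrounding module:

class FakeFeed(object):
    # Stand-in for a parsed feed entity with no entries.
    title = 'Example feed'
    link = 'http://example.com/'
    entries = []

xml = PrintFeed(FakeFeed())  # renders feed.xml with the single _EMPTY_ENTRY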
Example 4
        _TrackClean('direct_pdf')
        return url, util.RenderTemplate('pdf.html', {'url': url})
    elif re.search(r'\.(gif|jpe?g|png)(\?|$)', url, re.I):
        _TrackClean('direct_image')
        return url, util.RenderTemplate('image.html', {'url': url})

    try:
        response, final_url = util.Fetch(url)
    except urlfetch.DownloadError as error:
        _TrackClean('error')
        logging.error(error)
        return url, u'Download error: %s' % error

    if 'application/pdf' == response.headers.get('content-type', None):
        _TrackClean('direct_pdf')
        return url, util.RenderTemplate('pdf.html', {'url': url})

    note = ''
    try:
        # Reddit pages advertise feeds that aren't the article content,
        # so skip straight to HTML extraction.
        if 'reddit.com/' in url:
            raise extract_feed.RssError
        extractor = extract_feed.FeedExtractor(url=url,
                                               final_url=final_url,
                                               html=response.content)
        note = 'cleaned feed'
        soup = extractor.soup
        tag = soup
        _TrackClean('feed')
    except extract_feed.RssError as e:
        note = 'cleaned content, %s, %s' % (e.__class__.__name__, e)
        soup, tag = extract_content.ExtractFromHtml(final_url,
                                                    response.content)
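The Content-Type comparison above is exact, so a header such as 'application/pdf; charset=binary' would slip past it. A more tolerant variant, offered as an assumption rather than a fix to the app's code:

def _IsPdfResponse(response):
    # Hypothetical helper: strip any '; charset=...' style parameters
    # before comparing the media type.
    content_type = response.headers.get('content-type', '')
    return content_type.split(';')[0].strip().lower() == 'application/pdf'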
Example 5
def get(self):
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write(util.RenderTemplate('main.html'))
Example 6
import logging
import re

# App-local modules used below.
import extract_content
import extract_feed
import util


def _Clean(url):
    """Clean the contents of a given URL to only the "readable part".

  Handle special cases like YouTube, PDF, images directly.  Delegate out to
  either extract content from the site's feed, or parse and clean the HTML.

  Args:
    url: String, the URL to the interesting content.

  Returns:
    String: HTML representing the "readable part".
  """
    # Handle de-facto standard "hash bang" URLs ( http://goo.gl/LNmg )
    url = url.replace('#!', '?_escaped_fragment_=')
    # Otherwise ignore fragments.
    url = re.sub(r'#.*', '', url)
    # And strip common tracking noise.
    url = re.sub(r'[?&]utm_[^&]+', '', url)

    match = re.search(r'^https?://docs\.google\.com.*cache:.*?:(.*?\.pdf)', url,
                      re.I)
    if match:
        url = match.group(1)
        if not url.startswith('http'):
            url = 'http://' + url

    match = re.search(r'^https?://docs\.google\.com.*docid=(.*?)(&|$)', url,
                      re.I)
    if match:
        _TrackClean('direct_google_docs')
        return util.RenderTemplate('google-docs.html', {
            'docid': match.group(1),
            'url': url
        })

    if re.search(r'^https?://www\.youtube\.com/watch', url, re.I):
        _TrackClean('direct_youtube')
        video_id = re.search(r'v=([^&]+)', url).group(1)
        return util.RenderTemplate('youtube.html', {'video_id': video_id})
    if re.search(r'^https?://www\.trutv\.com/video', url, re.I):
        _TrackClean('direct_trutv')
        video_id = re.search(r'(/video[^?#]+).html', url).group(1)
        return util.RenderTemplate('trutv.html', {'video_id': video_id})
    elif re.search(r'\.pdf(\?|$)', url, re.I):
        _TrackClean('direct_pdf')
        return util.RenderTemplate('pdf.html', {'url': url})
    elif re.search(r'\.(gif|jpe?g|png)(\?|$)', url, re.I):
        _TrackClean('direct_image')
        return util.RenderTemplate('image.html', {'url': url})

    html, final_url, error = util.Fetch(url)
    if error:
        _TrackClean('error')
        logging.error(error)
        return u'Download error: %s' % error

    note = ''
    try:
        extractor = extract_feed.FeedExtractor(url=url,
                                               final_url=final_url,
                                               html=html)
        note = 'cleaned feed'
        soup = extractor.soup
        tag = soup
        _TrackClean('feed')
    except extract_feed.RssError as e:
        note = 'cleaned content, %s, %s' % (e.__class__.__name__, e)
        soup, tag = extract_content.ExtractFromHtml(final_url, html)
        _TrackClean('content')
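Both revisions of _Clean end with the same two-stage strategy: try the site's feed first, since feed entries usually carry cleaner markup, and fall back to scraping the fetched HTML only when no usable feed is found. Condensed to its shape (the wrapper name is hypothetical; the calls follow the code above):

def _ExtractReadable(url, final_url, html):
    # Prefer the feed; on RssError, clean the raw page HTML instead.
    try:
        extractor = extract_feed.FeedExtractor(
            url=url, final_url=final_url, html=html)
        return extractor.soup, extractor.soup, 'feed'
    except extract_feed.RssError:
        soup, tag = extract_content.ExtractFromHtml(final_url, html)
        return soup, tag, 'content'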