Example #1
def OEmbedFixup(soup):
    oembed_links = soup.findAll(
        'a', {'onclick': re.compile(r'^oEmbedManagerVideoLoader')})
    for oembed_link in oembed_links:
        cont = oembed_link.parent
        embed = cont.find('iframe')
        if not embed:
            ta = cont.find('textarea')
            if not ta: continue  # nothing to recover for this link
            s = BeautifulSoup.BeautifulSoup(ta.text)
            embed = s.find('iframe')
        embed['src'] = re.sub(r'\?.*', '', embed['src'])
        div = BeautifulSoup.Tag(soup, 'div')
        div.insert(0, embed)
        cont.replaceWith(div)
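OEmbedFixup strips the click-to-load wrapper and keeps only a bare iframe with a query-less src. A minimal Python 2 sketch of invoking it, assuming the BeautifulSoup 3 module is importable as BeautifulSoup (the sample markup and onclick value are illustrative; they only need to match the regex above):

import re
import BeautifulSoup

html = ('<div><a onclick="oEmbedManagerVideoLoader(42)">Play</a>'
        '<iframe src="http://example.com/embed?autoplay=1"></iframe></div>')
soup = BeautifulSoup.BeautifulSoup(html)
OEmbedFixup(soup)
# The wrapper is now <div><iframe src="http://example.com/embed"></iframe></div>
print soup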
def _Munge(soup, tag, url):
    """Given a string of HTML content, munge it to be more pleasing."""
    # In certain failure cases, we'll still get a string.  Just use it.
    if isinstance(tag, basestring):
        return tag

    _MungeStripSiteSpecific(tag, url)
    _MungeStripBrsAfterPs(tag)
    _MungeStripRules(tag)
    _MungeStripEmpties(tag)
    tag = _MungeStripRootContainers(tag)
    _MungeStripLowScored(tag)
    _MungeStripAttrs(tag)

    _FixUrls(tag, url)
    _MungeImages(tag)
    _MungeHeaderDowngrade(tag)
    _MungeHyphenate(tag)
    _MungeNoscript(tag)
    tag = _MungeTransformEmbeds(soup, tag)

    # Serialize the tag, and apply full justification.
    if isinstance(tag, BeautifulSoup.BeautifulStoneSoup):
        # Wrap in a div, to have a tag to justify, if necessary.
        wrap = BeautifulSoup.Tag(soup, 'div')
        wrap.insert(0, tag)
        tag = wrap
    tag['style'] = 'text-align: justify;'

    return unicode(tag)
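_Munge is the pipeline driver: destructive stripping passes run first, then cosmetic passes, then serialization with forced justification. A hedged sketch of the intended call order, reusing ExtractFromHtml from Example #6 below (url and html are assumed to be already fetched):

soup, tag = ExtractFromHtml(url, html)  # parse and pick the content node
clean = _Munge(soup, tag, url)          # returns justified HTML as unicode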
Example #3
def _ExtractFromHtmlGeneric(url, html):
  html = util.PreCleanHtml(html)
  try:
    soup = BeautifulSoup.BeautifulSoup(html)
  except HTMLParser.HTMLParseError, e:
    logging.exception(e)
    # 'soup' was never bound if parsing raised; return an empty soup instead.
    return BeautifulSoup.BeautifulSoup(), u''
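Under BeautifulSoup 3.1, which parses with Python 2's strict HTMLParser, malformed markup can raise HTMLParseError; the guard above logs it and degrades to an empty result. A standalone illustration of that failure mode (the input is just a contrived bad start tag):

import HTMLParser
import BeautifulSoup

try:
    BeautifulSoup.BeautifulSoup('<p <p>oops')
except HTMLParser.HTMLParseError, e:
    print 'parse failed:', e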
Example #4
def _TransformBrsToParagraphsInner(soup, tag):
  next = tag
  while True:
    next = next.nextSibling
    if not next:
      return
    if isinstance(next, BeautifulSoup.Tag):
      if next.name == 'br':
        break
      else:
        return
    elif isinstance(next, BeautifulSoup.NavigableString):
      if not unicode(next).strip():
        continue
      else:
        return

  contents = []
  prev = tag
  while True:
    prev = prev.previousSibling
    if not prev: break
    if hasattr(prev, 'name') and prev.name in BR_TO_P_STOP_TAGS: break
    contents.insert(0, prev)

  newp = BeautifulSoup.Tag(soup, 'p')
  for i, newtag in enumerate(contents):
    newp.insert(i, newtag)
  next.extract()
  tag.replaceWith(newp)
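This helper turns a double-<br> run into a real paragraph: it fires only when tag is a <br> followed (over whitespace) by another <br>, wraps everything back to the nearest stop tag in a new <p>, and drops both breaks. A small Python 2 sketch, with BR_TO_P_STOP_TAGS stubbed in as a stand-in for the module constant:

import BeautifulSoup

BR_TO_P_STOP_TAGS = ('p', 'div', 'blockquote')  # hypothetical stand-in

soup = BeautifulSoup.BeautifulSoup(
    '<div>first line<br /><br />second line</div>')
_TransformBrsToParagraphsInner(soup, soup.find('br'))
print soup  # <div><p>first line</p>second line</div>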
def _MungeTransformEmbeds(soup, root_tag):
    for tag in util.FindEmbeds(root_tag):
        try:
            w, h = util.TagSize(tag)  # size of this embed, not the container
        except TypeError:
            w = 600
            h = 400
        link = BeautifulSoup.Tag(soup, 'a')
        link['href'] = 'data:text/html;base64,' + base64.b64encode(
            '<body style="margin:0;">%s</body>' % unicode(tag))
        link['rel'] = 'embedded_media'
        link['embed_width'] = w
        link['embed_height'] = h
        img = BeautifulSoup.Tag(soup, 'img')
        img['src'] = 'http://readability-api.appspot.com/embedded_media.png'
        img['width'] = '128'
        img['height'] = '128'
        link.insert(0, img)
        if tag == root_tag:
            return link
        tag.replaceWith(link)
    return root_tag
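The replacement anchor carries the original embed out-of-band as a base64 data: URI in its href, so a client can decode it back into live markup later. The encoding step in isolation, as a plain Python 2 sketch (the sample iframe is illustrative):

import base64

embed_html = '<iframe src="http://example.com/player"></iframe>'
href = 'data:text/html;base64,' + base64.b64encode(
    '<body style="margin:0;">%s</body>' % embed_html)
print href
# The markup comes back via base64.b64decode(href.split(',', 1)[1]).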
Example #6
def ExtractFromHtml(url, html):
    """Given a string of HTML, remove nasty bits, score and pick bit to keep."""
    if re.search(r'^http://(www\.)?reddit\.com/.*/comments/', url, re.I):
        strainer = BeautifulSoup.SoupStrainer(
            attrs={'class': re.compile(r'thing.*link|usertext border')})
        soup = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=strainer)
        body = soup.find(attrs={'class': re.compile(r'\busertext-body\b')})
        if not body:
            body = soup.find('a', attrs={'class': re.compile(r'\btitle\b')})
            body = body and body.text or soup
        return soup, body
    elif re.search(r'^http://(www\.)?xkcd\.com/\d+', url, re.I):
        soup = BeautifulSoup.BeautifulSoup(html)
        img = soup.find(alt=True, title=True)
        cont = img.parent.parent
        for tag in cont.findAll(('br', 'div')):
            util.Strip(tag)
        return soup, cont
    elif re.search(r'^http://groups\.google\.com/', url, re.I):
        strainer = BeautifulSoup.SoupStrainer(attrs={'class': 'maincontbox'})
        soup = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=strainer)
        return _ExtractFromHtmlGeneric(url, unicode(soup))
    elif re.search(r'^http://(www\.)?nytimes\.com/', url, re.I):
        soup = BeautifulSoup.BeautifulSoup(html)
        bodies = soup.findAll(attrs={'class': 'articleBody'})
        if bodies:
            # Put the first "article body" contents into the second -- for some
            # reason NYT splits the lead-in text into its own "body".
            while bodies[0].contents:
                bodies[1].insert(0, bodies[0].contents[-1])
        return _ExtractFromHtmlGeneric(url, unicode(soup))
    elif re.search(r'\.txt(\?|$)', url, re.I):
        soup = BeautifulSoup.BeautifulSoup()
        pre = BeautifulSoup.Tag(soup, 'pre')
        pre.insert(0, BeautifulSoup.NavigableString(html))
        soup.insert(0, pre)
        return soup, soup
    else:
        return _ExtractFromHtmlGeneric(url, html)
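ExtractFromHtml is the dispatch point: site-specific scrapers for reddit, xkcd, Google Groups, and nytimes.com, a plain-text branch for .txt URLs, and the generic extractor otherwise. A hedged end-to-end sketch, using the three-value util.Fetch seen in the constructor snippet below:

html, final_url, _ = util.Fetch('http://xkcd.com/100/')
soup, body = ExtractFromHtml(final_url, html)
print _Munge(soup, body, final_url)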
def _MungeHyphenate(root_tag):
    for text in root_tag.findAll(text=True):
        if text.findParent('pre'):
            continue
        text_parts = re.split(r'(&[^;]{2,6};)', text)
        new_text = []
        for text_part in text_parts:
            if not text_part:
                continue
            if '&' == text_part[0]:
                new_text.append(text_part)
            else:
                words = re.split(r'\s+', text_part)
                # u'\xad' is the Unicode soft hyphen -- only two UTF-8 bytes
                # in the output, and it doesn't clutter up the source view!
                words = [
                    u'\xad'.join(hyphenate.hyphenate_word(word))
                    for word in words
                ]
                new_text.append(' '.join(words))
        text.replaceWith(BeautifulSoup.NavigableString(''.join(new_text)))
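_MungeHyphenate plants invisible soft hyphens inside every word outside <pre>, so the justified output can break long words cleanly, while entities pass through untouched. A minimal run, assuming Ned Batchelder's hyphenate module (the one providing hyphenate_word) is on the path:

import re
import BeautifulSoup
import hyphenate

soup = BeautifulSoup.BeautifulSoup('<p>a readability experiment</p>')
_MungeHyphenate(soup)
print repr(unicode(soup))  # soft hyphens show up as \xad in the repr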
Example #8
def SwfObjectFixup(soup):
    # SWFObject 1 style
    script_txts = soup.findAll('script',
                               text=re.compile(r'\bnew SWFObject\b'))
    for script_txt in script_txts:
        m = re.search(r'new\s+SWFObject.*?\((.*)\)', str(script_txt))
        src, name, width, height, _, bgcolor = [
            x for _, x in re.findall(r"""(['"])(.*?)\1""", m.group(1))
        ]
        embed = BeautifulSoup.Tag(soup, 'embed')
        embed['src'] = src
        embed['name'] = name
        embed['width'] = width
        embed['height'] = height
        embed['bgcolor'] = bgcolor
        for m in re.findall(
                r"""\.\s*addParam\s*\(\s*(['"])(.*)\1\s*,\s*(['"])(.*)\3\s*\)""",
                str(script_txt)):
            embed[m[1]] = m[3]
        script_txt.parent.replaceWith(embed)
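A round trip through SwfObjectFixup, assuming BeautifulSoup 3: the constructor arguments become an <embed>, and each addParam call becomes an attribute on it. The SWFObject calls are kept one statement per line, since the extraction regexes above do not match across lines:

import re
import BeautifulSoup

html = '''<div><script>
var so = new SWFObject("movie.swf", "player", "640", "480", "9", "#000000");
so.addParam("wmode", "opaque");
</script></div>'''
soup = BeautifulSoup.BeautifulSoup(html)
SwfObjectFixup(soup)
print soup.find('embed')['wmode']  # opaque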
    def __init__(self, url=None, final_url=None, html=None):
        assert url, 'URL must be provided.'
        self.url = url

        if re.search(r'^https?://(docs|spreadsheets)\.google\.', url, re.I):
            raise UnsupportedRssError('skip google docs')

        if final_url or html:
            assert (final_url and html), ('If either is, both final_url and '
                                          'html must be provided')
            self.final_url = final_url
            self.html = html
        else:
            self.html, self.final_url, _ = util.Fetch(url)

        feed_url = self._DetectFeed()
        feed_url = re.sub(r'^feed://', 'http://', feed_url)

        self.feed = util.ParseFeedAtUrl(feed_url)
        if not self.feed:
            raise NoRssError('could not download/parse feed')

        self._FindEntry()

        self.content = util.PreCleanHtml(util.GetFeedEntryContent(self.entry))
        if not self.content:
            raise NoRssContentError('no content found')

        # Now, we've found content.  Check if it's legit.
        html = re.sub(r'<!--.*?-->', '', self.content)
        self.soup = BeautifulSoup.BeautifulSoup(html)
        for tag in self.soup.findAll('script'):
            tag.extract()
        text = self.soup.text
        if re.search(r'\[?\.\.\.\]?\s*$', text):
            raise NoRssContentError('trailing ellipsis')
        if len(text) < MIN_FEED_TEXT_LEN:
            raise NoRssContentError('text too short (%d)' % len(text))

        # To strip things out, really.
        patterns.Process(self.soup, url)
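This constructor (its enclosing class is not shown in the snippet) resolves a page URL to its feed, pulls the matching entry, and rejects content that is missing, truncated, or too short. A hedged sketch of constructing it, with FeedEntry as a hypothetical stand-in for the real class name:

try:
    entry = FeedEntry(url='http://example.com/blog/post')  # hypothetical name
    print entry.final_url, len(entry.content)
except (UnsupportedRssError, NoRssError, NoRssContentError), e:
    print 'falling back to page scraping:', e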