def _Munge(soup, tag, url):
    """Given a string of HTML content, munge it to be more pleasing."""
    # Upstream failure cases hand us a plain string instead of a tag tree;
    # there is nothing to munge, so pass it straight through.
    if isinstance(tag, basestring):
        return tag
    _MungeStripSiteSpecific(tag, url)
    # First round of in-place cleanup passes, applied in this order.
    for munge in (_MungeStripBrsAfterPs, _MungeStripRules, _MungeStripEmpties):
        munge(tag)
    # This pass may swap in a different root node.
    tag = _MungeStripRootContainers(tag)
    _MungeStripLowScored(tag)
    _MungeStripAttrs(tag)
    _FixUrls(tag, url)
    # Second round of in-place presentation passes.
    for munge in (_MungeImages, _MungeHeaderDowngrade, _MungeHyphenate,
                  _MungeNoscript):
        munge(tag)
    # May replace the root outright (e.g. a single embed becomes a link).
    tag = _MungeTransformEmbeds(soup, tag)
    # Serialize the tag, and apply full justification.
    if isinstance(tag, BeautifulSoup.BeautifulStoneSoup):
        # A bare soup has no element to carry a style attribute, so wrap
        # it in a div to have a tag to justify.
        wrapper = BeautifulSoup.Tag(soup, 'div')
        wrapper.insert(0, tag)
        tag = wrapper
    tag['style'] = 'text-align: justify;'
    return unicode(tag)
def _TransformBrsToParagraphsInner(soup, tag):
    """Collapse a run of content ending in a double-<br> into a real <p>.

    `tag` is assumed to be a <br> tag.  If the very next non-whitespace
    sibling is another <br>, everything before `tag` (back to the nearest
    stop tag) is moved into a new <p>, the second <br> is removed, and the
    first <br> is replaced by the new paragraph.  Otherwise, no change.
    """
    # Walk forward from `tag` looking for a second <br>, skipping
    # whitespace-only strings.  Anything else means this is not a
    # double-<br> paragraph break, so bail out without changes.
    next = tag
    while True:
        next = next.nextSibling
        if not next:
            return
        if isinstance(next, BeautifulSoup.Tag):
            if next.name == 'br':
                break  # Found the second <br>; `next` now points at it.
            else:
                return
        elif isinstance(next, BeautifulSoup.NavigableString):
            if not unicode(next).strip():
                continue  # Ignore whitespace between the two <br>s.
            else:
                return
    # Collect the preceding siblings (in document order) that should form
    # the paragraph body, stopping at block-level tags.
    contents = []
    prev = tag
    while True:
        prev = prev.previousSibling
        if not prev:
            break
        if hasattr(prev, 'name') and prev.name in BR_TO_P_STOP_TAGS:
            break
        contents.insert(0, prev)
    # Move the collected nodes into a fresh <p>.  BeautifulSoup's insert()
    # re-parents each node, detaching it from its old position.
    newp = BeautifulSoup.Tag(soup, 'p')
    for i, newtag in enumerate(contents):
        newp.insert(i, newtag)
    # Drop the trailing <br> and put the paragraph where the first one was.
    next.extract()
    tag.replaceWith(newp)
def _MungeTransformEmbeds(soup, root_tag):
    """Replace embedded media under root_tag with clickable placeholder links.

    Each embed is serialized into a data: URI carried on an <a> link whose
    visible content is a fixed placeholder image.  The embed's original
    dimensions are recorded in embed_width/embed_height attributes.

    Returns the (possibly replaced) root tag: if the root itself is an
    embed, the replacement link is returned directly.
    """
    for tag in util.FindEmbeds(root_tag):
        # BUG FIX: measure the embed being replaced, not the root container.
        # Previously this called util.TagSize(root_tag), which gave every
        # embed the root's dimensions (or the 600x400 fallback).
        try:
            w, h = util.TagSize(tag)
        except TypeError:
            # Size could not be determined; assume a typical video size.
            w = 600
            h = 400
        link = BeautifulSoup.Tag(soup, 'a')
        # Serialize the embed into a self-contained page the reader can open.
        link['href'] = 'data:text/html;base64,' + base64.b64encode(
            '<body style="margin:0;">%s</body>' % unicode(tag))
        link['rel'] = 'embedded_media'
        link['embed_width'] = w
        link['embed_height'] = h
        img = BeautifulSoup.Tag(soup, 'img')
        img['src'] = 'http://readability-api.appspot.com/embedded_media.png'
        img['width'] = '128'
        img['height'] = '128'
        link.insert(0, img)
        if tag == root_tag:
            # The whole root is the embed; hand back the link as the new root.
            return link
        tag.replaceWith(link)
    return root_tag
def OEmbedFixup(soup):
    """Replace oEmbed video-loader links with their underlying iframes.

    Finds <a> tags wired to oEmbedManagerVideoLoader, digs the real
    <iframe> out of the surrounding container (directly, or parsed out of
    a fallback <textarea>), strips its query string, and swaps the whole
    container for a plain <div> holding just the iframe.
    """
    oembed_links = soup.findAll(
        'a', {'onclick': re.compile(r'^oEmbedManagerVideoLoader')})
    for oembed_link in oembed_links:
        cont = oembed_link.parent
        embed = cont.find('iframe')
        if not embed:
            # Some variants stash the iframe markup inside a <textarea>.
            ta = cont.find('textarea')
            if not ta:
                # BUG FIX: was `return`, which silently abandoned all
                # remaining oembed links after the first dud container.
                continue
            s = BeautifulSoup.BeautifulSoup(ta.text)
            embed = s.find('iframe')
        # Drop the query string; the loader's parameters aren't needed to
        # render the bare iframe.
        embed['src'] = re.sub(r'\?.*', '', embed['src'])
        div = BeautifulSoup.Tag(soup, 'div')
        div.insert(0, embed)
        cont.replaceWith(div)
def SwfObjectFixup(soup):
    """Replace SWFObject-v1 script embeds with equivalent <embed> tags.

    Matches script text of the form
      new SWFObject(src, name, width, height, version, bgcolor)
    plus any chained .addParam('key', 'value') calls, and swaps the
    enclosing <script> tag for an <embed> carrying the same attributes.
    """
    # SWFObject 1 style.  NOTE: when a `text` filter is given, BeautifulSoup
    # ignores any tag-name argument and returns the matching
    # NavigableStrings themselves, so the original bogus 'script_txt' name
    # filter was dead weight; drop it for clarity.  Behavior is unchanged.
    script_txts = soup.findAll(text=re.compile(r'\bnew SWFObject\b'))
    for script_txt in script_txts:
        m = re.search(r'new\s+SWFObject.*?\((.*)\)', str(script_txt))
        if not m:
            # BUG FIX: constructor arguments not on the matched text (e.g.
            # split across lines); skip instead of crashing on m.group().
            continue
        src, name, width, height, _, bgcolor = [
            x for _, x in re.findall(r"""(['"])(.*?)\1""", m.group(1))
        ]
        embed = BeautifulSoup.Tag(soup, 'embed')
        embed['src'] = src
        embed['name'] = name
        embed['width'] = width
        embed['height'] = height
        embed['bgcolor'] = bgcolor
        # Carry over every .addParam('key', 'value') as an attribute.
        # (Renamed loop variable: previously shadowed `m` above.)
        for param in re.findall(
            r"""\.\s*addParam\s*\(\s*(['"])(.*)\1\s*,\s*(['"])(.*)\3\s*\)""",
            str(script_txt)):
            embed[param[1]] = param[3]
        # The matched string's parent is the <script> tag itself.
        script_txt.parent.replaceWith(embed)
def ExtractFromHtml(url, html):
    """Given a string of HTML, remove nasty bits, score and pick bit to keep.

    Dispatches on the URL to site-specific extractors (reddit, xkcd,
    Google Groups, NYT, plain .txt), falling back to the generic extractor.

    Returns a (soup, body) pair: the parsed document and the chosen
    content node (or text).
    """
    if re.search(r'^http://(www\.)?reddit\.com/.*/comments/', url, re.I):
        # Reddit comment pages: parse only the link "thing" and comment
        # bodies, then prefer the self-text body over the title link.
        strainer = BeautifulSoup.SoupStrainer(
            attrs={'class': re.compile(r'thing.*link|usertext border')})
        soup = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=strainer)
        body = soup.find(attrs={'class': re.compile(r'\busertext-body\b')})
        if not body:
            body = soup.find('a', attrs={'class': re.compile(r'\btitle\b')})
            body = body and body.text or soup
        return soup, body
    elif re.search(r'^http://(www\.)?xkcd\.com/\d+', url, re.I):
        soup = BeautifulSoup.BeautifulSoup(html)
        # The comic is the only image with both alt and title text.
        img = soup.find(alt=True, title=True)
        if not img:
            # BUG FIX: previously crashed with AttributeError on
            # img.parent when no such image exists; fall back to the
            # generic extractor instead.
            return _ExtractFromHtmlGeneric(url, html)
        cont = img.parent.parent
        for tag in cont.findAll(('br', 'div')):
            util.Strip(tag)
        return soup, cont
    elif re.search(r'^http://groups\.google\.com/', url, re.I):
        strainer = BeautifulSoup.SoupStrainer(attrs={'class': 'maincontbox'})
        soup = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=strainer)
        return _ExtractFromHtmlGeneric(url, unicode(soup))
    elif re.search(r'^http://(www\.)?nytimes\.com/', url, re.I):
        soup = BeautifulSoup.BeautifulSoup(html)
        bodies = soup.findAll(attrs={'class': 'articleBody'})
        # BUG FIX: guard on len > 1, not truthiness — a page with exactly
        # one articleBody raised IndexError on bodies[1] below.
        if len(bodies) > 1:
            # Put the first "article body" contents into the second -- for
            # some reason NYT splits the lead-in text into its own "body".
            while bodies[0].contents:
                bodies[1].insert(0, bodies[0].contents[-1])
        return _ExtractFromHtmlGeneric(url, unicode(soup))
    elif re.search(r'\.txt(\?|$)', url, re.I):
        # Plain text: wrap the whole document in a <pre>.
        soup = BeautifulSoup.BeautifulSoup()
        pre = BeautifulSoup.Tag(soup, 'pre')
        pre.insert(0, BeautifulSoup.NavigableString(html))
        soup.insert(0, pre)
        return soup, soup
    else:
        return _ExtractFromHtmlGeneric(url, html)