def generate_table(summary):
    """Render the test summary as an HTML table (BeautifulSoup 3 API)."""
    soup = BeautifulSoup()
    new_tag_table = Tag(soup, "table")
    new_tag_table["border"] = 1
    new_tag_table["cellspacing"] = 0
    new_tag_table["cellpadding"] = 0
    new_tag_table["bordercolordark"] = "#000000"
    new_tag_table["bordercolorlight"] = "#ffffff"  # assumed intent; the source re-set "cellspacing" to "#ffffff" here
    soup.append(new_tag_table)
    # header row
    new_Tag_tr = Tag(soup, "tr")
    new_Tag_tr["bgcolor"] = "#0072E3"
    new_tag_table.append(new_Tag_tr)
    for i in ["TestSuite", "Passed", "Failed", "Total"]:
        new_Tag_td = Tag(soup, "td")
        new_Tag_td.string = str(i)
        new_Tag_tr.append(new_Tag_td)
    # one row per test suite
    for i in summary:
        new_Tag_tr = Tag(soup, "tr")
        new_tag_table.append(new_Tag_tr)
        for j in i:
            new_Tag_td = Tag(soup, "td")
            new_Tag_td.string = str(j)
            new_Tag_tr.append(new_Tag_td)
    html = soup.prettify()
    print html
    return html
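# Usage sketch (not from the source): `summary` is inferred from the loops
# above to be an iterable of rows, each row an iterable of cells in
# [TestSuite, Passed, Failed, Total] order. Assumes the BS3 imports the
# snippet itself relies on (BeautifulSoup, Tag).
# summary = [["smoke", 10, 0, 10],
#            ["regression", 42, 3, 45]]
# open("report.html", "w").write(generate_table(summary))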
def get_slides(args):
    contents = get_file_contents(args.file)
    soup = BeautifulSoup(markdown(contents))
    hsoup = BeautifulSoup()
    html = Tag(hsoup, 'html')
    hsoup.append(html)
    head = Tag(hsoup, 'head')
    title = Tag(hsoup, 'title')
    title.setString(args.file)
    head.append(title)
    link = Tag(hsoup, 'link')
    link['rel'] = 'stylesheet'
    link['type'] = 'text/css'
    if args.offline:
        link['href'] = 'default.css'
    else:
        link['href'] = 'http://gdg-xian.github.io/html5slides-markdown/themes/default.css'
    head.append(link)
    script = Tag(hsoup, 'script')
    if args.offline:
        script['src'] = 'html5slides.js'
    else:
        script['src'] = 'http://gdg-xian.github.io/html5slides-markdown/javascripts/html5slides.js'
    head.append(script)
    html.append(head)
    body = Tag(hsoup, 'body')
    body['style'] = 'display:none'
    section = Tag(hsoup, 'section')
    section['class'] = 'slides layout-regular template-default'
    body.append(section)
    # Split the rendered Markdown into one <article> (slide) per <hr>.
    elements = []
    elements.append(soup.first())
    elements.extend(soup.first().findNextSiblings())
    article = Tag(hsoup, 'article')
    section.append(article)
    for element in elements:
        if element.name == 'hr':
            article = Tag(hsoup, 'article')
            section.append(article)
        else:
            article.append(element)
    html.append(body)
    return prettify(html)
def body_insertion(content, insertion, end=False):
    """Insert an HTML content into the body HTML node"""
    insertion = BeautifulSoup(insertion)
    soup = BeautifulSoup(content)
    # Append or prepend to <body> when present, otherwise to the document root.
    if soup.body and end:
        soup.body.append(insertion)
    elif soup.body:
        soup.body.insert(0, insertion)
    elif not soup.body and end:
        soup.append(insertion)
    elif not soup.body:
        soup.insert(0, insertion)
    if USE_PRETTIFY:
        return soup.prettify()
    else:
        return soup.renderContents()
def body_insertion(content, insertion, end=False):
    """Insert an HTML content into the body HTML node"""
    if not content.startswith('<body'):
        content = u'<body>%s</body>' % smart_text(content)
    soup = BeautifulSoup(content)
    insertion = BeautifulSoup(insertion)
    if end:
        soup.append(insertion)
    else:
        soup.body.insert(0, insertion)
    if USE_PRETTIFY:
        text = soup.prettify()
    else:
        text = soup.renderContents()
    text = smart_text(text)
    if USE_PREMAILER:
        # Inline the CSS for email clients, resolving links against the current site.
        site = Site.objects.get_current()
        return premailer.transform(smart_text(text),
                                   base_url='http://%s' % site.domain)
    else:
        return text
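# Hedged usage sketch for either body_insertion variant above: prepend a
# banner to a message body. USE_PRETTIFY/USE_PREMAILER (and, in the second
# variant, the Django smart_text/Site helpers and premailer) are module-level
# dependencies of the source; the strings here are illustrative only.
# content = u'<body><p>Hello</p></body>'
# banner = u'<div class="banner">Monthly newsletter</div>'
# html = body_insertion(content, banner)            # inserted at top of <body>
# html = body_insertion(content, banner, end=True)  # appended at the end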
def read_connections():
    if _connections:
        # Parse the saved connections file.
        f = open(_connections, 'r')
        connections = BeautifulSoup(f.read())
        f.close()
    else:
        connections = []
    # Append any VPNs configured directly in the settings.
    # NB: when a connections file exists, these dicts are appended onto the
    # soup rather than a plain list.
    if _settings['vpn1id'] and len(_settings['vpn1id']) > 0:
        vpn = {'id': _settings['vpn1id'],
               'host': _settings['vpn1host'],
               'port': _settings['vpn1port'],
               'proto': proto_enum_to_string(_settings['vpn1proto']),
               'cipher': _settings['vpn1cipher'],
               'delay': _settings['vpn1delay']}
        connections.append(vpn)
    if _settings['vpn2id'] and len(_settings['vpn2id']) > 0:
        vpn = {'id': _settings['vpn2id'],
               'host': _settings['vpn2host'],
               'port': _settings['vpn2port'],
               'proto': proto_enum_to_string(_settings['vpn2proto']),
               'cipher': _settings['vpn2cipher'],
               'delay': _settings['vpn2delay']}
        connections.append(vpn)
    return connections
def wikisnip(url):
    html = wget(url)
    soup = BeautifulSoup(html)
    div = soup.find('div', {'id': 'bodyContent'})
    snip = BeautifulSoup('')
    for node in div.childGenerator():
        # Skip bare strings, tables, scripts, and MediaWiki chrome.
        if (isinstance(node, basestring) or
                node.name.lower() in ["table", "script"] or
                node.get('id') in ["siteSub", "contentSub", "jump-to-nav"] or
                node.get('class') in ['dablink', 'toclimit-2']):
            continue
        # Stop at the first section heading: keep only the lead section.
        if node.name.lower() == "h2":
            break
        snip.append(node)
    # Make relative links absolute.
    for a in snip.findAll('a'):
        if a.get('href'):
            a['href'] = urlparse.urljoin(url, a['href'])
    return snip
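# Illustrative call (assumes the module's wget() fetch helper plus the BS3 and
# urlparse imports the snippet relies on). The URL is an example, not from the
# source; the result is a soup holding only the article's lead section, with
# relative links rewritten to absolute ones.
# snip = wikisnip('http://en.wikipedia.org/wiki/Python_(programming_language)')
# print snip.renderContents()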
def crear_cuerpo_mail(tb):
    # Build the HTML e-mail body: one heading and one link list per source.
    cuerpo_mail = BeautifulSoup()
    html = Tag(cuerpo_mail, "html")
    cuerpo_mail.append(html)
    cuerpo_mail.append("Enlaces:<br>\n")
    for fuente in sorted(tb):
        h3 = Tag(cuerpo_mail, "h3")
        cuerpo_mail.append(h3)
        h3.append(fuente)
        ul = Tag(cuerpo_mail, "ul")
        cuerpo_mail.append(ul)
        lista_titulos_fuente = tb[fuente]
        for i in lista_titulos_fuente:
            dict_enlace = i
            li = Tag(cuerpo_mail, "li")
            li.append(dict_enlace['titulo'].decode('utf-8') +
                      " - <a href='" + dict_enlace['url'].decode('utf-8') + "'>link</a>")
            cuerpo_mail.append(li)
    return str(cuerpo_mail)
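# Shape of the `tb` argument, inferred from the loops above (the source does
# not document it): a dict mapping a source name to a list of dicts with
# UTF-8 byte-string 'titulo' and 'url' keys. Sample values are hypothetical.
# tb = {'ExampleSource': [{'titulo': 'A headline',
#                          'url': 'http://example.com/a'}]}
# body = crear_cuerpo_mail(tb)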
def parse(self):
    soup = BeautifulSoup(self.content)
    hsoup = BeautifulSoup()
    html = Tag(hsoup, 'html')
    hsoup.append(html)
    head = Tag(hsoup, 'head')
    title = Tag(hsoup, 'title')
    title.setString(self.title)
    head.append(title)
    # deck.js core, theme, highlight.js and transition stylesheets.
    link1 = Tag(hsoup, 'link')
    link1['rel'] = 'stylesheet'
    link1['type'] = 'text/css'
    link1['href'] = 'http://imakewebthings.com/deck.js/core/deck.core.css'
    head.append(link1)
    link2 = Tag(hsoup, 'link')
    link2['rel'] = 'stylesheet'
    link2['type'] = 'text/css'
    link2['href'] = 'http://imakewebthings.com/deck.js/themes/style/swiss.css'
    head.append(link2)
    link3 = Tag(hsoup, 'link')
    link3['rel'] = 'stylesheet'
    link3['type'] = 'text/css'
    link3['href'] = 'http://yandex.st/highlightjs/7.3/styles/monokai_sublime.min.css'
    head.append(link3)
    link4 = Tag(hsoup, 'link')
    link4['rel'] = 'stylesheet'
    link4['type'] = 'text/css'
    link4['href'] = 'http://imakewebthings.com/deck.js/themes/transition/fade.css'
    head.append(link4)
    # jQuery, deck.js core, highlight.js, and the inline bootstrap script.
    script1 = Tag(hsoup, 'script')
    script1['src'] = 'http://imakewebthings.com/deck.js/jquery-1.7.min.js'
    head.append(script1)
    script2 = Tag(hsoup, 'script')
    script2['src'] = 'http://imakewebthings.com/deck.js/core/deck.core.js'
    head.append(script2)
    script3 = Tag(hsoup, 'script')
    script3['src'] = 'http://yandex.st/highlightjs/7.3/highlight.min.js'
    head.append(script3)
    script4 = Tag(hsoup, 'script')
    script4['type'] = 'text/javascript'
    script4.setString(DECK_JS)
    head.append(script4)
    html.append(head)
    body = Tag(hsoup, 'body')
    body['class'] = 'deck-container'
    # Split the content into one <section class="slide"> per <hr>.
    elements = []
    elements.append(soup.first())
    elements.extend(soup.first().findNextSiblings())
    section = Tag(hsoup, 'section')
    section['class'] = 'slide'
    body.append(section)
    for element in elements:
        if element.name == 'hr':
            section = Tag(hsoup, 'section')
            section['class'] = 'slide'
            body.append(section)
        else:
            section.append(element)
    html.append(body)
    self.html_content = html
def mexhelpextract(mexnames):
    #print 'processing mex files: ' + mexnames.__repr__()
    from ConfigParser import RawConfigParser as ConfigParser, Error as error
    for mexname in mexnames:
        # ConfigParser for the three elements per subfunction written to tmpdir
        # [SubFunction]
        #   usage: 'xyz'
        #   help: 'xyz'
        #   seealso: 'xyz'
        config = ConfigParser({'usage': [], 'help': [], 'seealso': []})
        # assemble command line for matlab
        matlabcmd = 'addpath(\'%s\');%s(\'%s\',\'%s\'); exit' % \
            (_tmpdir,
             os.path.splitext(os.path.basename(_mexscript))[0],
             mexname,
             _tmpdir)
        cmd = 'matlab -nojvm -nodisplay -r "%s" > /dev/null' % matlabcmd
        # and execute matlab w/ the temporary script we wrote earlier
        try:
            print 'running MATLAB for %s in %s' % (mexname, _tmpdir)
            stdin, stderr = os.popen4(cmd)
            print stderr.read()
            stdin.close()
            stderr.close()
        except:
            print 'could not dump help for %s into %s' % (mexname, _tmpdir)
        cfgfile = config.read(os.path.join(_tmpdir, mexname))
        if cfgfile == []:
            print "skipping " + mexname + " (no output)"
            continue
        subfunctions = config.sections()
        print 'processing subfunctions: ' + subfunctions.__repr__()
        for subfunction in subfunctions:
            # read in the strings for this subfunction
            usage = config.get(subfunction, 'usage')
            help = config.get(subfunction, 'help')
            seealso = config.get(subfunction, 'seealso')
            headline = '===[[' + subfunction + ' ' + mexname + '(\'' + subfunction + '\')]]===\n'
            breadcrumb = "==[[Psychtoolbox]] › [[" \
                + mexname + "]].{mex*,dll} subfunction==\n\n"
            # scrub the text for main text only
            body = beackern(help)
            docstring = '' \
                + '%%(matlab;Usage)' \
                + usage \
                + '%%\n' \
                + body \
                + '\n\n'
            if seealso:
                docstring = docstring + '<<=====See also:=====\n' + seealso + '<<'
            text = '""' + headline \
                + breadcrumb \
                + docstring + '""'
            # retrieve old body text, to update or concatenate with synonymous subfunctions
            #
            # browse the page
            title = re.sub(r"[^\w]|_", "", subfunction)
            try:
                resp = mech.open(baseurl + title + "/edit")
            except HTTPError, e:
                sys.exit("retrieving old text during posting of this mex function failed: %d: %s"
                         % (e.code, e.msg))
            # get text from the edit form
            mech.select_form(nr=1)
            try:
                oldbody = mech["body"]
            except:
                print 'No id="body" form. Figure this out first. cf. page text above.'
                for form in mech.forms():
                    print form
                sys.exit("retrieving old body text failed while processing page: "
                         + baseurl + title + '/edit')
            # parse embedded structuring HTML tags in the wiki text
            soup = BeautifulSoup(oldbody)
            # check if the subfunction is already present, by CSS 'class' and 'id'
            subfct = soup.find('div', {'class': "subfct", 'id': mexname})
            if subfct:
                # replace the text of the container DIV
                subfct.contents[0].replaceWith(text)
            else:
                # construct new DIV to hold the text
                subfctDIV = Tag(soup, "div")
                subfctDIV['class'] = 'subfct'
                subfctDIV['id'] = mexname
                subfctDIV.insert(0, NavigableString(text))
                # insert the new div
                soup.insert(len(soup), subfctDIV)
            # Now scoop the good well-formed divs out of the soup
            divs = soup('div', {'class': "subfct"})
            # and drop them into fresh yummy cheese soup
            cheesesoup = BeautifulSoup()
            # drop good divs into the soup, one by one
            for div in divs:
                # remove the unneeded style attribute, we finally
                # have this stuff defined in the ptbdocs.css now.
                del div['style']
                # escape the HTML tags for wiki parser
                cheesesoup.append(NavigableString('\n""'))
                cheesesoup.append(div)
                cheesesoup.append(NavigableString('""\n'))
            post(subfunction, cheesesoup.renderContents())
def Extract(html, context):
    """ handle express articles, including blogs """
    art = context
    # cheesiness - kill everything from comments onward..
    cullpats = [
        re.compile("<a name=\"comments\">.*", re.DOTALL),
        # when comments disabled, it just shows a message
        re.compile(r"""<img src="http://images[.]\w+[.]co[.]uk/img/comments/nocomments[.](gif|png)".*""", re.DOTALL),
    ]
    for cullpat in cullpats:
        html = cullpat.sub("", html)
    # express claims to be iso-8859-1, but it seems to be windows-1252 really
    soup = BeautifulSoup(html, fromEncoding='windows-1252')
    wrapdiv = soup.find('div', {'class': 'articleWrapper'})
    if wrapdiv is None:
        # for blogs(?)
        wrapdiv = soup.find('td', {'class': 'contentcontainer'})
    missing = soup.find('p', text=u"The article you are looking for does not exist. It may have been deleted.")
    if missing:
        if 'title' in art:
            ukmedia.DBUG2("IGNORE missing article '%s' (%s)\n" % (art['title'], art['srcurl']))
        else:
            ukmedia.DBUG2("IGNORE missing article (%s)\n" % (art['srcurl']))
        return None
    headline = wrapdiv.find('h1', {'class': 'articleHeading'})
    art['title'] = headline.renderContents(None)
    art['title'] = ukmedia.FromHTML(art['title'])
    if art['title'].upper() == art['title']:
        art['title'] = ukmedia.UncapsTitle(art['title'])  # don't like ALL CAPS HEADLINES!
    introcopypara = wrapdiv.find('p', {'class': re.compile(r'\bintrocopy\b')})
    art['description'] = ukmedia.FromHTMLOneLine(introcopypara.renderContents(None))
    datepara = wrapdiv.find('p', {'class': 'date'})
    if datepara is None:
        # "<span class="date">Monday October 27 2008 <b> byEmily Garnham for express.co.uk</b>"
        datespan = wrapdiv.find('span', {'class': 'date'})
        bylineb = datespan.find('b')
        if bylineb is not None:
            art['byline'] = ukmedia.FromHTMLOneLine(bylineb.renderContents(None).strip())
            art['byline'] = re.sub('([bB]y)([A-Z])', r'\1 \2', art['byline'])
            bylineb.extract()
        else:
            if 'blog' in art['srcurl']:
                # blogs(?) have slightly different date/byline layout
                bylineb = wrapdiv.b
                art['byline'] = ukmedia.FromHTMLOneLine(bylineb.renderContents(None).strip())
            else:
                art['byline'] = u''
        art['pubdate'] = ukmedia.ParseDateTime(datespan.renderContents(None).strip())
        datespan.extract()
    else:
        art['pubdate'] = ukmedia.ParseDateTime(datepara.renderContents(None).strip())
        bylineh4 = wrapdiv.find('h4')
        if bylineh4:
            art['byline'] = ukmedia.FromHTML(bylineh4.renderContents(None))
        else:
            # for some sections, try extracting a journo from the description...
            # (Express usually has names IN ALL CAPS, which the byline-o-matic
            # misses, so we'll turn anything likely-looking into titlecase
            # first).
            art['byline'] = u''
            if art['srcurl'].find('/travel/') != -1 or art['srcurl'].find('/motoring/') != -1:
                desc = ukmedia.DecapNames(art['description'])
                art['byline'] = ukmedia.ExtractAuthorFromParagraph(desc)
    # comments
    art['commentlinks'] = []
    comment_cnt_pat = re.compile(r"Have your say\s*[(](\d+)[)]")
    num_comments = None
    comment_url = None
    for marker in soup.findAll(text=comment_cnt_pat):
        if marker.parent.name != 'a':
            continue
        m = comment_cnt_pat.search(marker)
        if m:
            num_comments = int(m.group(1))
            comment_url = urlparse.urljoin(art['srcurl'], '#comments')
            art['commentlinks'].append({'num_comments': num_comments, 'comment_url': comment_url})
            break  # just the one.
    # images
    art['images'] = []
    for imgdiv in soup.findAll('div', {'class': 'articleFirstImage'}):
        img = imgdiv.find('img')
        im = {'url': img['src'].strip(), 'caption': u'', 'credit': u''}
        if im['url'].endswith("/missingimage.gif"):
            continue
        # find caption para
        # eg class="articleFirstImageCaption"
        capp = imgdiv.find('p', {'class': re.compile('caption$', re.IGNORECASE)})
        if capp:
            im['caption'] = ukmedia.FromHTMLOneLine(capp.renderContents(None)).strip()
        art['images'].append(im)
    # cruft removal - mismatched tags means that cruft can get drawn into
    # story paragraphs... sigh...
    # cruft = wrapdiv.find('a', {'name':'comments'} )
    # if cruft:
    #     # delete _everything_ from the comments onward
    #     n = cruft.next
    #     cruft.extract()
    #     cruft = n
    for cruft in wrapdiv.findAll('object'):
        cruft.extract()
    for cruft in wrapdiv.findAll('div', {'class': 'right'}):
        cruft.extract()
    for cruft in wrapdiv.findAll('form'):  # (search form etc)
        cruft.extract()
    for cruft_url_pat in (re.compile("/creditadvice$"), re.compile("/money$")):
        for cruft in wrapdiv.findAll('a', href=cruft_url_pat):
            cruft.extract()
    # OK to build up text body now!
    textpart = BeautifulSoup()
    textpart.insert(len(textpart.contents), introcopypara)
    # for para in wrapdiv.findAll( 'p', ): #{'class':'storycopy'} ):
    # sigh... sometimes express articles have nested paras, without the
    # "storycopy" class. probably due to cutting and pasting from another
    # source...
    for p in wrapdiv.findAll('p', {'class': 'storycopy'}):
        p.extract()
        textpart.append(p)
    content = textpart.prettify(None)
    content = ukmedia.DescapeHTML(content)
    content = ukmedia.SanitiseHTML(content)
    art['content'] = content
    if art['description'] == u'':
        art['description'] = ukmedia.FirstPara(content)
    return art
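# Sketch of how a scraper presumably drives Extract (inferred from the context
# keys read above, not shown in the source): the context dict carries at least
# 'srcurl', and optionally 'title' from the feed; a None return means the page
# reported the article as missing.
# art = Extract(page_html, {'srcurl': url, 'title': feed_title})
# if art is not None:
#     print art['title'], art['pubdate'], len(art['content'])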
def get_article_text(self, soup):
    orig_html = soup.renderContents()
    body = soup.body
    if not body:
        raise ReadabilityException()
    if self.FLAG_STRIP_UNLIKELYS in self.flags:
        for node in body.findAll(True, attrs={"id": UNLIKELY_CANDIDATES}):
            if (node.get("class") and OK_MAYBE_ITS_A_CANDIDATE.search(node["class"])) or \
                    OK_MAYBE_ITS_A_CANDIDATE.search(node["id"]):
                continue
            node.extract()
        for node in body.findAll(True, attrs={"class": UNLIKELY_CANDIDATES}):
            if OK_MAYBE_ITS_A_CANDIDATE.search(node["class"]) or \
                    (node.get("id") and OK_MAYBE_ITS_A_CANDIDATE.search(node["id"])):
                continue
            node.extract()
    # Replace div with paragraphs
    for div in body.findAll("div"):
        if not len(div.findAll(DIV_TO_P_ELEMENTS)):
            div.name = "p"
    # Alternation grouped so only these exact tag names match.
    nodes_to_score = body.findAll(re.compile("^(p|td|pre)$", re.I))
    # Loop through all paragraphs, and assign a score to them based on how content-y they look.
    # Then add their score to their parent node.
    #
    # A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
    candidates = []
    for node in nodes_to_score:
        parent = node.parent
        if not parent:
            continue
        grand_parent = parent and parent.parent
        inner_text = self.get_inner_text(node)
        # If this paragraph is less than 25 characters, don't even count it.
        if len(inner_text) < 25:
            continue
        # Initialize readability data for the parent.
        if not getattr(parent, "readability", None):
            self.initialize_node(parent)
            candidates.append(parent)
        # Initialize readability data for the grandparent.
        if grand_parent and not getattr(grand_parent, "readability", None):
            self.initialize_node(grand_parent)
            candidates.append(grand_parent)
        content_score = 0
        # Add a point for the paragraph itself as a base.
        content_score += 1
        # Add points for any commas within this paragraph
        content_score += inner_text.count(",")
        # For every 100 characters in this paragraph, add another point. Up to 3 points.
        content_score += min(len(inner_text) / 100, 3)
        # Add the score to the parent. The grandparent gets half.
        parent.readability["content_score"] += content_score
        if grand_parent:
            grand_parent.readability["content_score"] += content_score / 2
    # After we've calculated scores, loop through all of the possible candidate nodes we found
    # and find the one with the highest score.
    top_candidate = None
    for candidate in candidates:
        # Scale the final candidates score based on link density. Good content should have a
        # relatively small link density (5% or less) and be mostly unaffected by this operation.
        candidate.readability["content_score"] = candidate.readability["content_score"] * \
            (1 - self.get_link_density(candidate))
        if not top_candidate or top_candidate.readability["content_score"] < candidate.readability["content_score"]:
            top_candidate = candidate
    # If we still have no top candidate, just use the body as a last resort.
    # We also have to copy the body node so it is something we can modify.
    if not top_candidate:
        top_candidate = Tag(soup, "div")
        for c in body.contents[:]:  # iterate a copy: extract() mutates the list
            top_candidate.append(c.extract())
        body.append(top_candidate)
        self.initialize_node(top_candidate)
    article = BeautifulSoup("<div></div>").div
    # Now that we have the top candidate, look through its siblings for content that might also be related.
    # Things like preambles, content split by ads that we removed, etc.
    sibling_score_threshold = max(10, top_candidate.readability["content_score"] * 0.2)
    top_candidate_class = top_candidate.get("class", None)
    for sibling_node in top_candidate.parent:
        if not isinstance(sibling_node, Tag):
            continue
        append = False
        if sibling_node == top_candidate:
            append = True
        else:
            content_bonus = 0
            # Give a bonus if sibling nodes and top candidates have the exact same classname
            if top_candidate_class and top_candidate_class == sibling_node.get("class", None):
                content_bonus += top_candidate.readability["content_score"] * 0.2
            if getattr(sibling_node, "readability", None) and \
                    (sibling_node.readability["content_score"] + content_bonus) >= sibling_score_threshold:
                append = True
            if sibling_node.name == "p":
                link_density = self.get_link_density(sibling_node)
                node_content = self.get_inner_text(sibling_node)
                node_length = len(node_content)
                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length <= 80 and link_density == 0 and re.search(r"\.( |$)", node_content):
                    append = True
        if append:
            if sibling_node.name not in ("div", "p"):
                # We have a node that isn't a common block level element, like a form or td tag.
                # Turn it into a div so it doesn't get filtered out later by accident.
                sibling_node.name = "div"
            # To ensure a node does not interfere with readability styles, remove its classnames.
            # (.get avoids a KeyError on tags without a class attribute.)
            if sibling_node.get("class"):
                del sibling_node["class"]
            article.append(sibling_node)
    if len(article.renderContents()) < 250:
        # Not enough content survived; relax one flag and retry from the original HTML.
        soup = BeautifulSoup(orig_html)
        if self.FLAG_STRIP_UNLIKELYS in self.flags:
            self.flags.remove(self.FLAG_STRIP_UNLIKELYS)
            return self.get_article_text(soup)
        elif self.FLAG_WEIGHT_CLASSES in self.flags:
            self.flags.remove(self.FLAG_WEIGHT_CLASSES)
            return self.get_article_text(soup)
        elif self.FLAG_CLEAN_CONDITIONALLY in self.flags:
            self.flags.remove(self.FLAG_CLEAN_CONDITIONALLY)
            return self.get_article_text(soup)
        else:
            raise ReadabilityException()
    self.prepare_article(article)
    return article.renderContents(encoding=None)
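# Hedged driver sketch: the class name and constructor below are hypothetical
# stand-ins for whatever the surrounding module defines. It illustrates the
# retry design above -- get_article_text() re-parses the original HTML with
# one flag fewer each time until at least 250 bytes of article survive, and
# raises ReadabilityException once all flags are exhausted.
# r = Readability(flags=set([Readability.FLAG_STRIP_UNLIKELYS,
#                            Readability.FLAG_WEIGHT_CLASSES,
#                            Readability.FLAG_CLEAN_CONDITIONALLY]))
# text = r.get_article_text(BeautifulSoup(raw_html))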