import re

from bs4 import NavigableString

# `beautiful_soup` and `HashableElement` are project-local helpers assumed
# to be imported alongside these methods.


def _get_article(self, candidates, best_candidate):
    # Now that we have the top candidate, look through its siblings for
    # content that might also be related: preambles, content split by ads
    # that we removed, etc.
    sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
    output = beautiful_soup("<div/>")
    # Iterate over a static copy of the sibling list: appending a sibling
    # to `output` detaches it from its parent, which would otherwise skip
    # the following sibling mid-iteration.
    for sibling in list(best_candidate['elem'].parent.contents):
        if isinstance(sibling, NavigableString):
            continue
        append = False
        if sibling is best_candidate['elem']:
            append = True
        sibling_key = HashableElement(sibling)
        if (sibling_key in candidates and
                candidates[sibling_key]['content_score'] >= sibling_score_threshold):
            append = True
        if sibling.name == "p":
            link_density = self._get_link_density(sibling)
            node_content = sibling.string or ""
            node_length = len(node_content)
            if node_length > 80 and link_density < 0.25:
                append = True
            elif (node_length < 80 and link_density == 0 and
                    re.search(r'\.( |$)', node_content)):
                append = True
        if append:
            output.div.append(sibling)
    return output
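
# Worked example of the threshold above: a best candidate with
# content_score == 120 gives max(10, 120 * 0.2) == 24, so a sibling needs a
# candidate score of at least 24 to be merged into the article; for weak
# candidates (content_score < 50) the floor of 10 applies instead.
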
def process(self, base_url=None):
    self.html = self._remove_crufy_html(self.html)
    self.soup = beautiful_soup(self.html, self.logger)
    # A base_url set on the instance takes precedence over the argument.
    base_url = self.base_url or base_url
    if base_url is not None:
        self._fix_references(base_url)
    title = self.get_title(self.soup)
    body = self.get_body(self.soup)
    return title, body
def get_body(self, soup):
    # Strip scripts, stylesheet links and style blocks before serialising.
    for elem in soup.find_all(['script', 'link', 'style']):
        elem.extract()
    raw_html = str(soup.body or soup)  # str() here; unicode() is Python 2 only
    cleaned = self._clean_attributes(raw_html)
    return beautiful_soup(cleaned)
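
# A minimal usage sketch, assuming these methods live on a `Document` class
# whose constructor stores `html`, `base_url` and `logger` (the constructor
# is not shown here, and the class name is illustrative):
#
#     doc = Document(raw_html, base_url="http://example.com/post")
#     title, body = doc.process()
#     print(title)
#     print(body.prettify())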