def get_article(self, candidates, best_candidate):
    """Collect the best candidate and its related siblings into one container.

    Args:
        candidates: Mapping keyed by ``HashableElement`` wrappers whose
            values are dicts carrying a ``'content_score'`` entry.
        best_candidate: Dict with an ``'elem'`` entry (the top-scoring node)
            and a ``'content_score'`` entry.

    Returns:
        The object produced by ``parse("<div/>")`` with every selected
        sibling appended to it.
    """
    # Now that we have the top candidate, look through its siblings for
    # content that might also be related. Things like preambles, content
    # split by ads that we removed, etc.
    best_elem = best_candidate['elem']
    sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
    output = parse("<div/>")

    # Iterate over a snapshot of the sibling list. Appending a node to
    # ``output`` may detach it from its current parent (BeautifulSoup-style
    # trees move nodes rather than copy them), which would mutate the list
    # being walked and silently skip the following sibling.
    for sibling in list(best_elem.parent.contents):
        if isinstance(sibling, NavigableString):
            continue

        # The best candidate itself is always kept.
        append = sibling is best_elem

        # Any sibling that was scored and clears the threshold is kept too.
        sibling_key = HashableElement(sibling)
        if (sibling_key in candidates
                and candidates[sibling_key]['content_score'] >= sibling_score_threshold):
            append = True

        if sibling.name == "p":
            link_density = self.get_link_density(sibling)
            node_content = sibling.string or ""
            node_length = len(node_content)

            # Long paragraphs with few links are kept; short ones only when
            # they contain no links and look like a sentence (a period
            # followed by a space or the end of the text). A length of
            # exactly 80 matches neither branch, as in the original logic.
            if node_length > 80 and link_density < 0.25:
                append = True
            elif (node_length < 80 and link_density == 0
                    and re.search(r'\.( |$)', node_content)):
                append = True

        if append:
            output.append(sibling)

    # Fall back to the best candidate's element (not the candidate dict).
    # NOTE(review): this relies on the truthiness of whatever parse()
    # returns -- confirm it is falsy when nothing was appended.
    if not output:
        output.append(best_elem)
    return output
def _html(self, force=False):
    """Return the parsed document, parsing ``self.input`` on first use.

    Args:
        force: When true, re-parse even if a parsed result is already
            stored on ``self.html``.

    Returns:
        The value stored on ``self.html`` after any (re)parse.
    """
    # Fast path: reuse the stored result unless a re-parse was requested.
    if not force and self.html is not None:
        return self.html

    self.html = parse(self.input, self.options['url'], notify=self.notify)
    return self.html