def get_article(self, candidates, best_candidate):
    """Gather the best candidate and its related siblings into a new container.

    Now that we have the top candidate, its siblings are inspected for
    content that might also be related: things like preambles, or content
    that was split by ads that have been removed.

    Args:
        candidates: Mapping from element to a dict holding at least a
            ``'content_score'`` entry.
        best_candidate: Dict with the winning element under ``'elem'`` and
            its score under ``'content_score'``.

    Returns:
        The container produced by ``parse("<div/>")`` with every selected
        sibling appended in document order.
    """
    sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
    best_elem = best_candidate['elem']
    output = parse("<div/>")

    # A root element has no parent; treat it as its own only sibling instead
    # of failing on ``None.getchildren()``.
    parent = best_elem.getparent()
    # Work on a snapshot: appending a node to ``output`` re-parents it
    # (lxml-style elements move on append), which must not disturb the walk.
    siblings = list(parent.getchildren()) if parent is not None else [best_elem]

    for sibling in siblings:
        # In lxml there is no concept of a bare text node, so no
        # NavigableString filtering is needed here.
        append = sibling is best_elem

        scored = candidates.get(sibling) if sibling in candidates else None
        if scored is not None and scored['content_score'] >= sibling_score_threshold:
            append = True

        if sibling.tag == "p":
            link_density = self.get_link_density(sibling)
            node_content = sibling.text or ""
            node_length = len(node_content)
            if node_length > 80 and link_density < 0.25:
                # Long paragraph that is mostly text rather than links.
                append = True
            elif (node_length < 80 and link_density == 0
                    and re.search(r'\.( |$)', node_content)):
                # Short, link-free paragraph that contains a full stop
                # followed by a space or the end of the text.
                append = True

        if append:
            output.append(sibling)

    # The best candidate is always part of ``siblings``, so the loop has
    # already appended it at its document position. Appending it again here
    # would move it behind the siblings that follow it.
    return output
def _html(self, force=False):
    """Return the parsed document, parsing ``self.input`` on demand.

    The parsed result is cached on ``self.html``; it is (re)built when
    ``force`` is true or nothing has been cached yet.

    Args:
        force: When true, parse again even if a cached result exists.

    Returns:
        The cached/parsed document, or ``None`` when parsing raised an
        error (in which case ``self.html`` is left untouched).
    """
    if force or self.html is None:
        try:
            self.html = parse(self.input, self.options['url'], notify=self.notify)
        except Exception:
            # Best effort: a document that cannot be parsed yields None.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit still propagate to the caller.
            return None
    return self.html
def get_article(self, candidates, best_candidate):
    """Gather the best candidate and its related siblings into a new container.

    Now that we have the top candidate, its siblings are inspected for
    content that might also be related: things like preambles, or content
    that was split by ads that have been removed.

    Args:
        candidates: Mapping keyed by ``HashableElement`` wrappers to a dict
            holding at least a ``"content_score"`` entry.
        best_candidate: Dict with the winning element under ``"elem"`` and
            its score under ``"content_score"``.

    Returns:
        The container produced by ``parse("<div/>")`` with every selected
        sibling appended in document order.
    """
    sibling_score_threshold = max(10, best_candidate["content_score"] * 0.2)
    best_elem = best_candidate["elem"]
    output = parse("<div/>")

    # A root element has no parent; treat it as its own only sibling.
    parent = best_elem.parent
    # Iterate over a snapshot: appending a node to ``output`` detaches it
    # from ``parent.contents`` (BeautifulSoup-style move on append --
    # NOTE(review): confirm against the parser in use), and mutating the
    # live list mid-iteration would silently skip the following sibling.
    siblings = list(parent.contents) if parent is not None else [best_elem]

    for sibling in siblings:
        # Bare text nodes carry no tag information and are never candidates.
        if isinstance(sibling, NavigableString):
            continue

        append = sibling is best_elem

        sibling_key = HashableElement(sibling)
        if (sibling_key in candidates
                and candidates[sibling_key]["content_score"] >= sibling_score_threshold):
            append = True

        if sibling.name == "p":
            link_density = self.get_link_density(sibling)
            node_content = sibling.string or ""
            node_length = len(node_content)
            if node_length > 80 and link_density < 0.25:
                # Long paragraph that is mostly text rather than links.
                append = True
            elif (node_length < 80 and link_density == 0
                    and re.search(r"\.( |$)", node_content)):
                # Short, link-free paragraph that contains a full stop
                # followed by a space or the end of the text.
                append = True

        if append:
            output.append(sibling)

    if not output:
        # Fall back to the winning element itself; the candidate record is
        # a plain dict and cannot be inserted into the tree.
        output.append(best_elem)
    return output
def _html(self, force=False):
    """Return the parsed document, cleaning and parsing the input on demand.

    The result is cached on ``self.html``. When ``force`` is true, or when
    nothing is cached yet, ``self.input`` is run through a ``Cleaner`` and
    the cleaned markup is handed to ``parse``.

    Args:
        force: When true, clean and parse again even if a cached result
            exists.

    Returns:
        The value stored on ``self.html``.
    """
    if not force and self.html is not None:
        return self.html

    # Enabled: scripts, javascript, comments, style, links and
    # processing_instructions. Every other option is switched off.
    cleaner_options = dict(
        scripts=True,
        javascript=True,
        comments=True,
        style=True,
        links=True,
        meta=False,
        add_nofollow=False,
        page_structure=False,
        processing_instructions=True,
        embedded=False,
        frames=False,
        forms=False,
        annoying_tags=False,
        remove_tags=None,
        remove_unknown_tags=False,
        safe_attrs_only=False,
    )
    sanitizer = Cleaner(**cleaner_options)
    cleaned_input = sanitizer.clean_html(self.input)
    self.html = parse(cleaned_input, self.options['url'], notify=self.notify)
    return self.html
def _html(self, force=False):
    """Return the parsed document, parsing ``self.input`` on demand.

    The parsed result is cached on ``self.html`` and reused on later calls
    unless ``force`` is true.

    Args:
        force: When true, parse again even if a cached result exists.

    Returns:
        The value stored on ``self.html``.
    """
    # Fast path: a cached document exists and no re-parse was requested.
    if not force and self.html is not None:
        return self.html

    self.html = parse(self.input, self.options["url"], notify=self.notify)
    return self.html