def _readable(self):
    """The readable parsed article.

    When ``self.candidates`` is non-empty, the entry with the highest
    ``content_score`` is selected, passed through ``check_siblings`` and
    ``prep_article``, and wrapped with ``build_base_document``.

    If there are no candidates, or ``prep_article`` hands back ``None`` for
    the winner, the result of ``self._handle_no_candidates()`` is returned
    instead.
    """
    if self.candidates:
        LOG.debug('Candidates found:')
        pp = PrettyPrinter(indent=2)

        # cleanup by removing the should_drop we spotted; nodes that have
        # no parent are skipped.
        for node in self._should_drop:
            if node.getparent() is not None:
                node.drop_tree()

        # right now we return the highest scoring candidate content
        by_score = sorted(
            self.candidates.values(),
            key=attrgetter('content_score'),
            reverse=True)
        LOG.debug(pp.pformat(by_score))

        # since we have several candidates, check the winner's siblings
        # for extra content
        winner = by_score[0]
        LOG.debug('Selected winning node: ' + str(winner))
        updated_winner = check_siblings(winner, self.candidates)
        LOG.debug('Begin final prep of article')
        updated_winner.node = prep_article(updated_winner.node)
        if updated_winner.node is not None:
            doc = build_base_document(updated_winner.node, self.fragment)
        else:
            LOG.warning(
                'Had candidates but failed to find a cleaned winning doc.')
            doc = self._handle_no_candidates()
    else:
        LOG.warning('No candidates found: using document.')
        LOG.debug('Begin final prep of article')
        doc = self._handle_no_candidates()

    return doc
def _readable(self):
    """The readable parsed article.

    Starts from ``self.orig.html``, runs it through ``html_cleaner``,
    ``drop_tag`` and ``transform_misused_divs_into_paragraphs``, then asks
    ``find_candidates`` for scored candidates and a list of nodes to drop.

    With candidates, the one with the highest ``content_score`` is passed
    through ``check_siblings`` and ``prep_article`` and wrapped with
    ``build_base_document``.  Without candidates, the should-drop nodes are
    removed and the whole cleaned document is prepped and wrapped instead.
    """
    doc = self.orig.html
    # cleaning doesn't return, just wipes in place
    html_cleaner(doc)
    doc = drop_tag(doc, "noscript", "iframe")
    doc = transform_misused_divs_into_paragraphs(doc)
    candidates, should_drop = find_candidates(doc)

    if candidates:
        LOG.debug("Candidates found:")
        pp = PrettyPrinter(indent=2)

        # right now we return the highest scoring candidate content
        by_score = sorted(
            candidates.values(),
            key=attrgetter("content_score"),
            reverse=True)
        LOG.debug(pp.pformat(by_score))

        # since we have several candidates, check the winner's siblings
        # for extra content
        winner = by_score[0]
        LOG.debug("Selected winning node: " + str(winner))
        updated_winner = check_siblings(winner, candidates)
        LOG.debug("Begin final prep of article")
        updated_winner.node = prep_article(updated_winner.node)
        doc = build_base_document(updated_winner.node, self.fragment)
    else:
        LOG.warning("No candidates found: using document.")
        LOG.debug("Begin final prep of article")
        # since we've not found a good candidate we should help this
        # cleanup by removing the should_drop we spotted.
        for node in should_drop:
            # A node without a parent cannot be removed from a tree, so it
            # is skipped rather than passed to drop_tree().
            # NOTE(review): assumes these are lxml.html elements, whose
            # drop_tree() requires a parent -- confirm.
            if node.getparent() is not None:
                node.drop_tree()
        doc = prep_article(doc)
        doc = build_base_document(doc, self.fragment)

    return doc
def _handle_no_candidates(self):
    """If we fail to find a good candidate we need to find something else.

    Falls back to the whole document: when ``self.doc`` is not ``None`` and
    has a non-zero length, the nodes in ``self._should_drop`` are removed,
    the document is run through ``prep_article`` and the result is wrapped
    with ``build_base_document``.  Otherwise a warning is logged and the
    output of ``build_error_document(self.fragment)`` is returned.
    """
    # len() is checked explicitly in addition to the None test.
    # NOTE(review): presumably self.doc is an lxml element, where len()
    # counts child nodes -- confirm.
    if self.doc is not None and len(self.doc):
        # since we've not found a good candidate we should help this
        # cleanup by removing the should_drop we spotted; nodes that have
        # no parent are skipped.
        for node in self._should_drop:
            if node.getparent() is not None:
                node.drop_tree()
        doc = prep_article(self.doc)
        doc = build_base_document(doc, self.fragment)
    else:
        LOG.warning('No document to use.')
        doc = build_error_document(self.fragment)

    return doc
def _handle_no_candidates(self):
    """If we fail to find a good candidate we need to find something else.

    Uses ``self.doc`` as the fallback content.  If it is not ``None`` and
    its length is non-zero, every node in ``self._should_drop`` that still
    has a parent is dropped, then the document goes through
    ``prep_article`` and ``build_base_document``.  In any other case a
    warning is logged and ``build_error_document(self.fragment)`` supplies
    the return value.
    """
    # The length test is explicit, on top of the None test.
    # NOTE(review): presumably self.doc is an lxml element, where len()
    # counts child nodes -- confirm.
    if self.doc is not None and len(self.doc):
        # since we've not found a good candidate we should help this
        # cleanup by removing the should_drop we spotted.
        for node in self._should_drop:
            # skip nodes that have no parent
            if node.getparent() is not None:
                node.drop_tree()
        doc = prep_article(self.doc)
        doc = build_base_document(doc, self.fragment)
    else:
        LOG.warning('No document to use.')
        doc = build_error_document(self.fragment)

    return doc
def _readable(self):
    """The readable parsed article.

    If ``self.candidates`` holds anything, the candidate with the highest
    ``content_score`` wins; it is extended via ``check_siblings``, cleaned
    by ``prep_article`` and wrapped by ``build_base_document``.

    ``self._handle_no_candidates()`` provides the result when there are no
    candidates at all, or when the cleaned winning node is ``None``.
    """
    if self.candidates:
        LOG.debug('Candidates found:')
        pp = PrettyPrinter(indent=2)

        # cleanup by removing the should_drop we spotted.
        for node in self._should_drop:
            # skip nodes that have no parent
            if node.getparent() is not None:
                node.drop_tree()

        # right now we return the highest scoring candidate content
        by_score = sorted(
            self.candidates.values(),
            key=attrgetter('content_score'),
            reverse=True)
        LOG.debug(pp.pformat(by_score))

        # since we have several candidates, check the winner's siblings
        # for extra content
        winner = by_score[0]
        LOG.debug('Selected winning node: ' + str(winner))
        updated_winner = check_siblings(winner, self.candidates)
        LOG.debug('Begin final prep of article')
        updated_winner.node = prep_article(updated_winner.node)
        if updated_winner.node is not None:
            doc = build_base_document(updated_winner.node, self.fragment)
        else:
            LOG.warning(
                'Had candidates but failed to find a cleaned winning doc.')
            doc = self._handle_no_candidates()
    else:
        LOG.warning('No candidates found: using document.')
        LOG.debug('Begin final prep of article')
        doc = self._handle_no_candidates()

    return doc