Example #1
import re
from bs4 import NavigableString

    def _get_article(self, candidates, best_candidate):
        # Now that we have the top candidate, look through its siblings for
        # content that might also be related: preambles, content split by
        # ads that we removed, etc.
        sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
        output = beautiful_soup("<div/>")
        for sibling in best_candidate['elem'].parent.contents:
            # Skip bare text nodes; only tags can be scored candidates.
            if isinstance(sibling, NavigableString):
                continue
            append = False
            if sibling is best_candidate['elem']:
                append = True
            # Keep any sibling whose own score clears the threshold.
            sibling_key = HashableElement(sibling)
            if (sibling_key in candidates and
                    candidates[sibling_key]['content_score'] >= sibling_score_threshold):
                append = True

            # Paragraphs get a second chance based on length, link density,
            # and whether they look like real sentences.
            if sibling.name == "p":
                link_density = self._get_link_density(sibling)
                node_content = sibling.string or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                    append = True

            if append:
                output.div.append(sibling)

        return output
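
The candidates mapping above is keyed by HashableElement, whose definition is not part of this example. A minimal sketch of what such a wrapper could look like, assuming the goal is identity-based hashing (soup tags that merely look alike can compare equal); the details here are illustrative, not the library's actual code:

class HashableElement(object):
    """Illustrative sketch: make a parse-tree node usable as a dict key
    by hashing and comparing on identity rather than content."""

    def __init__(self, node):
        self.node = node

    def __eq__(self, other):
        return isinstance(other, HashableElement) and self.node is other.node

    def __hash__(self):
        return id(self.node)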
Example #2
    def process(self, base_url=None):
        # Strip scripts, styles, and other cruft before scoring content.
        self.html = self._remove_crufy_html(self.html)

        self.soup = beautiful_soup(self.html, self.logger)

        # A base URL configured on the instance wins over the argument.
        base_url = self.base_url or base_url
        if base_url is not None:
            self._fix_references(base_url)

        title = self.get_title(self.soup)
        body = self.get_body(self.soup)

        return title, body
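
As a usage sketch, assuming the class exposing process is constructed from raw HTML (the Extractor name, constructor signature, and logger argument below are assumptions for illustration, not the library's documented API):

import logging

# Hypothetical driver code; Extractor and its constructor are assumed.
extractor = Extractor(html=raw_html, logger=logging.getLogger(__name__))
title, body = extractor.process(base_url="http://example.com/article")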
Example #3
    def get_body(self, soup):
        # Remove elements that never carry article text.
        for elem in soup.find_all(['script', 'link', 'style']):
            elem.extract()
        # Serialize the <body>, or the whole tree if there is none.
        # Note: unicode() is Python 2; under Python 3 this would be str().
        raw_html = unicode(soup.body or soup)
        cleaned = self._clean_attributes(raw_html)
        return beautiful_soup(cleaned)
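
_clean_attributes is not shown in this example. One plausible approach, sketched as a standalone function under the assumption that it strips presentational attributes from the serialized markup (the attribute list and regex are guesses, not the library's implementation):

import re

# Hypothetical attribute cleaner; the real _clean_attributes may differ.
ATTRIBUTE_RE = re.compile(r'\s+(?:style|class|align|border|width|height)="[^"]*"')

def clean_attributes(html):
    # Drop presentational attributes from serialized HTML.
    return ATTRIBUTE_RE.sub('', html)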