def auto_excerpt(self):
    """
    Attempts to detect the text of this page (ignoring all navigation
    and other clutter), returning a list of strings. Each string
    represents a paragraph.
    """
    from ebdata.textmining.treeutils import make_tree
    page_tree = make_tree(self.html)

    # Full-entry RSS feeds already contain just the article content,
    # so no clutter detection is needed.
    if self.seed.rss_full_entry:
        from ebdata.templatemaker.textlist import html_to_paragraph_list
        return html_to_paragraph_list(page_tree)

    if self.seed.strip_noise:
        from ebdata.templatemaker.clean import strip_template
        try:
            companion_html = self.companion_page().html
        except IndexError:
            # No companion page available -- skip template stripping.
            pass
        else:
            strip_template(page_tree, make_tree(companion_html))

    if self.seed.guess_article_text:
        from ebdata.templatemaker.articletext import article_text
        return article_text(page_tree)

    from ebdata.templatemaker.textlist import html_to_paragraph_list
    return html_to_paragraph_list(page_tree)
def assertStrips(self, html1, html2, expected, num_removals, check_ids=False):
    """
    Asserts that strip_template(html1, html2) will result in the expected
    HTML string, and that the return value is num_removals.
    """
    # The test strings should *not* have <html> and <body> tags, for the
    # sake of brevity.
    wrapper = '<html><body>%s</body></html>'
    doc1 = document_fromstring(wrapper % html1)
    doc2 = document_fromstring(wrapper % html2)
    removals = strip_template(doc1, doc2, check_ids=check_ids)
    self.assertEqual(etree.tostring(doc1, method='html'), wrapper % expected)
    self.assertEqual(removals, num_removals)