def test_basic(self):
    """End-to-end regression test for multi-page article assembly.

    Fetches the first page of the fixture article through a mock
    urlfetch, checks that the page 2 and page 3 content was appended,
    then diffs the generated summary against the stored expected HTML
    and fails if the diff contains any insertion or deletion.
    """
    html = load_regression_data('basic-multi-page.html')
    urldict = self._make_basic_urldict()
    fetcher = urlfetch.MockUrlFetch(urldict)
    options = {
        'url': 'http://basic.com/article.html',
        'multipage': True,
        'urlfetch': fetcher,
    }
    doc = Document(html, **options)
    res = doc.summary_with_metadata()
    self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
    self.assertIn('Page 3', res.html, 'Should find the page 3 heading')

    expected_html = load_regression_data('basic-multi-page-expected.html')
    diff_html = htmldiff(expected_html, res.html)
    diff_doc = document_fromstring(diff_html)
    # The original duplicated this check for <ins> and <del> and printed
    # the offending text to stdout; fold both into one loop and report
    # every difference in the failure message so it shows up in CI logs.
    problems = []
    for label, nodes in (('insertion', diff_doc.xpath('//ins')),
                         ('deletion', diff_doc.xpath('//del'))):
        for node in nodes:
            problems.append('unexpected %s: %s' % (label, node.xpath('string()')))
    if problems:
        self.fail('readability result does not match expected\n'
                  + '\n'.join(problems))
def test_duplicate(self):
    """A known-duplicate page must be flagged against the fixture article."""
    candidate = r.fragment_fromstring(
        load_regression_data('duplicate-page-duplicate.html'))
    self.assertTrue(r.is_suspected_duplicate(self._article, candidate))
def setUp(self):
    """Load the reference article fixture shared by every test in this case."""
    super(TestIsSuspectedDuplicate, self).setUp()
    self._article = r.fragment_fromstring(
        load_regression_data('duplicate-page-article.html'))
def _test_page(self, url, html_path, expected):
    """Parse the fixture at *html_path* and assert that the next-page URL
    discovered for *url* equals *expected*."""
    document = r.parse(load_regression_data(html_path), url)
    seen = {url}
    self.assertEqual(expected, r.find_next_page_url(seen, url, document))