def test_basic(self):
        """Check multi-page extraction against the stored expected output.

        Builds a Document from 'basic-multi-page.html' with the multipage
        option enabled and a MockUrlFetch built from _make_basic_urldict(),
        then verifies that the summary html contains the 'Page 2' and
        'Page 3' headings and that an htmldiff against
        'basic-multi-page-expected.html' produces no <ins> or <del> elements.
        """
        html = load_regression_data('basic-multi-page.html')
        urldict = self._make_basic_urldict()
        fetcher = urlfetch.MockUrlFetch(urldict)
        options = {
                'url': 'http://basic.com/article.html',
                'multipage': True,
                'urlfetch': fetcher,
                }
        doc = Document(html, **options)
        res = doc.summary_with_metadata()

        self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
        self.assertIn('Page 3', res.html, 'Should find the page 3 heading')

        expected_html = load_regression_data('basic-multi-page-expected.html')
        diff_html = htmldiff(expected_html, res.html)
        diff_doc = document_fromstring(diff_html)

        # Gather insertions and deletions before failing so that one run
        # reports every difference; failing on insertions first would hide
        # any deletions present in the same diff.
        problems = []
        for label, tag in (('insertion', 'ins'), ('deletion', 'del')):
            for node in diff_doc.xpath('//' + tag):
                problems.append(
                    'unexpected %s: %s' % (label, node.xpath('string()')))

        if problems:
            # Put the details in the failure message itself rather than on
            # stdout, so they stay attached to the failing test.
            self.fail('readability result does not match expected\n'
                      + '\n'.join(problems))
 def test_si_sample(self):
     """Summarise the si sample, which has only an opening body element.

     The default summary is expected to begin with the html/body wrapper
     followed by the nested article divs.
     """
     page_url = 'http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html'
     markup = load_sample('si-game.sample.html')
     summary = Document(markup, url=page_url).summary()
     expected_prefix = '<html><body><div><div class'
     # Compare only the leading characters; the rest of the markup is not
     # pinned by this test.
     self.assertEqual(expected_prefix, summary[:len(expected_prefix)])
 def test_si_sample_full_summary(self):
     """We should parse the doc and get a full summary with confidence.

     Calls summary_with_metadata(enclose_with_html_tag=False) on the si
     sample and checks that the result exposes the html, confidence, title
     and short_title attributes, that its html begins with
     '<div><div class="', and that its confidence is greater than 50.
     """
     sample = load_sample('si-game.sample.html')
     doc = Document(
         sample,
         url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
     res = doc.summary_with_metadata(enclose_with_html_tag=False)

     # Build each message from the attribute name so a failure always names
     # the attribute that is actually missing.
     for attrib in ('html', 'confidence', 'title', 'short_title'):
         self.assertTrue(hasattr(res, attrib),
             'res should have a %s attrib' % attrib)

     self.assertEqual('<div><div class="', res.html[0:17])
     self.assertGreater(res.confidence, 50,
         'The confidence score should be larger than 50: ' + str(res.confidence))
 def test_si_sample_html_partial(self):
     """Using the si sample, make sure we can get the article alone.

     With enclose_with_html_tag=False the summary is expected to start
     directly with the article divs.
     """
     page_url = 'http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html'
     markup = load_sample('si-game.sample.html')
     article = Document(markup, url=page_url).summary(
         enclose_with_html_tag=False)
     expected_prefix = '<div><div class="'
     # Only the leading characters are compared.
     self.assertEqual(expected_prefix, article[:len(expected_prefix)])
def process_article(article):
    """Summarise the named sample and assert the expected leading markup.

    `article` is the name passed to load_sample(). The summary of the
    resulting Document must begin with '<html><body><div><div class';
    otherwise the assertion fails with a message naming the article.
    """
    summary = Document(load_sample(article)).summary()
    expected_prefix = '<html><body><div><div class'
    message = "Failed to process the article: " + article
    assert expected_prefix == summary[:len(expected_prefix)], message