예제 #1
0
 def test_doc_no_scripts_styles(self):
     """The readable document must hold no script, style or link tags."""
     article = Article(load_snippet('document_scripts.html'))
     cleaned = article._readable
     # Each of these lookups has to come back empty.
     for xpath in (".//script", ".//style", ".//link"):
         self.assertEqual(cleaned.findall(xpath), [])
예제 #2
0
    def test_unlikely_hits(self):
        """Verify we wipe out things from our unlikely list.

        For every name in ``must_not_appear`` the readable document is
        searched by class and by id. A matching element is only tolerated
        when its class (or id) attribute also carries one of the
        ``want_to_appear`` values.

        """
        doc = Article(load_snippet('test_readable_unlikely.html'))
        readable = doc._readable
        must_not_appear = [
            'comment', 'community', 'disqus', 'extra', 'foot', 'header',
            'menu', 'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor',
            'ad-break', 'agegate', 'pagination', 'pager', 'popup', 'tweet',
            'twitter', 'imgBlogpostPermalink',
        ]

        want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

        for name in must_not_appear:
            # An element carrying an unwanted class may only survive when it
            # also carries one of the wanted classes.
            for node in readable.find_class(name):
                classes = node.get('class').split()
                self.assertTrue(
                    any(cls in want_to_appear for cls in classes),
                    "unwanted class %r survived" % name)

            # Inspect the element found by id itself, not whatever element
            # the class loop above happened to leave behind.
            by_id = readable.get_element_by_id(name, False)
            if by_id is not False:
                ids = by_id.get('id').split()
                self.assertTrue(
                    any(value in want_to_appear for value in ids),
                    "unwanted id %r survived" % name)
예제 #3
0
 def test_doc_no_scripts_styles(self):
     """The readable document must hold no script, style or link tags."""
     article = Article(load_snippet('document_scripts.html'))
     cleaned = article._readable
     # Each of these lookups has to come back empty.
     for xpath in (".//script", ".//style", ".//link"):
         self.assertEqual(cleaned.findall(xpath), [])
예제 #4
0
    def test_unlikely_hits(self):
        """Verify we wipe out things from our unlikely list.

        For every name in ``must_not_appear`` the readable document is
        searched by class and by id. A matching element is only tolerated
        when its class (or id) attribute also carries one of the
        ``want_to_appear`` values.

        """
        doc = Article(load_snippet('test_readable_unlikely.html'))
        readable = doc._readable
        must_not_appear = [
            'comment', 'community', 'disqus', 'extra', 'foot', 'header',
            'menu', 'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor',
            'ad-break', 'agegate', 'pagination', 'pager', 'popup', 'tweet',
            'twitter', 'imgBlogpostPermalink',
        ]

        want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

        for name in must_not_appear:
            # An element carrying an unwanted class may only survive when it
            # also carries one of the wanted classes.
            for node in readable.find_class(name):
                classes = node.get('class').split()
                self.assertTrue(
                    any(cls in want_to_appear for cls in classes),
                    "unwanted class %r survived" % name)

            # Inspect the element found by id itself, not whatever element
            # the class loop above happened to leave behind.
            by_id = readable.get_element_by_id(name, False)
            if by_id is not False:
                ids = by_id.get('id').split()
                self.assertTrue(
                    any(value in want_to_appear for value in ids),
                    "unwanted id %r survived" % name)
예제 #5
0
    def test_bare_content(self):
        """Bare content without html tags still produces a readable root.

        The readable root must be a div whose id is readabilityBody.

        """
        article = Article(load_snippet('document_only_content.html'))
        root = article._readable
        self.assertEqual(root.tag, 'div')
        self.assertEqual(root.get('id'), 'readabilityBody')
예제 #6
0
    def test_find_body_exists(self):
        """A document that has a body yields the standard readable root.

        The readable root must be a div whose id is readabilityBody.

        """
        article = Article(load_snippet('document_min.html'))
        root = article._readable
        self.assertEqual(root.tag, 'div')
        self.assertEqual(root.get('id'), 'readabilityBody')
예제 #7
0
    def test_body_doesnt_exist(self):
        """A document lacking a body still yields the standard readable root.

        The readable root must be a div whose id is readabilityBody.

        """
        article = Article(load_snippet('document_no_body.html'))
        root = article._readable
        self.assertEqual(root.tag, 'div')
        self.assertEqual(root.get('id'), 'readabilityBody')
예제 #8
0
    def test_bare_content(self):
        """Bare content without html tags still produces a readable root.

        The readable root must be a div whose id is readabilityBody.

        """
        article = Article(load_snippet('document_only_content.html'))
        root = article._readable
        self.assertEqual(root.tag, 'div')
        self.assertEqual(root.get('id'), 'readabilityBody')
예제 #9
0
    def test_body_doesnt_exist(self):
        """A document lacking a body still yields the standard readable root.

        The readable root must be a div whose id is readabilityBody.

        """
        article = Article(load_snippet('document_no_body.html'))
        root = article._readable
        self.assertEqual(root.tag, 'div')
        self.assertEqual(root.get('id'), 'readabilityBody')
예제 #10
0
    def test_find_body_exists(self):
        """A document that has a body yields the standard readable root.

        The readable root must be a div whose id is readabilityBody.

        """
        article = Article(load_snippet('document_min.html'))
        root = article._readable
        self.assertEqual(root.tag, 'div')
        self.assertEqual(root.get('id'), 'readabilityBody')
예제 #11
0
    def test_readin_with_base_url(self):
        """Passing a url should update links to be absolute links"""
        original = OriginalDocument(
            load_snippet('document_absolute_url.html'),
            url="http://blog.mitechie.com/test.html",
        )
        self.assertTrue(str(original).startswith(u'<html>'))

        # The document is expected to expose exactly three links.
        found_links = original.links
        self.assertEqual(len(found_links), 3)

        # Tally the links by whether their href starts with the blog url:
        # two of them should, the remaining one should not.
        tally = defaultdict(int)
        for anchor in found_links:
            href = anchor.get('href')
            if href.startswith('http://blog.mitechie.com'):
                bucket = 'blog'
            else:
                bucket = 'other'
            tally[bucket] += 1

        self.assertEqual(tally['blog'], 2)
        self.assertEqual(tally['other'], 1)
예제 #12
0
    def test_readin_with_base_url(self):
        """Passing a url should update links to be absolute links"""
        original = OriginalDocument(
            load_snippet('document_absolute_url.html'),
            url="http://blog.mitechie.com/test.html",
        )
        self.assertTrue(str(original).startswith(u'<html>'))

        # The document is expected to expose exactly three links.
        found_links = original.links
        self.assertEqual(len(found_links), 3)

        # Tally the links by whether their href starts with the blog url:
        # two of them should, the remaining one should not.
        tally = defaultdict(int)
        for anchor in found_links:
            href = anchor.get('href')
            if href.startswith('http://blog.mitechie.com'):
                bucket = 'blog'
            else:
                bucket = 'other'
            tally[bucket] += 1

        self.assertEqual(tally['blog'], 2)
        self.assertEqual(tally['other'], 1)
예제 #13
0
 def test_readin_min_document(self):
     """A minimal html document reads in with its markup and title."""
     original = OriginalDocument(load_snippet('document_min.html'))
     rendered = str(original)
     self.assertTrue(rendered.startswith(u'<html>'))
     self.assertEqual(original.title, 'Min Document Title')
예제 #14
0
 def test_several_links(self):
     """Link density of this document is 0.349 to three places."""
     article = Article(load_snippet('document_absolute_url.html'))
     density = get_link_density(article._readable)
     self.assertAlmostEqual(density, 0.349, places=3)
예제 #15
0
 def test_small_doc_no_links(self):
     """The minimal document should report a link density of zero."""
     doc = Article(load_snippet('document_min.html'))
     # Use the TestCase assertion instead of a bare assert so the check
     # is not stripped when Python runs with -O.
     self.assertEqual(
         0, get_link_density(doc._readable), "Still no link density")
예제 #16
0
 def test_several_links(self):
     """Link density of this document is 0.349 to three places."""
     article = Article(load_snippet('document_absolute_url.html'))
     density = get_link_density(article._readable)
     self.assertAlmostEqual(density, 0.349, places=3)
예제 #17
0
 def test_readin_min_document(self):
     """A minimal html document reads in with its markup and title."""
     original = OriginalDocument(load_snippet('document_min.html'))
     rendered = str(original)
     self.assertTrue(rendered.startswith(u'<html>'))
     self.assertEqual(original.title, 'Min Document Title')
예제 #18
0
 def test_load_doc(self):
     """Loading a document gives back a readable element tree."""
     article = Article(load_snippet('document_min.html'))
     # By default the readable document comes back rooted at a div tag.
     root_tag = article._readable.tag
     self.assertEqual(root_tag, 'div')
예제 #19
0
 def test_no_br_allowed(self):
     """No <br/> element may remain in the parsed document."""
     original = OriginalDocument(load_snippet('document_min.html'))
     leftover_br = original.html.find('.//br')
     self.assertIsNone(leftover_br)
예제 #20
0
 def test_load_doc(self):
     """Loading a document gives back a readable element tree."""
     article = Article(load_snippet('document_min.html'))
     # By default the readable document comes back rooted at a div tag.
     root_tag = article._readable.tag
     self.assertEqual(root_tag, 'div')
예제 #21
0
 def test_small_doc_no_links(self):
     """The minimal document should report a link density of zero."""
     doc = Article(load_snippet('document_min.html'))
     # Use the TestCase assertion instead of a bare assert so the check
     # is not stripped when Python runs with -O.
     self.assertEqual(
         0, get_link_density(doc._readable), "Still no link density")
예제 #22
0
 def test_no_br_allowed(self):
     """No <br/> element may remain in the parsed document."""
     original = OriginalDocument(load_snippet('document_min.html'))
     leftover_br = original.html.find('.//br')
     self.assertIsNone(leftover_br)