Esempio n. 1
0
    def test_extract_body(self):
        """Test extracting the body text.

        """
        html = '<html><body><p>Hello  World</p></body></html>'
        self.assertEqual(htmltotext.extract(html).title, u'')
        self.assertEqual(htmltotext.extract(html).description, u'')
        self.assertEqual(htmltotext.extract(html).content, u'Hello World')
Esempio n. 2
0
    def test_extract_body(self):
        """Test extracting the body text.

        """
        html = '<html><body><p>Hello  World</p></body></html>'
        self.assertEqual(htmltotext.extract(html).title, u'')
        self.assertEqual(htmltotext.extract(html).description, u'')
        self.assertEqual(htmltotext.extract(html).content, u'Hello World\n\n')
Esempio n. 3
0
    def test_meta_description(self):
        """Test supplying a meta description tag.

        """
        html = '<meta name="robots" content="noindex"/><title>foo</title><body>body</body>'
        self.assertEqual(htmltotext.extract(html).title, u'')
        self.assertEqual(htmltotext.extract(html).description, u'')
        self.assertEqual(htmltotext.extract(html).keywords, u'')
        self.assertEqual(htmltotext.extract(html).indexing_allowed, False)
Esempio n. 4
0
    def test_meta_description(self):
        """Test supplying a meta description tag.

        """
        html = '<meta name="robots" content="noindex"/><title>foo</title><body>body</body>'
        self.assertEqual(htmltotext.extract(html).title, u'')
        self.assertEqual(htmltotext.extract(html).description, u'')
        self.assertEqual(htmltotext.extract(html).keywords, u'')
        self.assertEqual(htmltotext.extract(html).indexing_allowed, False)
Esempio n. 5
0
    def test_meta_content_type(self):
        """Test supplying a meta http-equiv tag to set the character set
        incorrectly.

        Will raise an error when trying to convert the output to unicode.

        """
        html = '<meta http-equiv="content-type" content="charset=utf8"/><title>foo\xa3</title>'
        self.assertEqual(htmltotext.extract(html).title, u'foo')
        self.assertEqual(htmltotext.extract(html).badly_encoded, True)
Esempio n. 6
0
    def test_meta_description(self):
        """Test supplying a meta description tag.

        """
        html = '<meta name="description" content="desc"/><title>foo</title><body>body</body>'
        self.assertEqual(htmltotext.extract(html).title, u'foo')
        self.assertEqual(htmltotext.extract(html).description, u'desc')
        self.assertEqual(htmltotext.extract(html).keywords, u'')
        self.assertEqual(htmltotext.extract(html).indexing_allowed, True)
        self.assertEqual(htmltotext.extract(html).badly_encoded, False)
Esempio n. 7
0
    def test_meta_content_type(self):
        """Test supplying a meta http-equiv tag to set the character set
        incorrectly.

        Will raise an error when trying to convert the output to unicode.

        """
        html = '<meta http-equiv="content-type" content="charset=utf8"/><title>foo\xa3</title>'
        self.assertEqual(htmltotext.extract(html).title, u'foo')
        self.assertEqual(htmltotext.extract(html).badly_encoded, True)
Esempio n. 8
0
    def test_meta_description(self):
        """Test supplying a meta description tag.

        """
        html = '<meta name="description" content="desc"/><title>foo</title><body>body</body>'
        self.assertEqual(htmltotext.extract(html).title, u'foo')
        self.assertEqual(htmltotext.extract(html).description, u'desc')
        self.assertEqual(htmltotext.extract(html).keywords, u'')
        self.assertEqual(htmltotext.extract(html).indexing_allowed, True)
        self.assertEqual(htmltotext.extract(html).badly_encoded, False)
Esempio n. 9
0
 def test_default_charset(self):
     """Test a conversion supplying a string, which should use the default
     character set.
     
     """
     html = '<title>foo\xa3</title>'
     self.assertEqual(htmltotext.extract(html).title, u'foo\xa3')
Esempio n. 10
0
 def test_default_charset(self):
     """Test a conversion supplying a string, which should use the default
     character set.
     
     """
     html = '<title>foo\xa3</title>'
     self.assertEqual(htmltotext.extract(html).title, u'foo\xa3')
Esempio n. 11
0
 def test_entity_decode(self):
     """Tests decoding of entities"""
     html = '<body>This &nbsp; &nbsp; has some extra spaces. and has &#x0022;quotes&#34;</body>'
     parsed = htmltotext.extract(html)
     self.assertEqual(
         u'This \xa0 \xa0 has some extra spaces. and has "quotes"\n',
         parsed.content)
Esempio n. 12
0
    def test_link1(self):
        """Test parsing some links.

        """
        html = '<title>Here it <p/>is</title><body><foo class="1" id=top>body <a href="bar">link content</a><b/><a href="/foo2">2</p><a class="foo" href="http://bar.com/foo3">3</a> end body</foo><a href="test"><i>mo<em>re</em></i><b>test</a></body>'
        parsed = htmltotext.extract(html)
        self.assertEqual(parsed.title, u'Here it is')
        self.assertEqual(parsed.description, u'')
        self.assertEqual(parsed.keywords, u'')
        self.assertEqual(parsed.indexing_allowed, True)
        self.assertEqual(parsed.content,
                         'body link content2 3 end bodymoretest')
        self.assertEqual(len(parsed.links), 4)

        self.assertEqual(parsed.links[0].target, u'bar')
        self.assertEqual(parsed.links[0].text, u'link content')
        self.assertEqual(parsed.links[0].para, u'body link content2')
        self.assertEqual(parsed.links[0].start_pos, 4)
        self.assertEqual(parsed.links[1].target, u'/foo2')
        self.assertEqual(parsed.links[1].text, u'2')
        self.assertEqual(parsed.links[1].para, u'body link content2')
        self.assertEqual(parsed.links[1].start_pos, 17)
        self.assertEqual(parsed.links[2].target, u'http://bar.com/foo3')
        self.assertEqual(parsed.links[2].text, u'3')
        self.assertEqual(parsed.links[2].para, u'3 end bodymoretest')
        self.assertEqual(parsed.links[2].start_pos, 18)
        self.assertEqual(parsed.links[3].target, u'test')
        self.assertEqual(parsed.links[3].text, u'moretest')
        self.assertEqual(parsed.links[3].para, u'3 end bodymoretest')
        self.assertEqual(parsed.links[3].start_pos, 29)

        self.assertEqual(len(parsed.links[0].parent_tags), 3)
        self.assertEqual(parsed.links[0].parent_tags[0].name, "body")
        self.assertEqual(parsed.links[0].parent_tags[0].cls, "")
        self.assertEqual(parsed.links[0].parent_tags[0].id, "")
        self.assertEqual(parsed.links[0].parent_tags[1].name, "foo")
        self.assertEqual(parsed.links[0].parent_tags[1].cls, "1")
        self.assertEqual(parsed.links[0].parent_tags[1].id, "top")

        self.assertEqual(parsed.links[0].parent_tags[-1].name, "a")
        self.assertEqual(parsed.links[0].parent_tags[-1].cls, "")
        self.assertEqual(parsed.links[1].parent_tags[-1].name, "a")
        self.assertEqual(parsed.links[1].parent_tags[-1].cls, "")
        self.assertEqual(parsed.links[2].parent_tags[-1].name, "a")
        self.assertEqual(parsed.links[2].parent_tags[-1].cls, "foo")
        self.assertEqual(parsed.links[3].parent_tags[-1].name, "a")
        self.assertEqual(parsed.links[3].parent_tags[-1].cls, "")

        self.assertEqual(parsed.links[0].child_tags, [])
        self.assertEqual(parsed.links[1].child_tags, [])
        self.assertEqual(parsed.links[2].child_tags, [])
        self.assertEqual(len(parsed.links[3].child_tags), 3)
        self.assertEqual(parsed.links[3].child_tags[0].name, 'i')
        self.assertEqual(parsed.links[3].child_tags[1].name, 'em')
        self.assertEqual(parsed.links[3].child_tags[2].name, 'b')

        self.assertEqual(parsed.parastarts, [0, 18, 37])
Esempio n. 13
0
    def test_link2(self):
        """Test parsing some links interspersed with tags which don't close.

        """
        html = '<body><div><a href="a1"></a><br></div><a href="a2"></a><br><a href="a3"></a></body>'
        parsed = htmltotext.extract(html)
        self.assertEqual([t.name for t in parsed.links[0].parent_tags], ['body', 'div', 'a'])
        self.assertEqual([t.name for t in parsed.links[1].parent_tags], ['body', 'a'])
        self.assertEqual([t.name for t in parsed.links[2].parent_tags], ['body', 'a'])
Esempio n. 14
0
    def test_link1(self):
        """Test parsing some links.

        """
        html = '<title>Here it <p/>is</title><body><foo class="1" id=top>body <a href="bar">link content</a><b/><a href="/foo2">2</p><a class="foo" href="http://bar.com/foo3">3</a> end body</foo><a href="test"><i>mo<em>re</em></i><b>test</a></body>'
        parsed = htmltotext.extract(html)
        self.assertEqual(parsed.title, u'Here it\n is')
        self.assertEqual(parsed.description, u'')
        self.assertEqual(parsed.keywords, u'')
        self.assertEqual(parsed.indexing_allowed, True)
        self.assertEqual(parsed.content, 'body link content2\n3 end bodymoretest\n')
        self.assertEqual(len(parsed.links), 4)

        self.assertEqual(parsed.links[0].target, u'bar')
        self.assertEqual(parsed.links[0].text, u'link content')
        self.assertEqual(parsed.links[0].para, u'body link content2\n')
        self.assertEqual(parsed.links[0].start_pos, 4)
        self.assertEqual(parsed.links[1].target, u'/foo2')
        self.assertEqual(parsed.links[1].text, u'2\n')
        self.assertEqual(parsed.links[1].para, u'body link content2\n')
        self.assertEqual(parsed.links[1].start_pos, 17)
        self.assertEqual(parsed.links[2].target, u'http://bar.com/foo3')
        self.assertEqual(parsed.links[2].text, u'3')
        self.assertEqual(parsed.links[2].para, u'3 end bodymoretest\n')
        self.assertEqual(parsed.links[2].start_pos, 19)
        self.assertEqual(parsed.links[3].target, u'test')
        self.assertEqual(parsed.links[3].text, u'moretest')
        self.assertEqual(parsed.links[3].para, u'3 end bodymoretest\n')
        self.assertEqual(parsed.links[3].start_pos, 29)

        self.assertEqual(len(parsed.links[0].parent_tags), 3)
        self.assertEqual(parsed.links[0].parent_tags[0].name, "body")
        self.assertEqual(parsed.links[0].parent_tags[0].cls, "")
        self.assertEqual(parsed.links[0].parent_tags[0].id, "")
        self.assertEqual(parsed.links[0].parent_tags[1].name, "foo")
        self.assertEqual(parsed.links[0].parent_tags[1].cls, "1")
        self.assertEqual(parsed.links[0].parent_tags[1].id, "top")

        self.assertEqual(parsed.links[0].parent_tags[-1].name, "a")
        self.assertEqual(parsed.links[0].parent_tags[-1].cls, "")
        self.assertEqual(parsed.links[1].parent_tags[-1].name, "a")
        self.assertEqual(parsed.links[1].parent_tags[-1].cls, "")
        self.assertEqual(parsed.links[2].parent_tags[-1].name, "a")
        self.assertEqual(parsed.links[2].parent_tags[-1].cls, "foo")
        self.assertEqual(parsed.links[3].parent_tags[-1].name, "a")
        self.assertEqual(parsed.links[3].parent_tags[-1].cls, "")

        self.assertEqual(parsed.links[0].child_tags, [])
        self.assertEqual(parsed.links[1].child_tags, [])
        self.assertEqual(parsed.links[2].child_tags, [])
        self.assertEqual(len(parsed.links[3].child_tags), 3)
        self.assertEqual(parsed.links[3].child_tags[0].name, 'i')
        self.assertEqual(parsed.links[3].child_tags[1].name, 'em')
        self.assertEqual(parsed.links[3].child_tags[2].name, 'b')

        self.assertEqual(parsed.parastarts, [0, 19, 38])
def html_filter_from_stream(stream):
    p = htmltotext.extract(stream)
    yield "title", p.title
    yield "content", p.content
    yield "description", p.description
    kw = p.keywords.strip()
    if len(kw) != 0:
        for keyword in kw.split(','):
            yield "keyword", p.keywords
        yield "invalid", "he he"
Esempio n. 16
0
    def test_link2(self):
        """Test parsing some links interspersed with tags which don't close.

        """
        html = '<body><div><a href="a1"></a><br></div><a href="a2"></a><br><a href="a3"></a></body>'
        parsed = htmltotext.extract(html)
        self.assertEqual([t.name for t in parsed.links[0].parent_tags],
                         ['body', 'div', 'a'])
        self.assertEqual([t.name for t in parsed.links[1].parent_tags],
                         ['body', 'a'])
        self.assertEqual([t.name for t in parsed.links[2].parent_tags],
                         ['body', 'a'])
Esempio n. 17
0
    def test_unicode_input(self):
        """Test supplying a unicode string as input.

        """
        html = u'<meta http-equiv="content-type" content="charset=latin1"/><title>foo\xa3</title>'
        self.assertEqual(htmltotext.extract(html).title, u'foo\xa3')
Esempio n. 18
0
    def test_meta_content_type(self):
        """Test supplying a meta http-equiv tag to set the character set.

        """
        html = '<meta http-equiv="content-type" content="charset=utf8"/><title>foo\xc2\xa3</title>'
        self.assertEqual(htmltotext.extract(html).title, u'foo\xa3')
Esempio n. 19
0
    def test_meta_content_type(self):
        """Test supplying a meta http-equiv tag to set the character set.

        """
        html = '<meta http-equiv="content-type" content="charset=utf8"/><title>foo\xc2\xa3</title>'
        self.assertEqual(htmltotext.extract(html).title, u'foo\xa3')
Esempio n. 20
0
    def test_unicode_input(self):
        """Test supplying a unicode string as input.

        """
        html = u'<meta http-equiv="content-type" content="charset=latin1"/><title>foo\xa3</title>'
        self.assertEqual(htmltotext.extract(html).title, u'foo\xa3')
Esempio n. 21
0
 def test_entity_decode(self):
     """Tests decoding of entities"""
     html = '<body>This &nbsp; &nbsp; has some extra spaces. and has &#x0022;quotes&#34;</body>'
     parsed = htmltotext.extract(html)
     self.assertEqual(u'This \xa0 \xa0 has some extra spaces. and has "quotes"\n', parsed.content)