def test_multipleRoot(self):
        '''
            test_multipleRoot - Parse two sibling <div> tags (ids "div1" and "div2") and check that
              the parser reports two root nodes, that each is findable by id, and that the
              parser's HTML matches the two elements' outerHTML concatenated.
        '''
        parser = AdvancedHTMLParser()

        root1 = AdvancedTag('div')
        root1.setAttribute('id', 'div1')

        root2 = AdvancedTag('div')
        root2.setAttribute('id', 'div2')

        # Feed both tags back-to-back so neither is nested inside the other.
        parser.parseStr(root1.outerHTML + root2.outerHTML)

        assert len(
            parser.getRootNodes()) == 2, 'Expected two root nodes on tree'

        foundRoot1 = parser.getElementById('div1')
        assert foundRoot1, 'Expected to find id=div1 in multi-root tree'

        foundRoot2 = parser.getElementById('div2')
        # Message names div2 (it previously said div1, which misreported the failing lookup).
        assert foundRoot2, 'Expected to find id=div2 in multi-root tree'

        # Newlines are removed and the ends stripped on both sides before comparing.
        combinedHTML = (foundRoot1.outerHTML + foundRoot2.outerHTML).replace(
            '\n', '').strip()
        parsedHTML = parser.getHTML().replace('\n', '').strip()

        assert combinedHTML == parsedHTML, 'Expected single element outerHTMLs to match parser HTML. """\n%s\n""" != """\n%s\n"""' % (
            combinedHTML, parsedHTML)
    def test_multipleRootsWithExternalTextSameReturn(self):
        '''
            test_multipleRootsWithExternalTextSameReturn - Parse two sibling <span> tags that each
              have loose text after them, and check that the parser's HTML (with newlines and
              spaces removed) equals the input.
        '''
        html = """<span>Hello</span>Outside<span>World</span>End"""

        parser = AdvancedHTMLParser()
        parser.parseStr(html)

        # Drop newlines first, then spaces, before comparing against the input.
        rendered = parser.getHTML()
        withoutNewlines = rendered.replace('\n', '')
        strippedHTML = withoutNewlines.replace(' ', '')

        assert strippedHTML == html, (
            "Expected multiple root nodes with text between the nodes to retain, '%s' == '%s'"
            % (html, strippedHTML)
        )
    def test_multipleRootsSameReturn(self):
        '''
            test_multipleRootsSameReturn - Parse two sibling <span> tags and check that the
              parser's HTML (with newlines and spaces removed) equals the input.
        '''
        html = """<span>Hello</span><span>World</span>"""

        parser = AdvancedHTMLParser()
        parser.parseStr(html)

        # Remove newlines and spaces from the output before comparing.
        strippedHTML = parser.getHTML()
        for unwanted in ('\n', ' '):
            strippedHTML = strippedHTML.replace(unwanted, '')

        assert strippedHTML == html, (
            "Expected multiple root nodes to retain, '%s' == '%s'" % (html, strippedHTML)
        )
    def test_multipleRootsWithExternalTextSameReturn(self):
        '''
            test_multipleRootsWithExternalTextSameReturn - Check that text placed between and
              after top-level <span> tags is kept: the parser's HTML, minus newlines and spaces,
              must equal the input string.
        '''
        html = """<span>Hello</span>Outside<span>World</span>End"""

        parser = AdvancedHTMLParser()
        parser.parseStr(html)

        # Remove newlines and spaces from the output before comparing.
        strippedHTML = parser.getHTML()
        for unwanted in ('\n', ' '):
            strippedHTML = strippedHTML.replace(unwanted, '')

        assert strippedHTML == html, (
            "Expected multiple root nodes with text between the nodes to retain, '%s' == '%s'"
            % (html, strippedHTML)
        )
    def test_multipleRootsSameReturn(self):
        '''
            test_multipleRootsSameReturn - Check that two top-level <span> tags survive a
              parse/serialize cycle: the parser's HTML, minus newlines and spaces, must equal
              the input string.
        '''
        html = """<span>Hello</span><span>World</span>"""

        parser = AdvancedHTMLParser()
        parser.parseStr(html)

        # Drop newlines first, then spaces, before comparing against the input.
        rendered = parser.getHTML()
        withoutNewlines = rendered.replace('\n', '')
        strippedHTML = withoutNewlines.replace(' ', '')

        assert strippedHTML == html, (
            "Expected multiple root nodes to retain, '%s' == '%s'" % (html, strippedHTML)
        )
Esempio n. 6
0
    def test_refTag(self):
        '''
            test_refTag - Parse a paragraph containing the entities &lt; and &gt; and check that
              the parser's HTML still contains the entities rather than literal "<html>" text.
        '''
        sourceHTML = """<html><body><p>This is &lt;html&gt;</p></body></html>"""

        parser = AdvancedHTMLParser()
        parser.parseStr(sourceHTML)

        # Normalize the output: remove newlines and collapse "html " to "html".
        resultHTML = parser.getHTML()
        resultHTML = resultHTML.replace('\n', '')
        resultHTML = resultHTML.replace('html ', 'html')

        assert 'This is <html>' not in resultHTML, (
            'Expected to retain &lt; and &gt;, got %s' % (resultHTML,)
        )
        assert 'This is &lt;html&gt;' in resultHTML, (
            'Expected to retain &lt; and &gt;, got %s' % (resultHTML,)
        )
    def test_untaggedText(self):
        '''
            test_untaggedText - Parse a fragment that has bare text ("\\n29\\n") sitting between
              tags and check that the text is still present in the parser's HTML.
        '''
        inputHTML = """    <span class="WebRupee">Rs.</span>\n29\n<br/><font style="font-size:smaller;font-weight:normal">\n3 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n59\n<br/><font style="font-size:smaller;font-weight:normal">\n7 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n99\n<br/><font style="font-size:smaller;font-weight:normal">\n12 days\n</font></td>"""

        parser = AdvancedHTMLParser()
        parser.parseStr(inputHTML)

        outputHTML = parser.getHTML()

        # The price text follows the first closing </span>, outside of any tag in the input.
        assert '\n29\n' in outputHTML, (
            'Expected to find item outside tags: \\n29\\n in ' + str(outputHTML)
        )
    def test_untaggedText(self):
        '''
            test_untaggedText - Check that bare text located between tags ("\\n29\\n") is
              retained in the HTML returned by the parser.
        '''
        expectedText = '\n29\n'
        sourceHTML = """    <span class="WebRupee">Rs.</span>\n29\n<br/><font style="font-size:smaller;font-weight:normal">\n3 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n59\n<br/><font style="font-size:smaller;font-weight:normal">\n7 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n99\n<br/><font style="font-size:smaller;font-weight:normal">\n12 days\n</font></td>"""

        parser = AdvancedHTMLParser()
        parser.parseStr(sourceHTML)

        resultHTML = parser.getHTML()

        assert expectedText in resultHTML, (
            'Expected to find item outside tags: \\n29\\n in ' + str(resultHTML)
        )
    def test_commentRetainedPriorRoot(self):
        '''
            test_commentRetainedPriorRoot - Parse a document with a comment placed before the
              <html> tag and check that the comment text appears in the parser's HTML.
        '''
        sourceHTML = """<!-- CommentX --><html>
        <body><span>Hello</span></body></html>"""

        parser = AdvancedHTMLParser()
        parser.parseStr(sourceHTML)

        retHTML = parser.getHTML()
        commentText = 'CommentX'

        assert commentText in retHTML, (
            'Expected to find comment, "CommentX" in returned HTML: "%s"' % (retHTML,)
        )
    def test_textPriorToRoot(self):
        '''
            test_textPriorToRoot - Parse a document with text placed before the <html> tag and
              check that the parser's HTML (newlines removed) still starts with that text.
        '''
        sourceHTML = """Hello<html><span id="one">Cheese</span><div>Goodbye</div></html>"""

        parser = AdvancedHTMLParser()
        parser.parseStr(sourceHTML)

        rendered = parser.getHTML()
        strippedHTML = rendered.replace('\n', '')

        assert strippedHTML.startswith('Hello'), (
            'Expected text before root tag to be retained, got "%s"' % (strippedHTML,)
        )
    def test_commentRetainedAfterRoot(self):
        '''
            test_commentRetainedAfterRoot - Parse a document with a comment placed after the
              closing </html> tag and check that the comment text appears in the parser's HTML.
        '''
        sourceHTML = """<html>
        <body><span>Hello</span></body></html><!-- CommentX -->"""

        parser = AdvancedHTMLParser()
        parser.parseStr(sourceHTML)

        retHTML = parser.getHTML()
        commentText = 'CommentX'

        assert commentText in retHTML, (
            'Expected to find comment, "CommentX" in returned HTML: "%s"' % (retHTML,)
        )
Esempio n. 12
0
    def test_refTag(self):
        '''
            test_refTag - Check that the entities &lt; and &gt; inside a paragraph are kept as
              entities in the parser's HTML and are not turned into a literal "<html>".
        '''
        markup = """<html><body><p>This is &lt;html&gt;</p></body></html>"""

        parser = AdvancedHTMLParser()
        parser.parseStr(markup)

        # Normalize the output: remove newlines, then collapse "html " to "html".
        normalized = parser.getHTML()
        for old, new in (('\n', ''), ('html ', 'html')):
            normalized = normalized.replace(old, new)

        assert 'This is <html>' not in normalized, (
            'Expected to retain &lt; and &gt;, got %s' % (normalized,)
        )
        assert 'This is &lt;html&gt;' in normalized, (
            'Expected to retain &lt; and &gt;, got %s' % (normalized,)
        )
    def test_retainOriginalWhitespace(self):
        '''
            test_retainOriginalWhitespace - Test that the whitespace of the parsed document
              (TEST_HTML, defined outside this method) is retained in the parser's HTML.
        '''
        # Not expected to equal the original HTML exactly, because some tag issues
        #  are fixed up, like ' >'
        expectedHtml = '<html ><head ><title >Hello World</title></head>\n <body >\n <div >Hello world <span >And welcome to the show.</span>\n </div>\n </body></html>'

        parser = AdvancedHTMLParser()
        parser.parseStr(TEST_HTML)

        rawHtml = parser.getHTML()

        assert rawHtml == expectedHtml, 'Did not retain original whitespace like expected'
    def test_retainOriginalWhitespace(self):
        '''
            test_retainOriginalWhitespace - Parse TEST_HTML (defined outside this method) and
              check that the parser's HTML matches the expected whitespace-preserving output.
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr(TEST_HTML)

        # The expected text differs from the original HTML because some tag issues
        #  get fixed up, like ' >'
        expectedPieces = (
            '<html ><head ><title >Hello World</title></head>\n',
            ' <body >\n',
            ' <div >Hello world <span >And welcome to the show.</span>\n',
            ' </div>\n',
            ' </body></html>',
        )
        expectedHtml = ''.join(expectedPieces)

        rawHtml = parser.getHTML()

        assert rawHtml == expectedHtml, 'Did not retain original whitespace like expected'
Esempio n. 15
0
    def test_nbsp(self):
        '''
            test_nbsp - Check handling of the &nbsp; entity: it is kept when present in the
              input (once or twice in a row) and is not introduced when the input has a
              plain space.
        '''

        def parseAndNormalize(markup):
            # Parse with a fresh parser; remove newlines and collapse "html " to "html".
            parser = AdvancedHTMLParser()
            parser.parseStr(markup)
            return parser.getHTML().replace('\n', '').replace('html ', 'html')

        # A single &nbsp; must be retained.
        html = parseAndNormalize("""<html><body><p>Test&nbsp;One</p></body></html>""")
        assert '&nbsp;' in html, (
            '(Will fail in python2..) Expected to retain &nbsp; got %s' % (html,)
        )

        # A plain space must not turn into &nbsp;.
        html = parseAndNormalize("""<html><body><p>Test One</p></body></html>""")
        assert '&nbsp;' not in html, (
            '(Will fail in python2..) Expected not to insert &nbsp; got %s' % (html,)
        )

        # Two consecutive &nbsp; must both be retained.
        html = parseAndNormalize("""<html><body><p>Test&nbsp;&nbsp;One</p></body></html>""")
        assert 'Test&nbsp;&nbsp;One' in html, (
            '(Will fail in python2..) Expected to retain original data with two &nbsp; got %s' % (html,)
        )
    def test_textPriorToRoot(self):
        '''
            test_textPriorToRoot - Check that text appearing before the <html> tag is kept at
              the start of the parser's HTML (compared after removing newlines).
        '''
        markup = """Hello<html><span id="one">Cheese</span><div>Goodbye</div></html>"""

        parser = AdvancedHTMLParser()
        parser.parseStr(markup)

        strippedHTML = parser.getHTML().replace('\n', '')
        hasLeadingText = strippedHTML.startswith('Hello')

        assert hasLeadingText, (
            'Expected text before root tag to be retained, got "%s"' % (strippedHTML,)
        )
Esempio n. 17
0
    def test_nbsp(self):
        '''
            test_nbsp - Run three parse/serialize checks on the &nbsp; entity: one occurrence is
              kept, a plain space does not become &nbsp;, and two occurrences in a row are kept.
        '''
        # Case 1: a single &nbsp; in the input.
        singleParser = AdvancedHTMLParser()
        singleParser.parseStr("""<html><body><p>Test&nbsp;One</p></body></html>""")
        singleOut = singleParser.getHTML()
        singleOut = singleOut.replace('\n', '').replace('html ', 'html')
        assert '&nbsp;' in singleOut, (
            '(Will fail in python2..) Expected to retain &nbsp; got %s' % (singleOut,)
        )

        # Case 2: a plain space in the input.
        plainParser = AdvancedHTMLParser()
        plainParser.parseStr("""<html><body><p>Test One</p></body></html>""")
        plainOut = plainParser.getHTML()
        plainOut = plainOut.replace('\n', '').replace('html ', 'html')
        assert '&nbsp;' not in plainOut, (
            '(Will fail in python2..) Expected not to insert &nbsp; got %s' % (plainOut,)
        )

        # Case 3: two consecutive &nbsp; in the input.
        doubleParser = AdvancedHTMLParser()
        doubleParser.parseStr("""<html><body><p>Test&nbsp;&nbsp;One</p></body></html>""")
        doubleOut = doubleParser.getHTML()
        doubleOut = doubleOut.replace('\n', '').replace('html ', 'html')
        assert 'Test&nbsp;&nbsp;One' in doubleOut, (
            '(Will fail in python2..) Expected to retain original data with two &nbsp; got %s' % (doubleOut,)
        )
Esempio n. 18
0
    def test_multipleRoot(self):
        '''
            test_multipleRoot - Parse two sibling <div> tags (ids "div1" and "div2") and check that
              the parser reports two root nodes, that each is findable by id, and that the
              parser's HTML matches the two elements' outerHTML concatenated.
        '''
        parser = AdvancedHTMLParser()

        root1 = AdvancedTag('div')
        root1.setAttribute('id', 'div1')

        root2 = AdvancedTag('div')
        root2.setAttribute('id', 'div2')

        # Feed both tags back-to-back so neither is nested inside the other.
        parser.parseStr(root1.outerHTML + root2.outerHTML)

        assert len(parser.getRootNodes()) == 2, 'Expected two root nodes on tree'

        foundRoot1 = parser.getElementById('div1')
        assert foundRoot1, 'Expected to find id=div1 in multi-root tree'

        foundRoot2 = parser.getElementById('div2')
        # Message names div2 (it previously said div1, which misreported the failing lookup).
        assert foundRoot2, 'Expected to find id=div2 in multi-root tree'

        # Newlines are removed and the ends stripped on both sides before comparing.
        combinedHTML = (foundRoot1.outerHTML + foundRoot2.outerHTML).replace('\n', '').strip()
        parsedHTML = parser.getHTML().replace('\n', '').strip()

        assert combinedHTML == parsedHTML, 'Expected single element outerHTMLs to match parser HTML. """\n%s\n""" != """\n%s\n"""' %(combinedHTML, parsedHTML)