def test_multipleRoot(self):
    '''
        test_multipleRoot - Parse two sibling <div> elements that have no common parent and
            check that both are kept as root nodes, that each can be found by id, and that
            the parser's HTML equals the two outerHTMLs concatenated (newlines ignored).
    '''
    parser = AdvancedHTMLParser()

    root1 = AdvancedTag('div')
    root1.setAttribute('id', 'div1')

    root2 = AdvancedTag('div')
    root2.setAttribute('id', 'div2')

    parser.parseStr(root1.outerHTML + root2.outerHTML)

    assert len(parser.getRootNodes()) == 2, 'Expected two root nodes on tree'

    foundRoot1 = parser.getElementById('div1')
    assert foundRoot1, 'Expected to find id=div1 in multi-root tree'

    # This lookup is for the second root, so the failure message names div2.
    foundRoot2 = parser.getElementById('div2')
    assert foundRoot2, 'Expected to find id=div2 in multi-root tree'

    # Newlines and surrounding whitespace are ignored on both sides of the comparison.
    combinedHTML = (foundRoot1.outerHTML + foundRoot2.outerHTML).replace('\n', '').strip()
    parsedHTML = parser.getHTML().replace('\n', '').strip()

    assert combinedHTML == parsedHTML, 'Expected single element outerHTMLs to match parser HTML. """\n%s\n""" != """\n%s\n"""' % (combinedHTML, parsedHTML)
def test_multipleRootsWithExternalTextSameReturn(self):
    '''
        test_multipleRootsWithExternalTextSameReturn - Two root <span> elements, each followed
            by bare text, should come back from getHTML equal to the input once newlines
            and spaces are removed from the output.
    '''
    sourceHTML = """<span>Hello</span>Outside<span>World</span>End"""

    htmlParser = AdvancedHTMLParser()
    htmlParser.parseStr(sourceHTML)

    # Remove newlines first, then spaces, before comparing against the input.
    outputHTML = htmlParser.getHTML()
    for unwanted in ('\n', ' '):
        outputHTML = outputHTML.replace(unwanted, '')

    assert outputHTML == sourceHTML, "Expected multiple root nodes with text between the nodes to retain, '%s' == '%s'" % (sourceHTML, outputHTML)
def test_multipleRootsSameReturn(self):
    '''
        test_multipleRootsSameReturn - Two adjacent root <span> elements should come back
            from getHTML equal to the input once newlines and spaces are removed.
    '''
    sourceHTML = """<span>Hello</span><span>World</span>"""

    htmlParser = AdvancedHTMLParser()
    htmlParser.parseStr(sourceHTML)

    # Remove newlines first, then spaces, before comparing against the input.
    outputHTML = htmlParser.getHTML()
    for unwanted in ('\n', ' '):
        outputHTML = outputHTML.replace(unwanted, '')

    assert outputHTML == sourceHTML, "Expected multiple root nodes to retain, '%s' == '%s'" % (sourceHTML, outputHTML)
def test_multipleRootsWithExternalTextSameReturn(self):
    '''
        test_multipleRootsWithExternalTextSameReturn - Parse two root <span> elements with
            loose text after each and require getHTML (minus newlines and spaces) to equal
            the original markup.
    '''
    originalMarkup = """<span>Hello</span>Outside<span>World</span>End"""

    treeParser = AdvancedHTMLParser()
    treeParser.parseStr(originalMarkup)

    rawOutput = treeParser.getHTML()
    withoutNewlines = rawOutput.replace('\n', '')
    compactOutput = withoutNewlines.replace(' ', '')

    assert compactOutput == originalMarkup, "Expected multiple root nodes with text between the nodes to retain, '%s' == '%s'" % (originalMarkup, compactOutput)
def test_multipleRootsSameReturn(self):
    '''
        test_multipleRootsSameReturn - Parse two adjacent root <span> elements and require
            getHTML (minus newlines and spaces) to equal the original markup.
    '''
    originalMarkup = """<span>Hello</span><span>World</span>"""

    treeParser = AdvancedHTMLParser()
    treeParser.parseStr(originalMarkup)

    rawOutput = treeParser.getHTML()
    withoutNewlines = rawOutput.replace('\n', '')
    compactOutput = withoutNewlines.replace(' ', '')

    assert compactOutput == originalMarkup, "Expected multiple root nodes to retain, '%s' == '%s'" % (originalMarkup, compactOutput)
def test_refTag(self):
    '''
        test_refTag - Text written with the entities &lt; and &gt; must remain escaped in the
            output of getHTML rather than being emitted as a literal "<html>" tag.
    '''
    html = """<html><body><p>This is &lt;html&gt;</p></body></html>"""

    parser = AdvancedHTMLParser()
    parser.parseStr(html)

    # Newlines are dropped and 'html ' is collapsed to 'html' before searching the output.
    html = parser.getHTML().replace('\n', '').replace('html ', 'html')

    # The two checks must look for different strings: the decoded form must be absent
    # and the escaped form must be present. Testing the same literal for both
    # "not in" and "in" could never pass.
    assert 'This is <html>' not in html, 'Expected to retain &lt; and &gt;, got %s' % (html,)
    assert 'This is &lt;html&gt;' in html, 'Expected to retain &lt; and &gt;, got %s' % (html,)
def test_untaggedText(self):
    '''
        test_untaggedText - Parse a fragment that has text sitting between tags (and closing
            </td> tags with no visible opener) and check that the loose text "\\n29\\n"
            is still present in the output of getHTML.
    '''
    fragment = """ <span class="WebRupee">Rs.</span>\n29\n<br/><font style="font-size:smaller;font-weight:normal">\n3 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n59\n<br/><font style="font-size:smaller;font-weight:normal">\n7 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n99\n<br/><font style="font-size:smaller;font-weight:normal">\n12 days\n</font></td>"""

    htmlParser = AdvancedHTMLParser()
    htmlParser.parseStr(fragment)

    renderedHTML = htmlParser.getHTML()

    assert '\n29\n' in renderedHTML, 'Expected to find item outside tags: \\n29\\n in ' + str(renderedHTML)
def test_untaggedText(self):
    '''
        test_untaggedText - Text that lives outside of any tag in the parsed fragment
            (here "\\n29\\n") must still appear in what getHTML returns.
    '''
    treeParser = AdvancedHTMLParser()

    sourceFragment = """ <span class="WebRupee">Rs.</span>\n29\n<br/><font style="font-size:smaller;font-weight:normal">\n3 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n59\n<br/><font style="font-size:smaller;font-weight:normal">\n7 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n99\n<br/><font style="font-size:smaller;font-weight:normal">\n12 days\n</font></td>"""
    treeParser.parseStr(sourceFragment)

    outputHTML = treeParser.getHTML()
    looseText = '\n29\n'

    assert looseText in outputHTML, 'Expected to find item outside tags: \\n29\\n in ' + str(outputHTML)
def test_commentRetainedPriorRoot(self):
    '''
        test_commentRetainedPriorRoot - A comment placed before the opening <html> tag must
            still be present in the output of getHTML.
    '''
    sourceHTML = """<!-- CommentX --><html> <body><span>Hello</span></body></html>"""

    htmlParser = AdvancedHTMLParser()
    htmlParser.parseStr(sourceHTML)

    outputHTML = htmlParser.getHTML()

    # Only the comment's text is searched for, not the full comment markup.
    assert 'CommentX' in outputHTML, 'Expected to find comment, "CommentX" in returned HTML: "%s"' % (outputHTML,)
def test_textPriorToRoot(self):
    '''
        test_textPriorToRoot - Text that appears before the opening <html> tag must remain
            at the start of the output of getHTML (newlines ignored).
    '''
    sourceHTML = """Hello<html><span id="one">Cheese</span><div>Goodbye</div></html>"""

    htmlParser = AdvancedHTMLParser()
    htmlParser.parseStr(sourceHTML)

    outputHTML = htmlParser.getHTML()
    flattenedHTML = outputHTML.replace('\n', '')

    assert flattenedHTML.startswith('Hello'), 'Expected text before root tag to be retained, got "%s"' % (flattenedHTML,)
def test_commentRetainedAfterRoot(self):
    '''
        test_commentRetainedAfterRoot - A comment placed after the closing </html> tag must
            still be present in the output of getHTML.
    '''
    sourceHTML = """<html> <body><span>Hello</span></body></html><!-- CommentX -->"""

    htmlParser = AdvancedHTMLParser()
    htmlParser.parseStr(sourceHTML)

    outputHTML = htmlParser.getHTML()

    # Only the comment's text is searched for, not the full comment markup.
    assert 'CommentX' in outputHTML, 'Expected to find comment, "CommentX" in returned HTML: "%s"' % (outputHTML,)
def test_refTag(self):
    '''
        test_refTag - Angle brackets supplied as the entities &lt; and &gt; inside text must
            stay escaped in the output of getHTML and must not turn into a real "<html>" tag.
    '''
    html = """<html><body><p>This is &lt;html&gt;</p></body></html>"""

    parser = AdvancedHTMLParser()
    parser.parseStr(html)

    # Newlines are dropped and 'html ' is collapsed to 'html' before searching the output.
    html = parser.getHTML().replace('\n', '').replace('html ', 'html')

    # The decoded form must be absent and the escaped form present; checking one
    # identical literal for both "not in" and "in" would make the test unpassable.
    assert 'This is <html>' not in html, 'Expected to retain &lt; and &gt;, got %s' % (html,)
    assert 'This is &lt;html&gt;' in html, 'Expected to retain &lt; and &gt;, got %s' % (html,)
def test_retainOriginalWhitespace(self):
    '''
        test_retainOriginalWhitespace - Parse TEST_HTML and require getHTML to return one
            exact, fixed string, whitespace included.
    '''
    htmlParser = AdvancedHTMLParser()
    htmlParser.parseStr(TEST_HTML)

    # The expected text is not TEST_HTML verbatim, because we fixup some tag issues, like ' >'
    expectedHTML = '<html ><head ><title >Hello World</title></head>\n <body >\n <div >Hello world <span >And welcome to the show.</span>\n </div>\n </body></html>'

    actualHTML = htmlParser.getHTML()

    assert actualHTML == expectedHTML, 'Did not retain original whitespace like expected'
def test_retainOriginalWhitespace(self):
    '''
        test_retainOriginalWhitespace - The HTML returned after parsing TEST_HTML must match
            a fixed expected string exactly, including its whitespace.
    '''
    # Differs from TEST_HTML itself because we fixup some tag issues, like ' >'
    wantedOutput = '<html ><head ><title >Hello World</title></head>\n <body >\n <div >Hello world <span >And welcome to the show.</span>\n </div>\n </body></html>'

    treeParser = AdvancedHTMLParser()
    treeParser.parseStr(TEST_HTML)
    producedOutput = treeParser.getHTML()

    assert producedOutput == wantedOutput, 'Did not retain original whitespace like expected'
def test_nbsp(self):
    '''
        test_nbsp - Check handling of the &nbsp; entity across a parse / getHTML round trip:
            a single &nbsp; in the input is retained, a plain space is not turned into
            &nbsp;, and two consecutive &nbsp; entities are both retained.
    '''
    def roundTrip(source):
        # Parse with a fresh parser, then drop newlines and collapse 'html ' to 'html'.
        parser = AdvancedHTMLParser()
        parser.parseStr(source)
        return parser.getHTML().replace('\n', '').replace('html ', 'html')

    # Each case must use a distinct input and search token; identical inputs asserted
    # with both "in" and "not in" would contradict each other.
    html = roundTrip("""<html><body><p>Test&nbsp;One</p></body></html>""")
    assert '&nbsp;' in html, '(Will fail in python2..) Expected to retain &nbsp; got %s' % (html,)

    html = roundTrip("""<html><body><p>Test One</p></body></html>""")
    assert '&nbsp;' not in html, '(Will fail in python2..) Expected not to insert &nbsp; got %s' % (html,)

    html = roundTrip("""<html><body><p>Test&nbsp;&nbsp;One</p></body></html>""")
    assert 'Test&nbsp;&nbsp;One' in html, '(Will fail in python2..) Expected to retain original data with two &nbsp; got %s' % (html,)
def test_textPriorToRoot(self):
    '''
        test_textPriorToRoot - Parse markup that has the text "Hello" ahead of the <html>
            tag and require the newline-stripped output of getHTML to start with it.
    '''
    treeParser = AdvancedHTMLParser()

    originalMarkup = """Hello<html><span id="one">Cheese</span><div>Goodbye</div></html>"""
    treeParser.parseStr(originalMarkup)

    singleLineOutput = treeParser.getHTML().replace('\n', '')
    startsWithText = singleLineOutput.startswith('Hello')

    assert startsWithText, 'Expected text before root tag to be retained, got "%s"' % (singleLineOutput,)
def test_nbsp(self):
    '''
        test_nbsp - Round-trip three inputs through parseStr / getHTML and check the &nbsp;
            entity: one &nbsp; is retained, a plain space does not become &nbsp;, and two
            consecutive &nbsp; entities are both retained.
    '''
    # Case 1: a single &nbsp; entity must still be present in the output.
    html = """<html><body><p>Test&nbsp;One</p></body></html>"""
    parser = AdvancedHTMLParser()
    parser.parseStr(html)
    html = parser.getHTML().replace('\n', '').replace('html ', 'html')
    assert '&nbsp;' in html, '(Will fail in python2..) Expected to retain &nbsp; got %s' % (html,)

    # Case 2: an ordinary space must not be rewritten as &nbsp;. This input has to
    # differ from case 1, otherwise the two assertions contradict each other.
    html = """<html><body><p>Test One</p></body></html>"""
    parser = AdvancedHTMLParser()
    parser.parseStr(html)
    html = parser.getHTML().replace('\n', '').replace('html ', 'html')
    assert '&nbsp;' not in html, '(Will fail in python2..) Expected not to insert &nbsp; got %s' % (html,)

    # Case 3: two consecutive &nbsp; entities must both be present in the output.
    html = """<html><body><p>Test&nbsp;&nbsp;One</p></body></html>"""
    parser = AdvancedHTMLParser()
    parser.parseStr(html)
    html = parser.getHTML().replace('\n', '').replace('html ', 'html')
    assert 'Test&nbsp;&nbsp;One' in html, '(Will fail in python2..) Expected to retain original data with two &nbsp; got %s' % (html,)
def test_multipleRoot(self):
    '''
        test_multipleRoot - Build two <div> tags with ids "div1" and "div2", parse their
            concatenated outerHTML, and check that the parser reports two root nodes, finds
            each by id, and returns HTML equal to the two outerHTMLs joined (newlines ignored).
    '''
    parser = AdvancedHTMLParser()

    root1 = AdvancedTag('div')
    root1.setAttribute('id', 'div1')

    root2 = AdvancedTag('div')
    root2.setAttribute('id', 'div2')

    parser.parseStr(root1.outerHTML + root2.outerHTML)

    assert len(parser.getRootNodes()) == 2, 'Expected two root nodes on tree'

    foundRoot1 = parser.getElementById('div1')
    assert foundRoot1, 'Expected to find id=div1 in multi-root tree'

    # The message names div2 so a failure here points at the element actually looked up.
    foundRoot2 = parser.getElementById('div2')
    assert foundRoot2, 'Expected to find id=div2 in multi-root tree'

    # Newlines and surrounding whitespace are ignored on both sides of the comparison.
    combinedHTML = (foundRoot1.outerHTML + foundRoot2.outerHTML).replace('\n', '').strip()
    parsedHTML = parser.getHTML().replace('\n', '').strip()

    assert combinedHTML == parsedHTML, 'Expected single element outerHTMLs to match parser HTML. """\n%s\n""" != """\n%s\n"""' % (combinedHTML, parsedHTML)