def test_ownerDocument(self):
        '''
            test_ownerDocument - Check the ownerDocument attribute:

                * equals the parser on the parsed "outer" element and on everything from getAllNodes()
                * is None on the result of cloneNode() (which also has no parent and no children)
                * is None on an element returned by removeChild, and on its getAllChildNodes()
        '''
        document = AdvancedHTMLParser()

        document.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>""")

        outerEm = document.getElementById('outer')

        assert outerEm.ownerDocument == document , 'Expected the ownerDocument to be set to parser'

        # Each node reported by getAllNodes() must point back at the parser
        for node in outerEm.getAllNodes():
            assert node.ownerDocument == document, 'Expected ownerDocument to be set on every element. Was not set on: %s' %(node.getStartTag(),)

        # The clone is expected to be fully detached
        cloneOfOuter = outerEm.cloneNode()

        assert cloneOfOuter.parentNode is None , 'Expected cloned child to have no parent'
        assert cloneOfOuter.ownerDocument is None , 'Expected cloned child to have no owner document'

        assert len(cloneOfOuter.children) == 0 , 'Expected cloned element to have no children'

        # Detach the first child element of "outer" and inspect what comes back
        firstChildOfOuter = outerEm.children[0]
        removedEm = outerEm.removeChild(firstChildOfOuter)

        assert removedEm , 'Expected removeChild to return removed element'

        assert removedEm.id == 'items' , 'Got wrong element, expected to remove "items", got: %s' %(removedEm.getStartTag(),)

        assert removedEm.ownerDocument is None , 'Expected owner document to be set to None after element was removed.'

        for descendant in removedEm.getAllChildNodes():
            assert descendant.ownerDocument is None, 'Expected owner document to be cleared on all children after removal from document'
    def test_appending(self):
        '''
            test_appending - Test adding new tags beneath a parsed element

                Appends a new div via appendNode and checks it is reachable by id, by name,
                and as the third entry of .children. Then inserts another div via insertAfter
                and checks the resulting order of ids.
        '''
        parser = AdvancedHTMLParser()

        parser.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1</div> <div name="item" id="item2" >item2</div> </div> </div>""")

        itemsEm = parser.getElementById('items')
        # Message names the element actually being fetched (id="items")
        assert itemsEm , 'Expected to get <div id="items">'

        assert len(itemsEm.children) == 2 , 'Expected two children'

        newItem =  AdvancedTag('div')
        newItem.setAttributes( {
            'name' : 'item',
            'id' : 'item3' }
        )

        itemsEm.appendNode(newItem)

        assert parser.getElementById('item3') , 'Expected to get item3 after append'
        assert len(parser.getElementsByName('item')) == 3, 'Expected after append that 3 nodes are  set'
        assert itemsEm.children[2].getAttribute('id') == 'item3' , 'Expected item3 to be the third child'

        newItem =  AdvancedTag('div')
        newItem.setAttributes( {
            'name' : 'item',
            'id' : 'item2point5' }
        )

        # Insert after the current second child (index 1)
        itemsEm.insertAfter(newItem, itemsEm.children[1])
        childIds = [x.id for x in itemsEm.getElementsByName('item')]

        assert childIds == ['item1', 'item2', 'item2point5', 'item3'] , 'Expected items to be ordered. Got: %s' %(str(childIds),)
    def test_multipleRootsWithExternalTextSameReturn(self):
        '''
            test_multipleRootsWithExternalTextSameReturn - Parse two root-level spans with bare text
                after each, and check that getHTML (newlines and spaces removed) equals the input.
        '''
        sourceHTML = """<span>Hello</span>Outside<span>World</span>End"""

        document = AdvancedHTMLParser()
        document.parseStr(sourceHTML)

        # Compare with newlines and spaces removed from the output
        returnedHTML = document.getHTML()
        strippedHTML = returnedHTML.replace('\n', '').replace(' ','')

        assert strippedHTML == sourceHTML, "Expected multiple root nodes with text between the nodes to retain, '%s' == '%s'" %(sourceHTML, strippedHTML)
    def test_parsing(self):
        '''
            test_parsing - Test that the parser properly handles several cases of class attribute,
                             and that they are mutable in expected ways thereafter.
        '''

        someHtml = '''<html><body>
        <div class="one two three" id="firstDiv">Some text</div>
        <div id="secondDiv">This one is empty</div>
        <div class="three ZZ AA" id="thirdDiv">Last one</div>
        <div class="" id="emptyClassDiv">Empty</div>
</body></html>'''

        document = AdvancedHTMLParser()
        document.parseStr(someHtml)

        firstDiv = document.getElementById('firstDiv')
        secondDiv = document.getElementById('secondDiv')
        thirdDiv = document.getElementById('thirdDiv')
        emptyClassDiv = document.getElementById('emptyClassDiv')


        assert firstDiv , 'Failed to get element by id="firstDiv"'
        assert secondDiv , 'Failed to get element by id="secondDiv"'
        assert thirdDiv , 'Failed to get element by id="thirdDiv"'
        assert emptyClassDiv , 'Failed to get element by id="emptyClassDiv"'

        firstDivHTML = firstDiv.getHTML()
        secondDivHTML = secondDiv.getHTML()
        thirdDivHTML = thirdDiv.getHTML()
        emptyClassDivHTML = emptyClassDiv.getHTML()

        assert 'class="one two three"' in firstDivHTML , 'Expected string of class to show up in parsed html. Got: ' + firstDivHTML
        assert 'class=' not in secondDivHTML , 'Expected class attribute to not be present when no class set. Got: ' + secondDivHTML
        assert 'class="three ZZ AA"' in thirdDivHTML , 'Expected string of class to show up in parsed html. Got: ' + thirdDivHTML
        assert 'class=' not in emptyClassDivHTML , 'Expected class attribute to not be present when class set to empty in parsed html, i.e. class="". Got: ' + emptyClassDivHTML


        assert firstDiv.className == "one two three" , "Expected parsed className to match 'one two three' Got: " + repr(firstDiv.className)
        assert secondDiv.className == "" , "Expected parsed lack of className to match empty string, \"\" Got: " + repr(secondDiv.className)
        assert thirdDiv.className == "three ZZ AA" , "Expected parsed className to match 'three ZZ AA' Got: " + repr(thirdDiv.className)

        assert emptyClassDiv.className == "" , "Expected parse empty className to remain empty string. Got: " + repr(emptyClassDiv.className)

        assert firstDiv.classList == ["one", "two", "three"] , 'wrong classList'
        assert secondDiv.classList == [] , "wrong classList"
        assert thirdDiv.classList == ["three", "ZZ", "AA"] , "wrong classList"
        assert emptyClassDiv.classList == [] , "Wrong classList"

        # Check that we can modify and it shows up
        firstDiv.setAttribute('class', 'cheese is good')

        firstDivHTML = firstDiv.getHTML()

        assert 'class="cheese is good"' in firstDivHTML , "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected in tag attribute. Got: " + firstDivHTML

        assert firstDiv.className == "cheese is good" , "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + firstDiv.className

        assert firstDiv.classList == ["cheese", "is", "good"] , "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + repr(firstDiv.classList)
    def test_multipleRootsSameReturn(self):
        '''
            test_multipleRootsSameReturn - Parse two sibling root-level spans and check that
                getHTML (newlines and spaces removed) equals the input.
        '''
        sourceHTML = """<span>Hello</span><span>World</span>"""

        document = AdvancedHTMLParser()
        document.parseStr(sourceHTML)

        # Compare with newlines and spaces removed from the output
        returnedHTML = document.getHTML()
        strippedHTML = returnedHTML.replace('\n', '').replace(' ','')

        assert strippedHTML == sourceHTML , "Expected multiple root nodes to retain, '%s' == '%s'" %(sourceHTML, strippedHTML)
    def test_firstLastChild(self):
        '''
            test_firstLastChild - test

                AdvancedTag.firstChild and AdvancedTag.firstElementChild
                AdvancedTag.lastChild and AdvancedTag.lastElementChild

              against an element with text on both ends and tags between, and against an empty element.
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr('<div id="main">Hello<div id="two">Blah</div><div id="emptyDiv"></div><div id="three">Three</div>End Text</div>')

        containerEm = parser.getElementById('main')

        assert containerEm , "Failed to get element by id='main'"
        assert containerEm.id == 'main' , 'Got wrong element for id="main"'

        # Leading side: text block first, then the first tag
        leadingNode = containerEm.firstChild
        assert leadingNode == 'Hello' , 'Expected .firstChild to return the first block child, str("Hello") but got: %s(%s)' %( leadingNode.__class__.__name__, repr(leadingNode))

        leadingEm = containerEm.firstElementChild
        assert issubclass(leadingEm.__class__, AdvancedTag) , 'Expected firstElementChild to return an AdvancedTag object. Got: ' + leadingEm.__class__.__name__
        assert leadingEm.tagName == 'div' and leadingEm.id == 'two' , 'Expected to get div id="two" as firstElementChild. Got: %s(%s)' %( leadingEm.__class__.__name__, repr(leadingEm))

        # Trailing side: text block last, preceded by the last tag
        trailingNode = containerEm.lastChild
        assert trailingNode == "End Text" , 'Expected .lastChild to return the last block child, str("End Text") but got: %s(%s)' %( trailingNode.__class__.__name__, repr(trailingNode))

        trailingEm = containerEm.lastElementChild
        assert issubclass(trailingEm.__class__, AdvancedTag) , 'Expected lastElementChild to return an AdvancedTag object. Got: ' + trailingEm.__class__.__name__
        assert trailingEm.tagName == 'div' and trailingEm.id == 'three' , 'Expected to get div id="three" as lastElementChild. Got: %s(%s)' %( trailingEm.__class__.__name__, repr(trailingEm))

        # An element with nothing inside: all four accessors are expected to give None
        hollowEm = parser.getElementById('emptyDiv')

        assert hollowEm , 'Failed to get element by id="emptyDiv"'
        assert hollowEm.id == 'emptyDiv' , 'Got wrong element for id="emptyDiv"'

        hollowFirst = hollowEm.firstChild
        assert hollowFirst is None , 'Expected empty div .firstChild to be None (null). Got: ' + repr(hollowFirst)

        hollowFirstEm = hollowEm.firstElementChild
        assert hollowFirstEm is None , 'Expected empty div .firstElementChild to be None (null). Got: ' + repr(hollowFirstEm)

        hollowLast = hollowEm.lastChild
        assert hollowLast is None , 'Expected empty div .lastChild to be None (null). Got: ' + repr(hollowLast)

        hollowLastEm = hollowEm.lastElementChild
        assert hollowLastEm is None , 'Expected empty div .lastElementChild to be None (null). Got: ' + repr(hollowLastEm)
    def test_ParseStr(self):
        '''
            test_ParseStr - Parse TEST_HTML via parseStr and check the element with id="farm":
                it must exist, have two children, and its first child's innerHTML must strip to "Moo".
        '''
        document = AdvancedHTMLParser()
        document.parseStr(TEST_HTML)

        farmEm = document.getElementById('farm')

        assert farmEm , 'Failed to extract data'
        assert len(farmEm.children) == 2 , 'Invalid data from file parsing'

        firstChildText = farmEm.children[0].innerHTML.strip()
        assert firstChildText == 'Moo' , 'Invalid data from file parsing'
    def test_refTag(self):
        '''
            test_refTag - Check that &lt; and &gt; in text are still escaped in getHTML output,
                rather than appearing as a literal <html> tag.
        '''
        sourceHTML = """<html><body><p>This is &lt;html&gt;</p></body></html>"""

        document = AdvancedHTMLParser()
        document.parseStr(sourceHTML)

        # Normalize output: remove newlines and collapse 'html ' to 'html'
        outputHTML = document.getHTML()
        outputHTML = outputHTML.replace('\n', '').replace('html ', 'html')

        assert 'This is <html>' not in outputHTML, 'Expected to retain &lt; and &gt;, got %s' %(outputHTML,)
        assert 'This is &lt;html&gt;' in outputHTML, 'Expected to retain &lt; and &gt;, got %s' %(outputHTML,)
    def test_untaggedText(self):
        '''
            test_untaggedText - Parse a fragment that has text sitting between tags and check
                that the text "\\n29\\n" is still present in getHTML output.
        '''
        fragment = """    <span class="WebRupee">Rs.</span>\n29\n<br/><font style="font-size:smaller;font-weight:normal">\n3 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n59\n<br/><font style="font-size:smaller;font-weight:normal">\n7 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n99\n<br/><font style="font-size:smaller;font-weight:normal">\n12 days\n</font></td>"""

        document = AdvancedHTMLParser()
        document.parseStr(fragment)

        outputHTML = document.getHTML()

        assert '\n29\n' in outputHTML , 'Expected to find item outside tags: \\n29\\n in ' + str(outputHTML)
    def test_HandleMultipleRoot(self):
        '''
            test_HandleMultipleRoot - Parse the MULTIPLE_ROOT fixture and check that parsing does
                not raise, that id="one" can be found, and that getRootNodes() returns two nodes.
        '''
        parser = AdvancedHTMLParser()
        try:
            parser.parseStr(MULTIPLE_ROOT)
        except Exception as e:
            # Include the underlying error so the failure report shows what actually went wrong
            raise AssertionError('Failed to properly parse invalid HTML with multiple root nodes: %s' %(repr(e),))

        oneEm = parser.getElementById('one')
        assert oneEm , 'Failed to find first element'

        rootNodes = parser.getRootNodes()
        assert len(rootNodes) == 2 , 'Expected two root nodes, got: %d' %(len(rootNodes),)
    def test_HandleMissClose(self):
        '''
            test_HandleMissClose - Parse the MISS_CLOSE fixture and check that parsing does not
                raise, that id="one" can be found, and that its first child's innerHTML strips to "Hello".
        '''
        parser = AdvancedHTMLParser()
        try:
            parser.parseStr(MISS_CLOSE)
        except Exception as e:
            # Include the underlying error so the failure report shows what actually went wrong
            raise AssertionError('Failed to properly parse invalid HTML with missed close: %s' %(repr(e),))

        oneEm = parser.getElementById('one')
        assert oneEm , 'Failed to find id="one"'
        assert oneEm.children[0].innerHTML.strip() == 'Hello' , 'Could not find child tag'
    def test_commentRetainedPriorRoot(self):
        '''
            test_commentRetainedPriorRoot - Check that a comment placed before the <html> tag
                still shows up in getHTML output.
        '''
        sourceHTML = """<!-- CommentX --><html>
        <body><span>Hello</span></body></html>"""

        document = AdvancedHTMLParser()
        document.parseStr(sourceHTML)

        outputHTML = document.getHTML()

        assert 'CommentX' in outputHTML, 'Expected to find comment, "CommentX" in returned HTML: "%s"' %(outputHTML,)
    def test_textPriorToRoot(self):
        '''
            test_textPriorToRoot - Check that text placed before the <html> tag is still at the
                start of getHTML output (newlines removed).
        '''
        sourceHTML = """Hello<html><span id="one">Cheese</span><div>Goodbye</div></html>"""

        document = AdvancedHTMLParser()
        document.parseStr(sourceHTML)

        outputHTML = document.getHTML()
        strippedHTML = outputHTML.replace('\n', '')

        assert strippedHTML.startswith('Hello') , 'Expected text before root tag to be retained, got "%s"' %(strippedHTML,)
    def test_encodingWorkingStr(self):
        '''
            test_encodingWorkingStr - Construct the parser with encoding='ascii' and check that
                parsing TEST_HTML raises UnicodeDecodeError.
        '''
        asciiParser = AdvancedHTMLParser(encoding='ascii')

        try:
            asciiParser.parseStr(TEST_HTML)
        except UnicodeDecodeError:
            gotException = True
        else:
            # Parsing went through without the decode error
            gotException = False

        assert gotException is True, 'Should have failed to parse unicode characters in ascii codec, probably not using passed encoding'
    def testPreviousSibling(self):
        '''
            testPreviousSibling - Test previousSibling and previousSiblingElement

                For id="one" the previous sibling is the text "Head Text" and there is no
                previous element. For id="two" both give the element with id="one".
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr('<div>Head Text<div id="one">An item</div><div id="two">Another item</div>More Text<div id="three">Last  item</div></div>')

        root = parser.getRoot()

        oneEm = root.getElementById('one')
        twoEm = root.getElementById('two')

        assert oneEm.previousSibling == 'Head Text' , 'Expected to get "Head Text" as first sibling'
        # Identity check against None, so no custom __eq__ is involved
        assert oneEm.previousSiblingElement is None , 'Expected to get no element prior to first sibling'

        assert twoEm.previousSibling.id == 'one' , 'Expected to get element  "one" prior to two'
        assert twoEm.previousSiblingElement.id == 'one' , 'Expected to get element  "one" prior to two'
    def test_getMiniHTML(self):
        '''
            test_getMiniHTML - Parse TEST_HTML and compare getMiniHTML() against the expected
                single-line output.
        '''
        expectedMiniHTML = '<html ><head ><title >Hello World</title></head> <body > <div >Hello world <span >And welcome to the show.</span> </div> </body></html>'

        document = AdvancedHTMLParser()
        document.parseStr(TEST_HTML)

        actualMiniHTML = document.getMiniHTML()

        assert actualMiniHTML == expectedMiniHTML
    def test_retainOriginalWhitespace(self):
        '''
            test_retainOriginalWhitespace - Parse TEST_HTML and compare getHTML() against the
                expected output, newlines and spaces included.
        '''
        # Not byte-equal to the input: some tag details are fixed up on output, like ' >'
        expectedHTML = '<html ><head ><title >Hello World</title></head>\n <body >\n <div >Hello world <span >And welcome to the show.</span>\n </div>\n </body></html>'

        document = AdvancedHTMLParser()
        document.parseStr(TEST_HTML)

        actualHTML = document.getHTML()

        assert actualHTML == expectedHTML , 'Did not retain original whitespace like expected'
    def testNextSibling(self):
        '''
            testNextSibling - Test nextSibling and nextSiblingElement

                For id="one" both give the element with id="two". For id="two" the next sibling
                is the text "More Text" and the next element is id="three". For id="three"
                (the last child) both are None.
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr('<div>Head Text<div id="one">An item</div><div id="two">Another item</div>More Text<div id="three">Last  item</div></div>')

        root = parser.getRoot()

        oneEm = root.getElementById('one')
        twoEm = root.getElementById('two')
        threeEm = root.getElementById('three')

        assert oneEm.nextSibling.id == 'two' , 'Expected to get element with id "two"'
        assert oneEm.nextSiblingElement.id == 'two' , 'Expected to get element with id "two"'

        # "Another item" is the text INSIDE id="two"; the sibling that follows it is "More Text"
        assert twoEm.nextSibling == 'More Text' , 'Expected to get text "More Text" after item id=two'
        assert twoEm.nextSiblingElement.id == 'three' , 'Expected to get element with id "three"'

        assert threeEm.nextSibling is None , 'Expected to get no element after id="three"'
        assert threeEm.nextSiblingElement is None , 'Expected to get no element after id="three"'
    def test_getFormattedHTML(self):
        '''
            test_getFormattedHTML - Tests the getFormattedHTML call for pretty-printing HTML

                Checks the output with no argument (expected output is indented two spaces
                per nesting level) and with a tab passed as the indent.
        '''
        parser = AdvancedHTMLParser()

        parser.parseStr(TEST_HTML)

        formattedHTML = parser.getFormattedHTML()

        # Message no longer claims "4 spaces": the expected output below uses two spaces per level
        assert formattedHTML == '\n<html >\n  <head >\n    <title >Hello World\n    </title>\n  </head> \n  <body > \n    <div >Hello world \n      <span >And welcome to the show.\n      </span> \n    </div> \n  </body>\n</html>' , 'Did not get expected formatting using default indent.'

        formattedHTMLTabIndent = parser.getFormattedHTML('\t')

        assert formattedHTMLTabIndent == '\n<html >\n\t<head >\n\t\t<title >Hello World\n\t\t</title>\n\t</head> \n\t<body > \n\t\t<div >Hello world \n\t\t\t<span >And welcome to the show.\n\t\t\t</span> \n\t\t</div> \n\t</body>\n</html>' , 'Did not get expected formatting using tabs.'
    def test_cloneNode(self):
        '''
            test_cloneNode - Check that cloneNode() gives a tag with the same id, class and cheese
                attribute values as the original, has no child elements, and leaves the
                original's two child elements in place.
        '''
        document = AdvancedHTMLParser()
        document.parseStr('''
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        ''')

        originalEm = document.getElementById('hello')
        clonedEm = originalEm.cloneNode()

        # Compare attribute by attribute
        attributeNames = ('id', 'class', 'cheese')
        for attributeName in attributeNames:
            originalValue = originalEm.getAttribute(attributeName, None)
            clonedValue = clonedEm.getAttribute(attributeName, None)
            assert originalValue == clonedValue, 'Expected cloneNode to return an exact copy, got different %s. %s != %s' %(attributeName, repr(originalValue), repr(clonedValue))

        assert originalEm.childElementCount == 2 , 'Expected original helloEm to retain two direct children'
        assert clonedEm.childElementCount == 0 , 'Expected clone to NOT copy children'
    def test_removeAndContains(self):
        '''
            test_removeAndContains - Check child/containment queries before and after remove()

                Before: id="items" has id="item1" as a child and contains both item1 and its
                sub item; the parser contains item1 and is its ownerDocument.
                After item1.remove(): none of those hold, lookups by id return None,
                and item1 has no parentNode or ownerDocument.
        '''
        document = AdvancedHTMLParser()

        document.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>""")

        listEm = document.getElementById('items')
        firstItemEm = document.getElementById('item1')
        nestedEm = document.getElementById('subItem1')

        # ---- State before removal ----
        assert listEm.hasChild(firstItemEm) is True, 'Expected itemsEm to have item1Em as a child.'

        assert document.getElementById('subItem1') is not None, 'Expected to find id=subItem1'

        assert listEm.contains(firstItemEm) , 'Expected itemsEm to contain items1Em'
        assert listEm.contains(nestedEm) , 'Expected itemsEm to contain subItem1'

        assert nestedEm.uid in listEm.getAllNodeUids()

        assert document.contains(firstItemEm) , 'Expected parser to contain item1Em via contains'
        assert firstItemEm in document, 'Expected parser to contain item1Em via in operator'

        assert firstItemEm.ownerDocument == document , 'Expected ownerDocument to be set prior to remove'

        # Take item1 out of the tree
        firstItemEm.remove()

        # ---- State after removal ----
        assert listEm.hasChild(firstItemEm) is False, 'Expected after remove for item1Em to no longer be a child of itemsEm'

        assert document.getElementById('item1') is None, 'Expected to not be able to find id=item1 after remove'

        assert document.getElementById('subItem1') is None, 'Expected to not be able to find sub item of id=item1, id=subItem1 after remove.'

        assert firstItemEm.parentNode is None , 'Expected parentNode on item1Em to be None after remove.'

        assert not listEm.contains(firstItemEm) , 'Expected itemsEm to not contain items1Em'
        assert not listEm.containsUid(firstItemEm.uid) , 'Expected itemsEm to not contain items1Em'
        assert not listEm.contains(nestedEm) , 'Expected itemsEm to not contain subItem1'

        assert nestedEm.uid not in listEm.getAllNodeUids()

        assert not document.contains(firstItemEm) , 'Expected parser to not contain item1Em via contains'
        assert firstItemEm not in document, 'Expected parser to not contain item1Em via in operator'

        assert firstItemEm.ownerDocument is None , 'Expected owner document to be unset upon removal'
    def test_setRoot(self):
        '''
            test_setRoot - Check parser.root across setRoot, reset and parseStr

                root starts falsy, is the html tag after setRoot, is falsy again after reset,
                and is an html tag again after parsing the tag's outerHTML.
        '''
        document =  AdvancedHTMLParser()
        assert not document.root, 'Root should start blank'

        htmlTag = AdvancedTag('html')
        document.setRoot(htmlTag)

        assert document.root  , 'Expected root to be set'
        assert document.root.tagName  == 'html'  , 'Expected root node to be tagName=html'

        document.reset()

        assert not document.root,  'Expected parser root to be blank after reset is called'

        # Feed the standalone tag back in as a string
        document.parseStr(htmlTag.outerHTML)
        htmlTag = document.getRoot()

        assert document.root  , 'Expected root to be set'
        assert document.root.tagName  == 'html'  , 'Expected root node to be tagName=html'
    def test_multipleRoot(self):
        '''
            test_multipleRoot - Parse the outerHTML of two standalone divs back to back and check
                that two root nodes result, both are findable by id, and their combined
                outerHTML matches the parser's getHTML (newlines removed, ends stripped).
        '''
        parser = AdvancedHTMLParser()

        root1 =  AdvancedTag('div')
        root1.setAttribute('id', 'div1')

        root2 = AdvancedTag('div')
        root2.setAttribute('id', 'div2')

        parser.parseStr(root1.outerHTML + root2.outerHTML)

        assert len(parser.getRootNodes()) == 2, 'Expected two root nodes on tree'

        foundRoot1 = parser.getElementById('div1')
        assert foundRoot1, 'Expected to find id=div1 in multi-root tree'

        foundRoot2 = parser.getElementById('div2')
        # Message names the id actually being looked up
        assert foundRoot2, 'Expected to find id=div2 in multi-root tree'

        combinedHTML = (foundRoot1.outerHTML + foundRoot2.outerHTML).replace('\n', '').strip()
        parsedHTML = parser.getHTML().replace('\n', '').strip()

        assert combinedHTML == parsedHTML, 'Expected single element outerHTMLs to match parser HTML. """\n%s\n""" != """\n%s\n"""' %(combinedHTML, parsedHTML)
# Example #24
# 0
    def test_nextSibling(self):
        '''
            test_nextSibling - Test nextSibling, nextElementSibling and nextSiblingElement

                For id="one" all three give the element with id="two". For id="two" the next
                sibling is the text "More Text" and the next element is id="three".
                For id="three" (the last child) all three are None.
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr(
            '<div>Head Text<div id="one">An item</div><div id="two">Another item</div>More Text<div id="three">Last  item</div></div>'
        )

        root = parser.getRoot()

        oneEm = root.getElementById('one')
        twoEm = root.getElementById('two')
        threeEm = root.getElementById('three')

        assert oneEm.nextSibling.id == 'two', 'Expected to get element with id "two"'
        assert oneEm.nextElementSibling.id == 'two', 'Expected to get element with id "two"'
        assert oneEm.nextSiblingElement.id == 'two', 'Expected to get element with id "two"'

        # Message matches the asserted value ("Another item" is the text inside id="two")
        assert twoEm.nextSibling == 'More Text', 'Expected to get text "More Text" after item id=two'
        assert twoEm.nextElementSibling.id == 'three', 'Expected to get element with id "three"'
        assert twoEm.nextSiblingElement.id == 'three', 'Expected to get element with id "three"'

        # Identity checks against None, so no custom __eq__ is involved
        assert threeEm.nextSibling is None, 'Expected to get no element after id="three"'
        assert threeEm.nextElementSibling is None, 'Expected to get no element after id="three"'
        assert threeEm.nextSiblingElement is None, 'Expected to get no element after id="three"'
    def test_nbsp(self):
        '''
            test_nbsp - Check handling of &nbsp; through a parse / getHTML round trip:
                a single &nbsp; is kept, none is inserted where the input had a plain space,
                and two consecutive &nbsp; are both kept.
        '''
        def roundTrip(sourceHTML):
            # Parse with a fresh parser and normalize the output for comparison
            freshParser = AdvancedHTMLParser()
            freshParser.parseStr(sourceHTML)
            return freshParser.getHTML().replace('\n', '').replace('html ', 'html')

        singleResult = roundTrip("""<html><body><p>Test&nbsp;One</p></body></html>""")
        assert '&nbsp;' in singleResult, '(Will fail in python2..) Expected to retain &nbsp; got %s' %(singleResult,)

        plainResult = roundTrip("""<html><body><p>Test One</p></body></html>""")
        assert '&nbsp;' not in plainResult, '(Will fail in python2..) Expected not to insert &nbsp; got %s' %(plainResult,)

        doubleResult = roundTrip("""<html><body><p>Test&nbsp;&nbsp;One</p></body></html>""")
        assert 'Test&nbsp;&nbsp;One' in doubleResult, '(Will fail in python2..) Expected to retain original data with two &nbsp; got %s' %(doubleResult,)
    def test_nbsp(self):
        '''
            test_nbsp - Round-trip three inputs through the parser and check &nbsp; handling:
                one &nbsp; retained, no &nbsp; introduced for a plain space, two &nbsp; retained.
        '''
        # Case 1: a single &nbsp; between words
        document = AdvancedHTMLParser()
        document.parseStr("""<html><body><p>Test&nbsp;One</p></body></html>""")

        output = document.getHTML()
        output = output.replace('\n', '').replace('html ', 'html')
        assert '&nbsp;' in output, '(Will fail in python2..) Expected to retain &nbsp; got %s' % (output, )

        # Case 2: a plain space between words
        document = AdvancedHTMLParser()
        document.parseStr("""<html><body><p>Test One</p></body></html>""")

        output = document.getHTML()
        output = output.replace('\n', '').replace('html ', 'html')
        assert '&nbsp;' not in output, '(Will fail in python2..) Expected not to insert &nbsp; got %s' % (output, )

        # Case 3: two consecutive &nbsp; between words
        document = AdvancedHTMLParser()
        document.parseStr("""<html><body><p>Test&nbsp;&nbsp;One</p></body></html>""")

        output = document.getHTML()
        output = output.replace('\n', '').replace('html ', 'html')
        assert 'Test&nbsp;&nbsp;One' in output, '(Will fail in python2..) Expected to retain original data with two &nbsp; got %s' % (output, )
    def test_removeAndContains(self):
        '''
            test_removeAndContains - Check hasChild / contains / containsUid / getAllNodeUids,
                parser containment, getElementById, parentNode and ownerDocument
                before and after calling remove() on the element with id="item1".
        '''
        doc = AdvancedHTMLParser()

        doc.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>""")

        containerEm = doc.getElementById('items')
        targetEm = doc.getElementById('item1')
        innerEm = doc.getElementById('subItem1')

        # Before removal: everything is attached
        assert containerEm.hasChild(targetEm) is True, 'Expected itemsEm to have item1Em as a child.'
        assert doc.getElementById('subItem1') is not None, 'Expected to find id=subItem1'
        assert containerEm.contains(targetEm), 'Expected itemsEm to contain items1Em'
        assert containerEm.contains(innerEm), 'Expected itemsEm to contain subItem1'
        assert innerEm.uid in containerEm.getAllNodeUids()
        assert doc.contains(targetEm), 'Expected parser to contain item1Em via contains'
        assert targetEm in doc, 'Expected parser to contain item1Em via in operator'
        assert targetEm.ownerDocument == doc, 'Expected ownerDocument to be set prior to remove'

        # Detach item1 from the tree
        targetEm.remove()

        # After removal: item1 and its sub item are no longer reachable
        assert containerEm.hasChild(targetEm) is False, 'Expected after remove for item1Em to no longer be a child of itemsEm'
        assert doc.getElementById('item1') is None, 'Expected to not be able to find id=item1 after remove'
        assert doc.getElementById('subItem1') is None, 'Expected to not be able to find sub item of id=item1, id=subItem1 after remove.'
        assert targetEm.parentNode is None, 'Expected parentNode on item1Em to be None after remove.'
        assert not containerEm.contains(targetEm), 'Expected itemsEm to not contain items1Em'
        assert not containerEm.containsUid(targetEm.uid), 'Expected itemsEm to not contain items1Em'
        assert not containerEm.contains(innerEm), 'Expected itemsEm to not contain subItem1'
        assert innerEm.uid not in containerEm.getAllNodeUids()
        assert not doc.contains(targetEm), 'Expected parser to not contain item1Em via contains'
        assert targetEm not in doc, 'Expected parser to not contain item1Em via in operator'
        assert targetEm.ownerDocument is None, 'Expected owner document to be unset upon removal'
    def test_tagOperators(self):

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
        </div>
</body></html>''')

        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'

        tagsEq = (helloTag == hello2Tag)

        assert tagsEq is False, "Expected different tags with same attributes names to not be =="

        tagsNe = (helloTag != hello2Tag)

        assert tagsNe is True, "Expected different tags with same attributes names to be !="

        sameTagEq = (helloTag == helloTag)

        assert sameTagEq is True, "Expected same tag to == itself"

        diffTagsEq = (helloTag == goodbyeTag)

        assert diffTagsEq is False, "Expected different tags with different attributes to not be =="

        diffTagsNe = (helloTag != goodbyeTag)

        assert diffTagsNe is True, "Expected different tags with different attributes to be !="

        helloTagCopy = copy.copy(helloTag)

        copyEq = (helloTag == helloTagCopy)

        assert copyEq is False, "Expected copy of tag to not == original"

        copyNe = (helloTag != helloTagCopy)

        assert copyNe is True, "Expected copy of tag to != original"

        helloTagCopyRecon = AdvancedTag(helloTag.tagName,
                                        helloTag.getAttributesList(),
                                        helloTag.isSelfClosing)

        copyEq = (helloTag == helloTagCopyRecon)

        assert copyEq is False, "Expected reconstruction of tag to not == original"

        copyNe = (helloTag != helloTagCopyRecon)

        assert copyNe is True, "Expected reconstruction of tag to != original"

        helloTagFetch2 = parser.getElementById('hello')

        fetchEq = (helloTag == helloTagFetch2)

        assert fetchEq is True, "Expected fetching the same tag is =="

        fetchNe = (helloTag != helloTagFetch2)

        assert fetchNe is False, "Expected fetching the same tag to not be !="

        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]

        childrenEq = (child1 == child2)

        assert childrenEq is False, "Expected elements with exact same attributes and values but different individual tags to not be =="

        childrenNe = (child1 != child2)

        assert childrenNe is True, "Expected elements with exact same attributes and values but different individual tags to be !="
    def test_tagOperators(self):

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
        </div>
</body></html>''')


        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'

        tagsEq = ( helloTag == hello2Tag )

        assert tagsEq is False , "Expected different tags with same attributes names to not be =="

        tagsNe = ( helloTag != hello2Tag )

        assert tagsNe is True, "Expected different tags with same attributes names to be !="

        sameTagEq = ( helloTag == helloTag )

        assert sameTagEq is True, "Expected same tag to == itself"

        diffTagsEq = (helloTag == goodbyeTag)

        assert diffTagsEq is False, "Expected different tags with different attributes to not be =="

        diffTagsNe = (helloTag != goodbyeTag)

        assert diffTagsNe is True, "Expected different tags with different attributes to be !="

        helloTagCopy = copy.copy(helloTag)

        copyEq = (helloTag == helloTagCopy)

        assert copyEq is False, "Expected copy of tag to not == original"

        copyNe = (helloTag != helloTagCopy)

        assert copyNe is True, "Expected copy of tag to != original"

        helloTagCopyRecon = AdvancedTag(helloTag.tagName, helloTag.getAttributesList(), helloTag.isSelfClosing)

        copyEq = (helloTag == helloTagCopyRecon)

        assert copyEq is False , "Expected reconstruction of tag to not == original"

        copyNe = (helloTag != helloTagCopyRecon)

        assert copyNe is True, "Expected reconstruction of tag to != original"

        helloTagFetch2 = parser.getElementById('hello')

        fetchEq = (helloTag == helloTagFetch2)

        assert fetchEq is True, "Expected fetching the same tag is =="

        fetchNe = (helloTag != helloTagFetch2)

        assert fetchNe is False, "Expected fetching the same tag to not be !="

        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]

        childrenEq = (child1 == child2)

        assert childrenEq is False, "Expected elements with exact same attributes and values but different individual tags to not be =="

        childrenNe = (child1 != child2)

        assert childrenNe is True, "Expected elements with exact same attributes and values but different individual tags to be !="
    def test_isTagEqual(self):

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classY classX" cheese="gouda">Blah</div>
        </div>
        <div id="sameAttrChildrenSpans">
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classY classX" cheese="gouda">Blah</span>
        </div>
</body></html>''')


        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'


        helloTagsEq = (helloTag.isTagEqual(hello2Tag))

        assert helloTagsEq is False, "Expected tags with same attribute names but different values (id) to not be equal."

        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]
        child3 = sameAttrChildrenEm.children[2]

        assert child1.isTagEqual(child2) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(child3) is False, "Expected tags with exact same tag name and attributes (but class name in different order) return isTagEqual as False"

        # TODO: Style should compare the same regardless of order

        sameAttrChildrenSpansEm = parser.getElementById('sameAttrChildrenSpans')

        childSpan1 = sameAttrChildrenSpansEm[0]
        childSpan2 = sameAttrChildrenSpansEm[1]
        childSpan3 = sameAttrChildrenSpansEm[2]

        assert childSpan1.isTagEqual(childSpan2) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(childSpan1) is False, "Expected tags with exact same attributes but different tag name to return isTagEqual as False"

        child1Copy = copy.copy(child1)

        assert child1.isTagEqual(child1Copy) is True, "Expected copy of tag to return isTagEqual as True"

        # Do a deep copy so we can change attributes and not affect the former
        child1Copy = copy.copy(child1)

        child1Copy.setAttribute("cheese", "none")

        assert child1.isTagEqual(child1Copy) is False, "Expected same tag name same attribute names but different value to return isTagEqual as False"
# Example #31
# 0
    def test_getAttributesList(self):
        '''
            test_getAttributesList - Test that AdvancedTag.getAttributesList returns a (name, value) pair
                                       for each attribute on the tag, and that modifying the returned
                                       data does not modify the tag itself.
        '''
        parser = AdvancedHTMLParser()

        parser.parseStr(
            '<div id="hello" style="display: none; width: 500px; padding-left: 15px;" class="One Two" data="Yes">Hello</div>'
        )

        helloEm = parser.getElementById('hello')

        assert helloEm.getAttribute('id', '') == 'hello', 'Got unexpected element'

        attributesList = helloEm.getAttributesList()

        # Names of the attributes seen (and validated) while walking the list
        foundNames = set()

        for attrName, attrValue in attributesList:
            if attrName == 'id':
                assert attrValue == 'hello', 'Attribute "id" did not have expected value "hello", got "%s"' % (
                    attrValue, )

            elif attrName == 'style':

                # Parse the style value so each property can be checked on its own
                style = StyleAttribute(attrValue)
                assert style.display == 'none', 'Got unexpected value for display in style copy. Expected "none", got "%s"' % (
                    style.display, )
                assert style.width == '500px', 'Got unexpected value for width in style copy. Expected "500px", got "%s"' % (
                    style.width, )
                assert style.paddingLeft == '15px', 'Got unexpected value for padding-left. Expected "15px", got "%s"' % (
                    style.paddingLeft, )

            elif attrName == 'class':

                assert attrValue == 'One Two', 'Expected class name to equal "One Two", got: %s' % (
                    attrValue, )

            elif attrName == 'data':

                assert attrValue == 'Yes', 'Expected attribute "data" to have the value "Yes", got: %s' % (
                    attrValue, )

            else:
                raise AssertionError(
                    'Got unexpected attribute in copy: (%s, %s)' %
                    (attrName, attrValue))

            foundNames.add(attrName)

        # Every attribute present in the parsed HTML must have shown up in the list
        for expectedName in ('id', 'style', 'class', 'data'):
            assert expectedName in foundNames, 'Did not find %s element in attribute list' % (
                expectedName, )

        # Test that we have a COPY, not the originals

        for item in attributesList:
            if item[0] == 'style':
                # Just incase in the future we want to include a StyleAttribute instead of the str
                if not isinstance(item[1], StyleAttribute):
                    style = StyleAttribute(item[1])
                else:
                    style = item[1]
                style.paddingTop = '10px'

        # These should not be modified in the original element
        assert 'padding-top' not in str(helloEm.style), 'Expected change to style from getAttributesList to not modify the original element. Got style: %s' % (
            str(helloEm.style), )
    def test_parsing(self):
        '''
            test_parsing - Test that the parser properly handles several cases of class attribute,
                             and that they are mutable in expected ways thereafter.
        '''

        someHtml = '''<html><body>
        <div class="one two three" id="firstDiv">Some text</div>
        <div id="secondDiv">This one is empty</div>
        <div class="three ZZ AA" id="thirdDiv">Last one</div>
        <div class="" id="emptyClassDiv">Empty</div>
</body></html>'''

        document = AdvancedHTMLParser()
        document.parseStr(someHtml)

        firstDiv = document.getElementById('firstDiv')
        secondDiv = document.getElementById('secondDiv')
        thirdDiv = document.getElementById('thirdDiv')
        emptyClassDiv = document.getElementById('emptyClassDiv')

        assert firstDiv, 'Failed to get element by id="firstDiv"'
        assert secondDiv, 'Failed to get element by id="secondDiv"'
        assert thirdDiv, 'Failed to get element by id="thirdDiv"'
        assert emptyClassDiv, 'Failed to get element by id="emptyClassDiv"'

        firstDivHTML = firstDiv.getHTML()
        secondDivHTML = secondDiv.getHTML()
        thirdDivHTML = thirdDiv.getHTML()
        emptyClassDivHTML = emptyClassDiv.getHTML()

        assert 'class="one two three"' in firstDivHTML, 'Expected string of class to show up in parsed html. Got: ' + firstDivHTML
        assert 'class=' not in secondDivHTML, 'Expected class attribute to not be present when no class set. Got: ' + secondDivHTML
        assert 'class="three ZZ AA"' in thirdDivHTML, 'Expected string of class to show up in parsed html. Got: ' + thirdDivHTML
        assert 'class=' not in emptyClassDivHTML, 'Expected class attribute to not be present when class set to empty in parsed html, i.e. class="". Got: ' + emptyClassDivHTML

        assert firstDiv.className == "one two three", "Expected parsed className to match 'one two three' Got: " + repr(
            firstDiv.className)
        assert secondDiv.className == "", "Expected parsed lack of className to match empty string, \"\" Got: " + repr(
            secondDiv.className)
        assert thirdDiv.className == "three ZZ AA", "Expected parsed className to match 'three ZZ AA' Got: " + repr(
            thirdDiv.className)

        assert emptyClassDiv.className == "", "Expected parse empty className to remain empty string. Got: " + repr(
            emptyClassDiv.className)

        assert firstDiv.classList == ["one", "two", "three"], 'wrong classList'
        assert secondDiv.classList == [], "wrong classList"
        assert thirdDiv.classList == ["three", "ZZ", "AA"], "wrong classList"
        assert emptyClassDiv.classList == [], "Wrong classList"

        # Check that we can modify and it shows up
        firstDiv.setAttribute('class', 'cheese is good')

        firstDivHTML = firstDiv.getHTML()

        assert 'class="cheese is good"' in firstDivHTML, "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected in tag attribute. Got: " + firstDivHTML

        assert firstDiv.className == "cheese is good", "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + firstDiv.className

        assert firstDiv.classList == [
            "cheese", "is", "good"
        ], "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + repr(
            firstDiv.classList)
    def getItemsParser(self):
        '''
            getItemsParser - Create a parser loaded with a fixed snippet: a div id="outer" holding a
                               div id="items", which holds two divs with name="item" (item1 and item2).

                @return <AdvancedHTMLParser> - A new parser which has parsed the snippet
        '''
        itemsHtml = """<div id='outer'> <div id='items'> <div name="item" id="item1" >item1</div> <div name="item" id="item2" >item2</div> </div> </div>"""

        itemsParser = AdvancedHTMLParser()
        itemsParser.parseStr(itemsHtml)

        return itemsParser
    def test_getElementsByClassName(self):
        '''
            test_getElementsByClassName - Test the getElementsByClassName method
        '''

        html = '''<html><head><title>Page</title></head>
<body class="background">
  <div id="outer" class="outer">
   <div class="inner special">Hello</div>
   <div class="inner cheese">
     <div class="blah" id="blahdiv1">One</div>
       <span>
         <div class="blah" id="blahdiv2" >
         </div>
       </span>
     </div>
   </div>
  </div>
</body>
</html>
        '''
        document = AdvancedHTMLParser()
        document.parseStr(html)

        tags = document.getElementsByClassName('background')
        assert len(tags) == 1 and tags[
            0].tagName == 'body', 'Expected to get body tag for getElementsByClassName("background")'

        tags = document.getElementsByClassName("inner")
        assert len(tags) == 2 and tags[0].tagName == 'div' and tags[
            1].tagName == 'div', 'Expected to find 2 div tags with class="inner"'

        assert "inner" in tags[0].classNames and "inner" in tags[
            1].classNames, 'Expected to find "inner" in the classNames list'

        assert issubclass(
            tags[0].classNames.__class__,
            (list, tuple)), 'Expected .classNames to be a list of class names'

        assert tags[0].className.startswith(
            "inner") and tags[1].className.startswith(
                "inner"
            ), 'Expected to find "inner" at start of className string'

        specialDiv = None
        cheeseDiv = None
        for tag in tags:
            if "cheese" in tag.classNames:
                cheeseDiv = tag
            elif "special" in tag.classNames:
                specialDiv = tag

        assert specialDiv, 'Failed to find div with "special" in className'
        assert cheeseDiv, 'Failed to find div with "cheese" in className'

        assert 'Hello' in specialDiv.innerHTML, 'Expected "Hello" to be inside special div'

        assert specialDiv.getElementsByClassName(
            'bogus'
        ) == [], 'Expected to get no results for specialDiv.getElementsByClassName("bogus")'

        blahDivsDocument = document.getElementsByClassName("blah")
        blahDivsCheese = cheeseDiv.getElementsByClassName("blah")

        assert len(
            blahDivsDocument
        ) == 2, 'Expected to get 2 class="blah" divs from document, but got ' + str(
            len(blahDivsDocument))

        assert len(
            blahDivsCheese
        ) == 2, 'Expected to get 2 class="blah" divs from cheeseDiv, but got ' + str(
            len(blahDivsCheese))

        blahDiv1 = None
        blahDiv2 = None

        for blahDiv in blahDivsDocument:
            if blahDiv.id == 'blahdiv1':
                blahDiv1 = blahDiv
            elif blahDiv.id == 'blahdiv2':
                blahDiv2 = blahDiv

        assert blahDiv1, 'Failed to find id="blahdiv1" on one of the class="blah" divs'
        assert blahDiv2, 'Failed to find id="blahdiv2" on one of the class="blah" divs'

        assert blahDiv1 in blahDivsCheese, 'Expected id="blahdiv1" div to also be in results from root=cheese div'
        assert blahDiv2 in blahDivsCheese, 'Expected id="blahdiv2" div to also be in results from root=cheese div'
    def getItemsParser(self):
        '''
            getItemsParser - Build the parser used as a fixture: it parses a fixed snippet made of
                               an "outer" div, an "items" div inside it, and two name="item" divs.

                @return <AdvancedHTMLParser> - The parser after parsing the snippet
        '''
        fixtureParser = AdvancedHTMLParser()

        fixtureHtml = """<div id='outer'> <div id='items'> <div name="item" id="item1" >item1</div> <div name="item" id="item2" >item2</div> </div> </div>"""
        fixtureParser.parseStr(fixtureHtml)

        return fixtureParser
# Example #36
# 0
    def test_valueMethod(self):
        '''
            test_valueMethod - Test that the "value" attribute given in the HTML of an
                                 input tag can be read through the tag's .value
        '''
        inputHtml = '<input id="item" type="text" value="hello" />'

        document = AdvancedHTMLParser()
        document.parseStr(inputHtml)

        inputEm = document.getElementById('item')
        assert inputEm.value == 'hello'
    def test_isTagEqual(self):

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classY classX" cheese="gouda">Blah</div>
        </div>
        <div id="sameAttrChildrenSpans">
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classY classX" cheese="gouda">Blah</span>
        </div>
</body></html>''')

        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'

        helloTagsEq = (helloTag.isTagEqual(hello2Tag))

        assert helloTagsEq is False, "Expected tags with same attribute names but different values (id) to not be equal."

        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]
        child3 = sameAttrChildrenEm.children[2]

        assert child1.isTagEqual(
            child2
        ) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(
            child3
        ) is False, "Expected tags with exact same tag name and attributes (but class name in different order) return isTagEqual as False"

        # TODO: Style should compare the same regardless of order

        sameAttrChildrenSpansEm = parser.getElementById(
            'sameAttrChildrenSpans')

        childSpan1 = sameAttrChildrenSpansEm[0]
        childSpan2 = sameAttrChildrenSpansEm[1]
        childSpan3 = sameAttrChildrenSpansEm[2]

        assert childSpan1.isTagEqual(
            childSpan2
        ) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(
            childSpan1
        ) is False, "Expected tags with exact same attributes but different tag name to return isTagEqual as False"

        child1Copy = copy.copy(child1)

        assert child1.isTagEqual(
            child1Copy
        ) is True, "Expected copy of tag to return isTagEqual as True"

        # Do a deep copy so we can change attributes and not affect the former
        child1Copy = copy.copy(child1)

        child1Copy.setAttribute("cheese", "none")

        assert child1.isTagEqual(
            child1Copy
        ) is False, "Expected same tag name same attribute names but different value to return isTagEqual as False"
# Example #38
# 0
    def test_firstLastChild(self):
        '''
            test_firstLastChild - Test the first/last child properties on AdvancedTag:

                AdvancedTag.firstChild and AdvancedTag.firstElementChild
                AdvancedTag.lastChild and AdvancedTag.lastElementChild

              Checked on a div which starts and ends with text, and on an empty div.
        '''
        mainHtml = '<div id="main">Hello<div id="two">Blah</div><div id="emptyDiv"></div><div id="three">Three</div>End Text</div>'

        document = AdvancedHTMLParser()
        document.parseStr(mainHtml)

        mainEm = document.getElementById('main')

        assert mainEm, "Failed to get element by id='main'"

        assert mainEm.id == 'main', 'Got wrong element for id="main"'

        # First child of any kind: here, the leading text
        leadingBlock = mainEm.firstChild

        assert leadingBlock == 'Hello', 'Expected .firstChild to return the first block child, str("Hello") but got: %s(%s)' % (
            leadingBlock.__class__.__name__, repr(leadingBlock))

        # First child that is a tag
        leadingEm = mainEm.firstElementChild

        assert isinstance(leadingEm, AdvancedTag), 'Expected firstElementChild to return an AdvancedTag object. Got: ' + leadingEm.__class__.__name__

        assert leadingEm.tagName == 'div' and leadingEm.id == 'two', 'Expected to get div id="two" as firstElementChild. Got: %s(%s)' % (
            leadingEm.__class__.__name__, repr(leadingEm))

        # Last child of any kind: here, the trailing text
        trailingBlock = mainEm.lastChild

        assert trailingBlock == "End Text", 'Expected .lastChild to return the last block child, str("End Text") but got: %s(%s)' % (
            trailingBlock.__class__.__name__, repr(trailingBlock))

        # Last child that is a tag
        trailingEm = mainEm.lastElementChild

        assert isinstance(trailingEm, AdvancedTag), 'Expected lastElementChild to return an AdvancedTag object. Got: ' + trailingEm.__class__.__name__

        assert trailingEm.tagName == 'div' and trailingEm.id == 'three', 'Expected to get div id="three" as lastElementChild. Got: %s(%s)' % (
            trailingEm.__class__.__name__, repr(trailingEm))

        emptyDivEm = document.getElementById('emptyDiv')

        assert emptyDivEm, 'Failed to get element by id="emptyDiv"'
        assert emptyDivEm.id == 'emptyDiv', 'Got wrong element for id="emptyDiv"'

        # On a div without any children, all four properties are expected to be None
        for propertyName in ('firstChild', 'firstElementChild', 'lastChild', 'lastElementChild'):
            propertyValue = getattr(emptyDivEm, propertyName)

            assert propertyValue is None, ('Expected empty div .%s to be None (null). Got: ' % (propertyName, )) + repr(
                propertyValue)
# Example #39
# 0
    def test_domAttributes(self):

        parser = AdvancedHTMLParser()

        parser.parseStr(''''<html>
        <body>
            <div id="someDiv" class="one two" align="left">
                <span>Some Child</span>
            </div>

        </body>
    </html>
        ''')

        someDivEm = parser.getElementById('someDiv')

        assert someDivEm, 'Failed to get element by id="someDiv"'

        attributes = someDivEm.attributesDOM

        assert attributes[
            'id'].value == 'someDiv', 'Expected attributes["id"].value to be equal to "someDiv"'

        assert attributes[
            'class'].value == 'one two', "Expected attributes['class'].value to be equal to 'one two'"
        assert attributes[
            'align'].value == 'left', "Expected attributes['align'].value to be equal to 'left'"

        assert attributes[
            'notset'] is None, 'Expected attributes["notset"] to be None'

        assert attributes[
            'id'].ownerElement == someDivEm, 'Expected ownerElement to be "someDivEm"'

        assert attributes[
            'id'].ownerDocument == parser, 'Expected ownerDocument to be parser'

        assert str(
            attributes['id']
        ) == 'id="someDiv"', 'Expected str of attribute to be \'id="someDiv"\' but got: %s' % (
            str(attributes['id']), )

        attributes['align'].value = 'right'

        assert attributes[
            'align'].value == 'right', 'Expected to be able to change attribute value by assigning .value. Failed on "align".'

        assert someDivEm.getAttribute(
            'align'
        ) == 'right', 'Expected that changing a property in the attributes map would change the value in parent element'

        attrNames = []
        for attrName in attributes:
            attrNames.append(attrName)

        assert 'id' in attrNames, 'Expected "id" to be returned from iter on attributes'
        assert 'class' in attrNames, 'Expected "class" to be returned from iter on attributes'
        assert 'align' in attrNames, 'Expected "align" to be returned from iter on attributes'

        clonedAttributes = {
            attrName: attributes[attrName].cloneNode()
            for attrName in attrNames
        }

        for attrName in ('id', 'class', 'align'):
            attrValue = clonedAttributes[attrName].value
            origValue = attributes[attrName].value

            assert attrValue == origValue, 'Expected cloned attribute %s to match original, but did not. (clone) %s != %s (orig)' % (
                attrName, attrValue, origValue)

        assert clonedAttributes[
            'id'].ownerElement is None, 'Expected clone to clear ownerElement'
        assert clonedAttributes[
            'id'].ownerDocument == parser, 'Expected clone to retain same ownerDocument'

        clonedAttributes['align'].value = 'middle'

        assert clonedAttributes[
            'align'].value == 'middle', 'Expected to be able to change value on cloned attribute'
        assert attributes[
            'align'].value == 'right', 'Expected change on clone to not affect original'

        assert someDivEm.getAttribute(
            'align'
        ) == 'right', 'Expected change on clone to not affect element'

        assert attributes.getNamedItem('id') == attributes[
            'id'], 'Expected getNamedItem("id") to be the same as attributes["id"]'