Esempio n. 1
0
    def test_formAttribute(self):
        '''
            test_formAttribute - Exercise the "form" attribute of an input element.

                An input nested within a form must hand back that very form object,
                while an input sitting outside of any form must hand back None.
        '''

        parser = AdvancedHTMLParser()
        parser.parseStr(
            '''<html><head></head><body><div id="main"> <form id="myForm"> <div> <input type="text" id="inputWithinForm" /> </div> </form> </div> <input type="text" id="inputOutsideForm" /> </body></html>'''
        )

        # -- Input nested inside the form --
        formEm = parser.getElementById('myForm')
        assert formEm, 'Failed to get element by id="myForm"'

        innerInputEm = parser.getElementById('inputWithinForm')
        assert innerInputEm, 'Failed to get element with id="inputWithinForm"'

        innerForm = innerInputEm.form
        assert innerForm, 'Expected inputWithinFormEm.form to return parent form. Got nada.'
        assert innerForm is formEm, (
            'Expected to get parent form via .form, got: ' + str(innerForm.getStartTag())
        )

        # -- Input outside of any form --
        outerInputEm = parser.getElementById('inputOutsideForm')
        assert outerInputEm, 'Failed to get element with id="inputOutsideForm"'

        outerForm = outerInputEm.form
        # The message is only built when the assertion fails, i.e. when outerForm is not None.
        assert outerForm is None, (
            'Expected .form to return None on an input outside of form. Got: ' + str(outerForm.getStartTag())
        )
Esempio n. 2
0
    def test_appending(self):
        '''
            test_appending - Test adding new children to an existing element.

                Covers appendNode (the new node becomes the last child) and
                insertAfter (the new node lands directly after the given sibling),
                and checks that lookups through the parser see the added nodes.
        '''
        parser = AdvancedHTMLParser()

        parser.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1</div> <div name="item" id="item2" >item2</div> </div> </div>""")

        itemsEm = parser.getElementById('items')
        # Message names the element actually being looked up (id="items")
        assert itemsEm , 'Expected to get <div id="items">'

        assert len(itemsEm.children) == 2 , 'Expected two children'

        newItem =  AdvancedTag('div')
        newItem.setAttributes( {
            'name' : 'item',
            'id' : 'item3' }
        )

        itemsEm.appendNode(newItem)

        assert parser.getElementById('item3') , 'Expected to get item3 after append'
        assert len(parser.getElementsByName('item')) == 3, 'Expected after append that 3 nodes are  set'
        assert itemsEm.children[2].getAttribute('id') == 'item3' , 'Expected item3 to be the third child'

        newItem =  AdvancedTag('div')
        newItem.setAttributes( {
            'name' : 'item',
            'id' : 'item2point5' }
        )

        # Insert between item2 (index 1) and item3
        itemsEm.insertAfter(newItem, itemsEm.children[1])
        childIds = [x.id for x in itemsEm.getElementsByName('item')]

        assert childIds == ['item1', 'item2', 'item2point5', 'item3'] , 'Expected items to be ordered. Got: %s' %(str(childIds,))
    def test_multipleRoot(self):
        '''
            test_multipleRoot - Test parsing a document that has two sibling elements at the root level.

                Both roots must be reported by getRootNodes, both must be findable by id,
                and the parser's HTML must equal the two outerHTMLs joined (newlines ignored).
        '''
        parser = AdvancedHTMLParser()

        root1 = AdvancedTag('div')
        root1.setAttribute('id', 'div1')

        root2 = AdvancedTag('div')
        root2.setAttribute('id', 'div2')

        parser.parseStr(root1.outerHTML + root2.outerHTML)

        assert len(
            parser.getRootNodes()) == 2, 'Expected two root nodes on tree'

        foundRoot1 = parser.getElementById('div1')
        assert foundRoot1, 'Expected to find id=div1 in multi-root tree'

        foundRoot2 = parser.getElementById('div2')
        # Message reports the id that was actually searched for (div2)
        assert foundRoot2, 'Expected to find id=div2 in multi-root tree'

        # Compare with newlines removed and surrounding whitespace stripped
        combinedHTML = (foundRoot1.outerHTML + foundRoot2.outerHTML).replace(
            '\n', '').strip()
        parsedHTML = parser.getHTML().replace('\n', '').strip()

        assert combinedHTML == parsedHTML, 'Expected single element outerHTMLs to match parser HTML. """\n%s\n""" != """\n%s\n"""' % (
            combinedHTML, parsedHTML)
    def test_appending(self):
        '''
            test_appending - Test adding new children to an existing element.

                Covers appendNode (the new node becomes the last child) and
                insertAfter (the new node lands directly after the given sibling),
                and checks that lookups through the parser see the added nodes.
        '''
        parser = AdvancedHTMLParser()

        parser.parseStr(
            """<div id='outer'> <div id='items'> <div name="item" id="item1" >item1</div> <div name="item" id="item2" >item2</div> </div> </div>"""
        )

        itemsEm = parser.getElementById('items')
        # Message names the element actually being looked up (id="items")
        assert itemsEm, 'Expected to get <div id="items">'

        assert len(itemsEm.children) == 2, 'Expected two children'

        assert itemsEm.childElementCount == 2, 'Expected childElementCount to equal 2'

        newItem = AdvancedTag('div')
        newItem.setAttributes({'name': 'item', 'id': 'item3'})

        itemsEm.appendNode(newItem)

        assert parser.getElementById(
            'item3'), 'Expected to get item3 after append'
        assert len(parser.getElementsByName(
            'item')) == 3, 'Expected after append that 3 nodes are  set'
        assert itemsEm.children[2].getAttribute(
            'id') == 'item3', 'Expected item3 to be the third child'

        newItem = AdvancedTag('div')
        newItem.setAttributes({'name': 'item', 'id': 'item2point5'})

        # Insert between item2 (index 1) and item3
        itemsEm.insertAfter(newItem, itemsEm.children[1])
        childIds = [x.id for x in itemsEm.getElementsByName('item')]

        assert childIds == [
            'item1', 'item2', 'item2point5', 'item3'
        ], 'Expected items to be ordered. Got: %s' % (str(childIds, ))
    def test_parsing(self):
        '''
            test_parsing - Test that the parser properly handles several cases of class attribute,
                             and that they are mutable in expected ways thereafter.
        '''

        someHtml = '''<html><body>
        <div class="one two three" id="firstDiv">Some text</div>
        <div id="secondDiv">This one is empty</div>
        <div class="three ZZ AA" id="thirdDiv">Last one</div>
        <div class="" id="emptyClassDiv">Empty</div>
</body></html>'''

        document = AdvancedHTMLParser()
        document.parseStr(someHtml)

        firstDiv = document.getElementById('firstDiv')
        secondDiv = document.getElementById('secondDiv')
        thirdDiv = document.getElementById('thirdDiv')
        emptyClassDiv = document.getElementById('emptyClassDiv')


        assert firstDiv , 'Failed to get element by id="firstDiv"'
        assert secondDiv , 'Failed to get element by id="secondDiv"'
        assert thirdDiv , 'Failed to get element by id="thirdDiv"'
        assert emptyClassDiv , 'Failed to get element by id="emptyClassDiv"'

        firstDivHTML = firstDiv.getHTML()
        secondDivHTML = secondDiv.getHTML()
        thirdDivHTML = thirdDiv.getHTML()
        emptyClassDivHTML = emptyClassDiv.getHTML()

        assert 'class="one two three"' in firstDivHTML , 'Expected string of class to show up in parsed html. Got: ' + firstDivHTML
        assert 'class=' not in secondDivHTML , 'Expected class attribute to not be present when no class set. Got: ' + secondDivHTML
        assert 'class="three ZZ AA"' in thirdDivHTML , 'Expected string of class to show up in parsed html. Got: ' + thirdDivHTML
        assert 'class=' not in emptyClassDivHTML , 'Expected class attribute to not be present when class set to empty in parsed html, i.e. class="". Got: ' + emptyClassDivHTML


        assert firstDiv.className == "one two three" , "Expected parsed className to match 'one two three' Got: " + repr(firstDiv.className)
        assert secondDiv.className == "" , "Expected parsed lack of className to match empty string, \"\" Got: " + repr(secondDiv.className)
        assert thirdDiv.className == "three ZZ AA" , "Expected parsed className to match 'three ZZ AA' Got: " + repr(thirdDiv.className)

        assert emptyClassDiv.className == "" , "Expected parse empty className to remain empty string. Got: " + repr(emptyClassDiv.className)

        assert firstDiv.classList == ["one", "two", "three"] , 'wrong classList'
        assert secondDiv.classList == [] , "wrong classList"
        assert thirdDiv.classList == ["three", "ZZ", "AA"] , "wrong classList"
        assert emptyClassDiv.classList == [] , "Wrong classList"

        # Check that we can modify and it shows up
        firstDiv.setAttribute('class', 'cheese is good')

        firstDivHTML = firstDiv.getHTML()

        assert 'class="cheese is good"' in firstDivHTML , "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected in tag attribute. Got: " + firstDivHTML

        assert firstDiv.className == "cheese is good" , "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + firstDiv.className

        assert firstDiv.classList == ["cheese", "is", "good"] , "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + repr(firstDiv.classList)
    def test_firstLastChild(self):
        '''
            test_firstLastChild - Check the first/last child accessors on AdvancedTag:

                firstChild / lastChild               - compared here against the leading and trailing text
                firstElementChild / lastElementChild - compared here against the first and last nested div

              On a div with no content, all four are expected to be None.
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr('<div id="main">Hello<div id="two">Blah</div><div id="emptyDiv"></div><div id="three">Three</div>End Text</div>')

        containerEm = parser.getElementById('main')
        assert containerEm , "Failed to get element by id='main'"
        assert containerEm.id == 'main' , 'Got wrong element for id="main"'

        # -- Leading side --
        leadingBlock = containerEm.firstChild
        assert leadingBlock == 'Hello' , 'Expected .firstChild to return the first block child, str("Hello") but got: %s(%s)' %( leadingBlock.__class__.__name__, repr(leadingBlock))

        leadingEm = containerEm.firstElementChild
        assert issubclass(leadingEm.__class__, AdvancedTag) , 'Expected firstElementChild to return an AdvancedTag object. Got: ' + leadingEm.__class__.__name__
        assert leadingEm.tagName == 'div' and leadingEm.id == 'two' , 'Expected to get div id="two" as firstElementChild. Got: %s(%s)' %( leadingEm.__class__.__name__, repr(leadingEm))

        # -- Trailing side --
        trailingBlock = containerEm.lastChild
        assert trailingBlock == "End Text" , 'Expected .lastChild to return the last block child, str("End Text") but got: %s(%s)' %( trailingBlock.__class__.__name__, repr(trailingBlock))

        trailingEm = containerEm.lastElementChild
        assert issubclass(trailingEm.__class__, AdvancedTag) , 'Expected lastElementChild to return an AdvancedTag object. Got: ' + trailingEm.__class__.__name__
        assert trailingEm.tagName == 'div' and trailingEm.id == 'three' , 'Expected to get div id="three" as lastElementChild. Got: %s(%s)' %( trailingEm.__class__.__name__, repr(trailingEm))

        # -- Element without any content --
        hollowEm = parser.getElementById('emptyDiv')
        assert hollowEm , 'Failed to get element by id="emptyDiv"'
        assert hollowEm.id == 'emptyDiv' , 'Got wrong element for id="emptyDiv"'

        hollowFirst = hollowEm.firstChild
        assert hollowFirst is None , 'Expected empty div .firstChild to be None (null). Got: ' + repr(hollowFirst)

        hollowFirstEm = hollowEm.firstElementChild
        assert hollowFirstEm is None , 'Expected empty div .firstElementChild to be None (null). Got: ' + repr(hollowFirstEm)

        hollowLast = hollowEm.lastChild
        assert hollowLast is None , 'Expected empty div .lastChild to be None (null). Got: ' + repr(hollowLast)

        hollowLastEm = hollowEm.lastElementChild
        assert hollowLastEm is None , 'Expected empty div .lastElementChild to be None (null). Got: ' + repr(hollowLastEm)
Esempio n. 7
0
    def test_getAttributesDict(self):
        '''
            test_getAttributesDict - Check that getAttributesDict returns every attribute of the tag
                                       with the expected values, and that changing the returned data
                                       leaves the tag itself unchanged.
        '''
        document = AdvancedHTMLParser()
        document.parseStr(
            '<div id="hello" style="display: none; width: 500px; padding-left: 15px;" class="One Two" data="Yes">Hello</div>'
        )

        tagEm = document.getElementById('hello')
        assert tagEm.getAttribute('id', '') == 'hello', 'Got unxpected element'

        attrs = tagEm.getAttributesDict()

        # All four attributes present, and nothing more
        assert 'id' in attrs, 'Did not find "id" in the attributes dict copy'
        assert 'style' in attrs, 'Did not find "style" in the attributes dict copy'
        assert 'class' in attrs, 'Did not find "class" in the attributes dict copy'
        assert 'data' in attrs, 'Did not find "data" in the attributes dict copy'

        keyCount = len(attrs.keys())
        assert keyCount == 4, 'Got unexpected keys in attributesDict. Only expected "id" "style" "class" and "data", got: "%s"' % (
            repr(attrs), )

        # Values match what was parsed
        assert attrs['id'] == 'hello', 'Attribute "id" did not have expected value "hello", got "%s"' % (
            attrs['id'], )

        styleCopy = StyleAttribute(attrs['style'])
        assert styleCopy.display == 'none', 'Got unexpected value for display in style copy. Expected "none", got "%s"' % (
            styleCopy.display, )
        assert styleCopy.width == '500px', 'Got unexpected value for width in style copy. Expected "500px", got "%s"' % (
            styleCopy.width, )
        assert styleCopy.paddingLeft == '15px', 'Got unexpected value for padding-left. Expected "15px", got "%s"' % (
            styleCopy.paddingLeft, )

        assert attrs['class'] == 'One Two', 'Got unexpected value for "class" in dict copy. Expected "One Two", Got: "%s"' % (
            attrs['class'], )

        assert attrs['data'] == 'Yes', 'Got unexpected value for "data" in dict copy, Expected "Yes", Got: "%s"' % (
            attrs['data'], )

        # Changing the copies must not reach back into the tag
        styleCopy.paddingTop = '13em'
        assert tagEm.style.paddingTop != '13em', 'Expected getAttributesDict to return copies, but modified original element on "style"'

        attrs['class'] += ' Three'
        assert 'Three' not in tagEm.getAttribute('class'), 'Expected getAttributesDict to return copies, but modified original element on "class"'

        attrs['id'] = 'zzz'
        assert tagEm.getAttribute('id') != 'zzz', 'Expected getAttributesDict to return copies, but modified original element on "id"'
    def test_ownerDocument(self):
        '''
            test_ownerDocument - Check ownerDocument bookkeeping:

                set on every node of a parsed tree, absent on a clone,
                and cleared on a subtree once it is removed from its parent.
        '''
        document = AdvancedHTMLParser()
        document.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>""")

        rootEm = document.getElementById('outer')
        assert rootEm.ownerDocument == document , 'Expected the ownerDocument to be set to parser'

        for node in rootEm.getAllNodes():
            assert node.ownerDocument == document, 'Expected ownerDocument to be set on every element. Was not set on: %s' %(node.getStartTag(),)

        # A clone is detached: no parent, no document, no children
        detachedCopy = rootEm.cloneNode()
        assert detachedCopy.parentNode is None , 'Expected cloned child to have no parent'
        assert detachedCopy.ownerDocument is None , 'Expected cloned child to have no owner document'
        assert len(detachedCopy.children) == 0 , 'Expected cloned element to have no children'

        # Removing the first child hands it back with the document link dropped
        firstChildEm = rootEm.children[0]
        removedEm = rootEm.removeChild(firstChildEm)

        assert removedEm , 'Expected removeChild to return removed element'
        assert removedEm.id == 'items' , 'Got wrong element, expected to remove "items", got: %s' %(removedEm.getStartTag(),)
        assert removedEm.ownerDocument is None , 'Expected owner document to be set to None after element was removed.'

        for descendant in removedEm.getAllChildNodes():
            assert descendant.ownerDocument is None, 'Expected owner document to be cleared on all children after removal from document'
Esempio n. 9
0
    def test_attributeDefault(self):
        '''
            test_attributeDefault - getAttribute with a default argument: the real value is returned
                                      when the attribute exists, the default when it does not.
        '''
        document = AdvancedHTMLParser()
        document.parseStr('<input id="item" type="text" value="hello" />')

        inputEm = document.getElementById('item')

        presentValue = inputEm.getAttribute('type', 'bloogity')
        assert presentValue == 'text'

        missingValue = inputEm.getAttribute('woogity', 'snoogity')
        assert missingValue == 'snoogity'
    def test_ownerDocument(self):
        '''
            test_ownerDocument - Verify how ownerDocument is maintained:

                every node of a parsed tree points at the parser,
                a clone points at nothing (and has no parent or children),
                a removed subtree has the link cleared on itself and its child nodes.
        '''
        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(
            """<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>"""
        )

        topEm = htmlParser.getElementById('outer')
        assert topEm.ownerDocument == htmlParser, 'Expected the ownerDocument to be set to parser'

        for treeNode in topEm.getAllNodes():
            assert treeNode.ownerDocument == htmlParser, 'Expected ownerDocument to be set on every element. Was not set on: %s' % (
                treeNode.getStartTag(), )

        # Clone: expected to come back fully detached
        cloneEm = topEm.cloneNode()
        assert cloneEm.parentNode is None, 'Expected cloned child to have no parent'
        assert cloneEm.ownerDocument is None, 'Expected cloned child to have no owner document'
        assert len(cloneEm.children) == 0, 'Expected cloned element to have no children'

        # Removal: take out the first child of the outer div
        detachedEm = topEm.removeChild(topEm.children[0])
        assert detachedEm, 'Expected removeChild to return removed element'
        assert detachedEm.id == 'items', 'Got wrong element, expected to remove "items", got: %s' % (
            detachedEm.getStartTag(), )
        assert detachedEm.ownerDocument is None, 'Expected owner document to be set to None after element was removed.'

        for childNode in detachedEm.getAllChildNodes():
            assert childNode.ownerDocument is None, 'Expected owner document to be cleared on all children after removal from document'
Esempio n. 11
0
    def test_noValueAttributes(self):
        '''
            test_noValueAttributes - An attribute written without a value ("checked") must be
                                       reported by hasAttribute and appear in the tag's outerHTML.
        '''
        document = AdvancedHTMLParser()
        document.parseStr('<input id="thebox" type="checkbox" checked />')

        checkboxEm = document.getElementById('thebox')

        assert checkboxEm.hasAttribute('checked')

        renderedHTML = checkboxEm.outerHTML
        assert 'checked' in renderedHTML
Esempio n. 12
0
    def test_removeAndContains(self):
        '''
            test_removeAndContains - Check containment queries before and after removing an element.

                Before remove(): hasChild / contains / getAllNodeUids / "in parser" all report the element
                (and its nested span). After remove(): none of them do, lookups by id return None,
                and parentNode / ownerDocument on the removed element are None.
        '''
        document = AdvancedHTMLParser()
        document.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>""")

        containerEm = document.getElementById('items')
        firstItemEm = document.getElementById('item1')
        nestedSpanEm = document.getElementById('subItem1')

        # ---- State before removal ----
        assert containerEm.hasChild(firstItemEm) is True, 'Expected itemsEm to have item1Em as a child.'
        assert document.getElementById('subItem1') is not None, 'Expected to find id=subItem1'

        assert containerEm.contains(firstItemEm) , 'Expected itemsEm to contain items1Em'
        assert containerEm.contains(nestedSpanEm) , 'Expected itemsEm to contain subItem1'
        assert nestedSpanEm.uid in containerEm.getAllNodeUids()

        assert document.contains(firstItemEm) , 'Expected parser to contain item1Em via contains'
        assert firstItemEm in document, 'Expected parser to contain item1Em via in operator'
        assert firstItemEm.ownerDocument == document , 'Expected ownerDocument to be set prior to remove'

        # ---- Detach item1 from the tree ----
        firstItemEm.remove()

        # ---- State after removal ----
        assert containerEm.hasChild(firstItemEm) is False, 'Expected after remove for item1Em to no longer be a child of itemsEm'
        assert document.getElementById('item1') is None, 'Expected to not be able to find id=item1 after remove'
        assert document.getElementById('subItem1') is None, 'Expected to not be able to find sub item of id=item1, id=subItem1 after remove.'
        assert firstItemEm.parentNode is None , 'Expected parentNode on item1Em to be None after remove.'

        assert not containerEm.contains(firstItemEm) , 'Expected itemsEm to not contain items1Em'
        assert not containerEm.containsUid(firstItemEm.uid) , 'Expected itemsEm to not contain items1Em'
        assert not containerEm.contains(nestedSpanEm) , 'Expected itemsEm to not contain subItem1'
        assert nestedSpanEm.uid not in containerEm.getAllNodeUids()

        assert not document.contains(firstItemEm) , 'Expected parser to not contain item1Em via contains'
        assert firstItemEm not in document, 'Expected parser to not contain item1Em via in operator'
        assert firstItemEm.ownerDocument is None , 'Expected owner document to be unset upon removal'
Esempio n. 13
0
    def test_removeAndContains(self):
        '''
            test_removeAndContains - Removing an element must be reflected by every containment query.

                The same set of checks (hasChild, contains, containsUid, getAllNodeUids,
                parser.contains, the "in" operator, getElementById, ownerDocument) is made
                around a call to remove() on the id="item1" element.
        '''
        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>""")

        listEm = htmlParser.getElementById('items')
        targetEm = htmlParser.getElementById('item1')
        innerSpanEm = htmlParser.getElementById('subItem1')

        # Present everywhere while still attached
        assert listEm.hasChild(targetEm) is True, 'Expected itemsEm to have item1Em as a child.'

        assert htmlParser.getElementById('subItem1') is not None, 'Expected to find id=subItem1'

        assert listEm.contains(targetEm) , 'Expected itemsEm to contain items1Em'
        assert listEm.contains(innerSpanEm) , 'Expected itemsEm to contain subItem1'

        uidsBefore = listEm.getAllNodeUids()
        assert innerSpanEm.uid in uidsBefore

        assert htmlParser.contains(targetEm) , 'Expected parser to contain item1Em via contains'
        assert targetEm in htmlParser, 'Expected parser to contain item1Em via in operator'

        assert targetEm.ownerDocument == htmlParser , 'Expected ownerDocument to be set prior to remove'

        # Take item1 out of the tree
        targetEm.remove()

        # Gone everywhere once detached
        assert listEm.hasChild(targetEm) is False, 'Expected after remove for item1Em to no longer be a child of itemsEm'

        assert htmlParser.getElementById('item1') is None, 'Expected to not be able to find id=item1 after remove'

        assert htmlParser.getElementById('subItem1') is None, 'Expected to not be able to find sub item of id=item1, id=subItem1 after remove.'

        assert targetEm.parentNode is None , 'Expected parentNode on item1Em to be None after remove.'

        assert not listEm.contains(targetEm) , 'Expected itemsEm to not contain items1Em'
        assert not listEm.containsUid(targetEm.uid) , 'Expected itemsEm to not contain items1Em'
        assert not listEm.contains(innerSpanEm) , 'Expected itemsEm to not contain subItem1'

        uidsAfter = listEm.getAllNodeUids()
        assert innerSpanEm.uid not in uidsAfter

        assert not htmlParser.contains(targetEm) , 'Expected parser to not contain item1Em via contains'
        assert targetEm not in htmlParser, 'Expected parser to not contain item1Em via in operator'

        assert targetEm.ownerDocument is None , 'Expected owner document to be unset upon removal'
    def test_ParseStr(self):
        '''
            test_ParseStr - Parse TEST_HTML from a string, then check the id="farm" element:
                              it must exist, have two children, and its first child must contain "Moo".
        '''
        document = AdvancedHTMLParser()
        document.parseStr(TEST_HTML)

        farmEm = document.getElementById('farm')
        assert farmEm , 'Failed to extract data'

        childCount = len(farmEm.children)
        assert childCount == 2 , 'Invalid data from file parsing'

        firstChildText = farmEm.children[0].innerHTML.strip()
        assert firstChildText == 'Moo' , 'Invalid data from file parsing'
Esempio n. 15
0
    def test_ParseStr(self):
        '''
            test_ParseStr - Feed TEST_HTML to parseStr and verify the "farm" element was extracted
                              with two children, the first of which holds the text "Moo".
        '''
        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(TEST_HTML)

        farmTag = htmlParser.getElementById('farm')

        assert farmTag , 'Failed to extract data'
        assert len(farmTag.children) == 2 , 'Invalid data from file parsing'

        leadChild = farmTag.children[0]
        assert leadChild.innerHTML.strip() == 'Moo' , 'Invalid data from file parsing'
    def test_HandleMissClose(self):
        '''
            test_HandleMissClose - Test that HTML with a missed close tag (MISS_CLOSE) parses without raising,
                                     and that the id="one" element and its first child are still reachable.
        '''
        parser = AdvancedHTMLParser()
        try:
            parser.parseStr(MISS_CLOSE)
        except Exception as e:
            # Report the underlying error too, so a failure here is diagnosable
            raise AssertionError('Failed to properly parse invalid HTML with missed close, exception was: %s' %(str(e),))

        oneEm = parser.getElementById('one')
        assert oneEm , 'Failed to find id="one"'
        assert oneEm.children[0].innerHTML.strip() == 'Hello' , 'Could not find child tag'
    def test_HandleMultipleRoot(self):
        '''
            test_HandleMultipleRoot - Test that HTML with multiple root nodes (MULTIPLE_ROOT) parses without raising,
                                        that id="one" can be found, and that two root nodes are reported.
        '''
        parser = AdvancedHTMLParser()
        try:
            parser.parseStr(MULTIPLE_ROOT)
        except Exception as e:
            # Report the underlying error too, so a failure here is diagnosable
            raise AssertionError('Failed to properly parse invalid HTML with multiple root nodes, exception was: %s' %(str(e),))

        oneEm = parser.getElementById('one')
        assert oneEm , 'Failed to find first element'

        rootNodes = parser.getRootNodes()
        assert len(rootNodes) == 2 , 'Expected two root nodes, got: %d' %(len(rootNodes),)
Esempio n. 18
0
    def test_ParseFile(self):
        '''
            test_ParseFile - Parse the file at self.tempFile.name and check the id="farm" element:
                               it must exist, have two children, and its first child must contain "Moo".

                Any exception from parseFile is turned into an AssertionError carrying its text.
        '''
        document = AdvancedHTMLParser()
        sourcePath = self.tempFile.name
        try:
            document.parseFile(sourcePath)
        except Exception as e:
            raise AssertionError('Failed to parse file, exception was: %s' %(str(e),))

        farmEm = document.getElementById('farm')
        assert farmEm , 'Failed to extract data from file parsing'

        childCount = len(farmEm.children)
        assert childCount == 2 , 'Invalid data from file parsing'

        firstChildText = farmEm.children[0].innerHTML.strip()
        assert firstChildText == 'Moo' , 'Invalid data from file parsing'
    def test_ParseFile(self):
        '''
            test_ParseFile - Run parseFile on self.tempFile.name and verify the "farm" element was
                               extracted with two children, the first of which holds the text "Moo".

                A parse error is re-raised as an AssertionError that includes the error text.
        '''
        htmlParser = AdvancedHTMLParser()
        try:
            htmlParser.parseFile(self.tempFile.name)
        except Exception as e:
            raise AssertionError('Failed to parse file, exception was: %s' %(str(e),))

        farmTag = htmlParser.getElementById('farm')

        assert farmTag , 'Failed to extract data from file parsing'
        assert len(farmTag.children) == 2 , 'Invalid data from file parsing'

        leadChild = farmTag.children[0]
        assert leadChild.innerHTML.strip() == 'Moo' , 'Invalid data from file parsing'
Esempio n. 20
0
    def test_multipleRoot(self):
        '''
            test_multipleRoot - Test parsing a document that has two sibling elements at the root level.

                Both roots must be reported by getRootNodes, both must be findable by id,
                and the parser's HTML must equal the two outerHTMLs joined (newlines ignored).
        '''
        parser = AdvancedHTMLParser()

        root1 =  AdvancedTag('div')
        root1.setAttribute('id', 'div1')

        root2 = AdvancedTag('div')
        root2.setAttribute('id', 'div2')

        parser.parseStr(root1.outerHTML + root2.outerHTML)

        assert len(parser.getRootNodes()) == 2, 'Expected two root nodes on tree'

        foundRoot1 = parser.getElementById('div1')
        assert foundRoot1, 'Expected to find id=div1 in multi-root tree'

        foundRoot2 = parser.getElementById('div2')
        # Message reports the id that was actually searched for (div2)
        assert foundRoot2, 'Expected to find id=div2 in multi-root tree'

        # Compare with newlines removed and surrounding whitespace stripped
        combinedHTML = (foundRoot1.outerHTML + foundRoot2.outerHTML).replace('\n', '').strip()
        parsedHTML = parser.getHTML().replace('\n', '').strip()

        assert combinedHTML == parsedHTML, 'Expected single element outerHTMLs to match parser HTML. """\n%s\n""" != """\n%s\n"""' %(combinedHTML, parsedHTML)
Esempio n. 21
0
    def test_cloneNode(self):
        '''
            test_cloneNode - Check that cloneNode copies the attributes of a tag but not its children,
                               and that the original keeps its children.
        '''
        document = AdvancedHTMLParser()
        document.parseStr('''
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        ''')

        sourceEm = document.getElementById('hello')
        cloneEm = sourceEm.cloneNode()

        # Each attribute must read the same on the original and on the clone
        comparedNames = ('id', 'class', 'cheese')
        for name in comparedNames:
            sourceValue = sourceEm.getAttribute(name, None)
            cloneValue = cloneEm.getAttribute(name, None)
            assert sourceValue == cloneValue, 'Expected cloneNode to return an exact copy, got different %s. %s != %s' %(name, repr(sourceValue), repr(cloneValue))

        assert sourceEm.childElementCount == 2 , 'Expected original helloEm to retain two direct children'
        assert cloneEm.childElementCount == 0 , 'Expected clone to NOT copy children'
    def test_cloneNode(self):
        '''
            test_cloneNode - A clone must carry the same "id", "class" and "cheese" attribute values
                               as its source, while having no child elements of its own.
        '''
        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr('''
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        ''')

        originalTag = htmlParser.getElementById('hello')
        duplicateTag = originalTag.cloneNode()

        for attrName in ('id', 'class', 'cheese'):
            originalValue = originalTag.getAttribute(attrName, None)
            duplicateValue = duplicateTag.getAttribute(attrName, None)
            assert originalValue == duplicateValue, 'Expected cloneNode to return an exact copy, got different %s. %s != %s' % (
                attrName, repr(originalValue), repr(duplicateValue))

        # Children stay with the original only
        assert originalTag.childElementCount == 2, 'Expected original helloEm to retain two direct children'
        assert duplicateTag.childElementCount == 0, 'Expected clone to NOT copy children'
    def test_hiddenAttr(self):
        '''
            test_hiddenAttr - Test that the "hidden" attribute works correctly.

                A value-less "hidden" on a tag must read back as True (both via the
                .hidden property and via getAttribute), and must be serialized as a
                bare attribute name with no "=value" part.
        '''
        myHTML = '''<html> <input hidden value="hello" id="abc" />'''

        parser = AdvancedHTMLParser()

        parser.parseStr(myHTML)

        idEm = parser.getElementById('abc')

        # Explicit comparison against True is kept: plain truthiness would accept any non-empty value
        assert idEm.hidden == True , 'Expected .hidden to be True. Got: ' + repr(idEm.hidden)

        # Serialize once and reuse for both checks
        tagStr = str(idEm)

        assert 'hidden' in tagStr , 'Expected "hidden" to show up in tag string. Got: ' + tagStr

        # Make sure we treat this as a real binary attribute
        assert 'hidden=' not in tagStr , 'Expected "hidden" to be output without a value. Got: ' + tagStr

        assert idEm.getAttribute('hidden') is True , 'Expected getAttribute("hidden") to be True. Got: ' + repr(idEm.getAttribute('hidden'))
Esempio n. 24
0
    def test_valueMethod(self):
        '''
            test_valueMethod - The .value property of an input must give back its "value" attribute.
        '''
        document = AdvancedHTMLParser()
        document.parseStr('<input id="item" type="text" value="hello" />')

        inputEm = document.getElementById('item')
        currentValue = inputEm.value

        assert currentValue == 'hello'
    def test_isTagEqual(self):

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classY classX" cheese="gouda">Blah</div>
        </div>
        <div id="sameAttrChildrenSpans">
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classY classX" cheese="gouda">Blah</span>
        </div>
</body></html>''')

        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'

        helloTagsEq = (helloTag.isTagEqual(hello2Tag))

        assert helloTagsEq is False, "Expected tags with same attribute names but different values (id) to not be equal."

        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]
        child3 = sameAttrChildrenEm.children[2]

        assert child1.isTagEqual(
            child2
        ) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(
            child3
        ) is False, "Expected tags with exact same tag name and attributes (but class name in different order) return isTagEqual as False"

        # TODO: Style should compare the same regardless of order

        sameAttrChildrenSpansEm = parser.getElementById(
            'sameAttrChildrenSpans')

        childSpan1 = sameAttrChildrenSpansEm[0]
        childSpan2 = sameAttrChildrenSpansEm[1]
        childSpan3 = sameAttrChildrenSpansEm[2]

        assert childSpan1.isTagEqual(
            childSpan2
        ) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(
            childSpan1
        ) is False, "Expected tags with exact same attributes but different tag name to return isTagEqual as False"

        child1Copy = copy.copy(child1)

        assert child1.isTagEqual(
            child1Copy
        ) is True, "Expected copy of tag to return isTagEqual as True"

        # Do a deep copy so we can change attributes and not affect the former
        child1Copy = copy.copy(child1)

        child1Copy.setAttribute("cheese", "none")

        assert child1.isTagEqual(
            child1Copy
        ) is False, "Expected same tag name same attribute names but different value to return isTagEqual as False"
    def test_tagOperators(self):

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
        </div>
</body></html>''')

        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'

        tagsEq = (helloTag == hello2Tag)

        assert tagsEq is False, "Expected different tags with same attributes names to not be =="

        tagsNe = (helloTag != hello2Tag)

        assert tagsNe is True, "Expected different tags with same attributes names to be !="

        sameTagEq = (helloTag == helloTag)

        assert sameTagEq is True, "Expected same tag to == itself"

        diffTagsEq = (helloTag == goodbyeTag)

        assert diffTagsEq is False, "Expected different tags with different attributes to not be =="

        diffTagsNe = (helloTag != goodbyeTag)

        assert diffTagsNe is True, "Expected different tags with different attributes to be !="

        helloTagCopy = copy.copy(helloTag)

        copyEq = (helloTag == helloTagCopy)

        assert copyEq is False, "Expected copy of tag to not == original"

        copyNe = (helloTag != helloTagCopy)

        assert copyNe is True, "Expected copy of tag to != original"

        helloTagCopyRecon = AdvancedTag(helloTag.tagName,
                                        helloTag.getAttributesList(),
                                        helloTag.isSelfClosing)

        copyEq = (helloTag == helloTagCopyRecon)

        assert copyEq is False, "Expected reconstruction of tag to not == original"

        copyNe = (helloTag != helloTagCopyRecon)

        assert copyNe is True, "Expected reconstruction of tag to != original"

        helloTagFetch2 = parser.getElementById('hello')

        fetchEq = (helloTag == helloTagFetch2)

        assert fetchEq is True, "Expected fetching the same tag is =="

        fetchNe = (helloTag != helloTagFetch2)

        assert fetchNe is False, "Expected fetching the same tag to not be !="

        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]

        childrenEq = (child1 == child2)

        assert childrenEq is False, "Expected elements with exact same attributes and values but different individual tags to not be =="

        childrenNe = (child1 != child2)

        assert childrenNe is True, "Expected elements with exact same attributes and values but different individual tags to be !="
Esempio n. 27
0
    def test_tagOperators(self):

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
        </div>
</body></html>''')


        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'

        tagsEq = ( helloTag == hello2Tag )

        assert tagsEq is False , "Expected different tags with same attributes names to not be =="

        tagsNe = ( helloTag != hello2Tag )

        assert tagsNe is True, "Expected different tags with same attributes names to be !="

        sameTagEq = ( helloTag == helloTag )

        assert sameTagEq is True, "Expected same tag to == itself"

        diffTagsEq = (helloTag == goodbyeTag)

        assert diffTagsEq is False, "Expected different tags with different attributes to not be =="

        diffTagsNe = (helloTag != goodbyeTag)

        assert diffTagsNe is True, "Expected different tags with different attributes to be !="

        helloTagCopy = copy.copy(helloTag)

        copyEq = (helloTag == helloTagCopy)

        assert copyEq is False, "Expected copy of tag to not == original"

        copyNe = (helloTag != helloTagCopy)

        assert copyNe is True, "Expected copy of tag to != original"

        helloTagCopyRecon = AdvancedTag(helloTag.tagName, helloTag.getAttributesList(), helloTag.isSelfClosing)

        copyEq = (helloTag == helloTagCopyRecon)

        assert copyEq is False , "Expected reconstruction of tag to not == original"

        copyNe = (helloTag != helloTagCopyRecon)

        assert copyNe is True, "Expected reconstruction of tag to != original"

        helloTagFetch2 = parser.getElementById('hello')

        fetchEq = (helloTag == helloTagFetch2)

        assert fetchEq is True, "Expected fetching the same tag is =="

        fetchNe = (helloTag != helloTagFetch2)

        assert fetchNe is False, "Expected fetching the same tag to not be !="

        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]

        childrenEq = (child1 == child2)

        assert childrenEq is False, "Expected elements with exact same attributes and values but different individual tags to not be =="

        childrenNe = (child1 != child2)

        assert childrenNe is True, "Expected elements with exact same attributes and values but different individual tags to be !="
Esempio n. 28
0
    def test_isTagEqual(self):

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classY classX" cheese="gouda">Blah</div>
        </div>
        <div id="sameAttrChildrenSpans">
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classY classX" cheese="gouda">Blah</span>
        </div>
</body></html>''')


        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'


        helloTagsEq = (helloTag.isTagEqual(hello2Tag))

        assert helloTagsEq is False, "Expected tags with same attribute names but different values (id) to not be equal."

        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]
        child3 = sameAttrChildrenEm.children[2]

        assert child1.isTagEqual(child2) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(child3) is False, "Expected tags with exact same tag name and attributes (but class name in different order) return isTagEqual as False"

        # TODO: Style should compare the same regardless of order

        sameAttrChildrenSpansEm = parser.getElementById('sameAttrChildrenSpans')

        childSpan1 = sameAttrChildrenSpansEm[0]
        childSpan2 = sameAttrChildrenSpansEm[1]
        childSpan3 = sameAttrChildrenSpansEm[2]

        assert childSpan1.isTagEqual(childSpan2) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(childSpan1) is False, "Expected tags with exact same attributes but different tag name to return isTagEqual as False"

        child1Copy = copy.copy(child1)

        assert child1.isTagEqual(child1Copy) is True, "Expected copy of tag to return isTagEqual as True"

        # Do a deep copy so we can change attributes and not affect the former
        child1Copy = copy.copy(child1)

        child1Copy.setAttribute("cheese", "none")

        assert child1.isTagEqual(child1Copy) is False, "Expected same tag name same attribute names but different value to return isTagEqual as False"
    def test_parsing(self):
        '''
            test_parsing - Test that the parser properly handles several cases of class attribute,
                             and that they are mutable in expected ways thereafter.
        '''

        someHtml = '''<html><body>
        <div class="one two three" id="firstDiv">Some text</div>
        <div id="secondDiv">This one is empty</div>
        <div class="three ZZ AA" id="thirdDiv">Last one</div>
        <div class="" id="emptyClassDiv">Empty</div>
</body></html>'''

        document = AdvancedHTMLParser()
        document.parseStr(someHtml)

        firstDiv = document.getElementById('firstDiv')
        secondDiv = document.getElementById('secondDiv')
        thirdDiv = document.getElementById('thirdDiv')
        emptyClassDiv = document.getElementById('emptyClassDiv')

        assert firstDiv, 'Failed to get element by id="firstDiv"'
        assert secondDiv, 'Failed to get element by id="secondDiv"'
        assert thirdDiv, 'Failed to get element by id="thirdDiv"'
        assert emptyClassDiv, 'Failed to get element by id="emptyClassDiv"'

        firstDivHTML = firstDiv.getHTML()
        secondDivHTML = secondDiv.getHTML()
        thirdDivHTML = thirdDiv.getHTML()
        emptyClassDivHTML = emptyClassDiv.getHTML()

        assert 'class="one two three"' in firstDivHTML, 'Expected string of class to show up in parsed html. Got: ' + firstDivHTML
        assert 'class=' not in secondDivHTML, 'Expected class attribute to not be present when no class set. Got: ' + secondDivHTML
        assert 'class="three ZZ AA"' in thirdDivHTML, 'Expected string of class to show up in parsed html. Got: ' + thirdDivHTML
        assert 'class=' not in emptyClassDivHTML, 'Expected class attribute to not be present when class set to empty in parsed html, i.e. class="". Got: ' + emptyClassDivHTML

        assert firstDiv.className == "one two three", "Expected parsed className to match 'one two three' Got: " + repr(
            firstDiv.className)
        assert secondDiv.className == "", "Expected parsed lack of className to match empty string, \"\" Got: " + repr(
            secondDiv.className)
        assert thirdDiv.className == "three ZZ AA", "Expected parsed className to match 'three ZZ AA' Got: " + repr(
            thirdDiv.className)

        assert emptyClassDiv.className == "", "Expected parse empty className to remain empty string. Got: " + repr(
            emptyClassDiv.className)

        assert firstDiv.classList == ["one", "two", "three"], 'wrong classList'
        assert secondDiv.classList == [], "wrong classList"
        assert thirdDiv.classList == ["three", "ZZ", "AA"], "wrong classList"
        assert emptyClassDiv.classList == [], "Wrong classList"

        # Check that we can modify and it shows up
        firstDiv.setAttribute('class', 'cheese is good')

        firstDivHTML = firstDiv.getHTML()

        assert 'class="cheese is good"' in firstDivHTML, "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected in tag attribute. Got: " + firstDivHTML

        assert firstDiv.className == "cheese is good", "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + firstDiv.className

        assert firstDiv.classList == [
            "cheese", "is", "good"
        ], "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + repr(
            firstDiv.classList)
Esempio n. 30
0
    def test_getAttributesList(self):
        '''
            test_getAttributesList - Check that getAttributesList yields the expected
                                       (name, value) pairs for id, style, class and data,
                                       and that changing a style built from the returned
                                       list does not change the style on the element.
        '''
        document = AdvancedHTMLParser()

        document.parseStr(
            '<div id="hello" style="display: none; width: 500px; padding-left: 15px;" class="One Two" data="Yes">Hello</div>'
        )

        helloTag = document.getElementById('hello')

        assert helloTag.getAttribute('id', '') == 'hello', 'Got unxpected element'

        attrPairs = helloTag.getAttributesList()

        # Names of the expected attributes encountered while walking the list
        seenNames = set()

        for name, value in attrPairs:
            if name == 'id':
                assert value == 'hello', 'Attribute "id" did not have expected value "hello", got "%s"' % (value, )

            elif name == 'style':
                parsedStyle = StyleAttribute(value)

                assert parsedStyle.display == 'none', 'Got unexpected value for display in style copy. Expected "none", got "%s"' % (parsedStyle.display, )
                assert parsedStyle.width == '500px', 'Got unexpected value for width in style copy. Expected "500px", got "%s"' % (parsedStyle.width, )
                assert parsedStyle.paddingLeft == '15px', 'Got unexpected value for padding-left. Expected "15px", got "%s"' % (parsedStyle.paddingLeft, )

            elif name == 'class':
                assert value == 'One Two', 'Expected class name to equal "One Two", got: %s' % (value, )

            elif name == 'data':
                assert value == 'Yes', 'Expected attribute "data" to have the value "Yes", got: %s' % (value, )

            else:
                raise AssertionError('Got unexpected attribute in copy: (%s, %s)' % (name, value))

            seenNames.add(name)

        assert 'id' in seenNames, 'Did not find id element in attribute list'
        assert 'style' in seenNames, 'Did not find style element in attribute list'
        assert 'class' in seenNames, 'Did not find class element in attribute list'
        assert 'data' in seenNames, 'Did not find data element in attribute list'

        # The list is expected to hold copies: modify the style taken from it ...
        for name, value in attrPairs:
            if name != 'style':
                continue

            # The value is wrapped when it is not already a StyleAttribute,
            #   in case a StyleAttribute is ever returned in place of the str
            styleObj = value if isinstance(value, StyleAttribute) else StyleAttribute(value)
            styleObj.paddingTop = '10px'

        # ... and the original element's style must not have picked up the change
        assert 'padding-top' not in str(helloTag.style)
Esempio n. 31
0
    def test_domAttributes(self):

        parser = AdvancedHTMLParser()

        parser.parseStr(''''<html>
        <body>
            <div id="someDiv" class="one two" align="left">
                <span>Some Child</span>
            </div>

        </body>
    </html>
        ''')

        someDivEm = parser.getElementById('someDiv')

        assert someDivEm, 'Failed to get element by id="someDiv"'

        attributes = someDivEm.attributesDOM

        assert attributes[
            'id'].value == 'someDiv', 'Expected attributes["id"].value to be equal to "someDiv"'

        assert attributes[
            'class'].value == 'one two', "Expected attributes['class'].value to be equal to 'one two'"
        assert attributes[
            'align'].value == 'left', "Expected attributes['align'].value to be equal to 'left'"

        assert attributes[
            'notset'] is None, 'Expected attributes["notset"] to be None'

        assert attributes[
            'id'].ownerElement == someDivEm, 'Expected ownerElement to be "someDivEm"'

        assert attributes[
            'id'].ownerDocument == parser, 'Expected ownerDocument to be parser'

        assert str(
            attributes['id']
        ) == 'id="someDiv"', 'Expected str of attribute to be \'id="someDiv"\' but got: %s' % (
            str(attributes['id']), )

        attributes['align'].value = 'right'

        assert attributes[
            'align'].value == 'right', 'Expected to be able to change attribute value by assigning .value. Failed on "align".'

        assert someDivEm.getAttribute(
            'align'
        ) == 'right', 'Expected that changing a property in the attributes map would change the value in parent element'

        attrNames = []
        for attrName in attributes:
            attrNames.append(attrName)

        assert 'id' in attrNames, 'Expected "id" to be returned from iter on attributes'
        assert 'class' in attrNames, 'Expected "class" to be returned from iter on attributes'
        assert 'align' in attrNames, 'Expected "align" to be returned from iter on attributes'

        clonedAttributes = {
            attrName: attributes[attrName].cloneNode()
            for attrName in attrNames
        }

        for attrName in ('id', 'class', 'align'):
            attrValue = clonedAttributes[attrName].value
            origValue = attributes[attrName].value

            assert attrValue == origValue, 'Expected cloned attribute %s to match original, but did not. (clone) %s != %s (orig)' % (
                attrName, attrValue, origValue)

        assert clonedAttributes[
            'id'].ownerElement is None, 'Expected clone to clear ownerElement'
        assert clonedAttributes[
            'id'].ownerDocument == parser, 'Expected clone to retain same ownerDocument'

        clonedAttributes['align'].value = 'middle'

        assert clonedAttributes[
            'align'].value == 'middle', 'Expected to be able to change value on cloned attribute'
        assert attributes[
            'align'].value == 'right', 'Expected change on clone to not affect original'

        assert someDivEm.getAttribute(
            'align'
        ) == 'right', 'Expected change on clone to not affect element'

        assert attributes.getNamedItem('id') == attributes[
            'id'], 'Expected getNamedItem("id") to be the same as attributes["id"]'