Example #1
0
    def test_absimgsrc(self):
        html1_in = '''<div>
<p>Hi there.</p>
<img src="http://foo.example.com/path/to/bananas.jpg" alt="yellow fruit" width="23" height="480">
<img src="/_mwu/bananatree.jpg" alt="where they come from">
<p>Here's some more.</p>
<img src="/fruitpics/strawberry.jpg" alt="berry good">
<p>and then:
<img src="standard/nrolling-kiwi.gif" alt="delicious but takes time to peel">
<img src=" alt="GIF data URL"/>
<img src="DATA:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAB4AAAAkCAMAAACpD3pbAAAAYFBMVEVZbYftuWNoP0iYbFJDHkBcNEWAVk3JmFzVo17hrmG8jVmQk5+8sJOwgldse4xPKUN0S0r3yXdLJULcwpI4FD75x2f5yniSmJ07Fz/WpF6MYU/wyoShn5XTvZT84bL5xGZPqaqJAAAAdUlEQVQ4y+XTRw7EQAgEQJg8jmtvdOz//9IvaE57c19LQoBAvjuNiMgJmkdrMl69yXjbfNyFtfMN5zo4P0RlPCYAzjN+KoDsGIcKoKGcRkAjLb78JhcS7TyuJRc6WIrWWrqgFodsLrXc95j+yq3Jm/X+n7mXC9defIzz7p9PAAAAAElFTkSuQmCC" alt="PNG data URL"/>
<img src="" alt="Pathological HTML!">
</p>
</div>'''
        html1_out = '''<div>
<p>Hi there.</p>
<img src="http://foo.example.com/path/to/bananas.jpg" alt="yellow fruit" width="23" height="480">
<img src="/_mwu/bananatree.jpg" alt="where they come from">
<p>Here's some more.</p>
<img src="http://desktop.example.com/fruitpics/strawberry.jpg" alt="berry good">
<p>and then:
<img src="http://desktop.example.com/articles/standard/nrolling-kiwi.gif" alt="delicious but takes time to peel">
<img src=" alt="GIF data URL"/>
<img src="DATA:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAB4AAAAkCAMAAACpD3pbAAAAYFBMVEVZbYftuWNoP0iYbFJDHkBcNEWAVk3JmFzVo17hrmG8jVmQk5+8sJOwgldse4xPKUN0S0r3yXdLJULcwpI4FD75x2f5yniSmJ07Fz/WpF6MYU/wyoShn5XTvZT84bL5xGZPqaqJAAAAdUlEQVQ4y+XTRw7EQAgEQJg8jmtvdOz//9IvaE57c19LQoBAvjuNiMgJmkdrMl69yXjbfNyFtfMN5zo4P0RlPCYAzjN+KoDsGIcKoKGcRkAjLb78JhcS7TyuJRc6WIrWWrqgFodsLrXc95j+yq3Jm/X+n7mXC9defIzz7p9PAAAAAElFTkSuQmCC" alt="PNG data URL"/>
<img src="" alt="Pathological HTML!">
</p>
</div>'''
        desktop_url = 'http://desktop.example.com/articles/delicious.html'
        from mobilize.filters.misc import absimgsrc
        elem = html.fromstring(html1_in)
        absimgsrc(elem, desktop_url)
        result = elem2str(elem)
        self.assertSequenceEqual(normxml(html1_out), normxml(result))
Example #2
0
 def test_GoogleAnalytics_none(self):
     '''
     Negative case: when no GA tracking codes are expected to be found,
     the component should render as an empty wrapper div.
     '''
     from mobilize.components import GoogleAnalytics
     # Use a context manager so the fixture file handle is closed
     # deterministically instead of being left to the garbage collector.
     with open(data_file_path('whole-html', 'cnn.html')) as fixture:
         doc_str = fixture.read()
     doc = html.fromstring(doc_str)
     noga = GoogleAnalytics()
     noga.extract(doc)
     noga.process()
     actual = normxml(noga.html())
     expected = normxml('''<div class="mwu-elem" id="mwu-elem-ga"></div>''')
     self.assertSequenceEqual(expected, actual)
Example #3
0
    def test_noattribs(self):
        ELEMSTR1 = '''<table width="600" style="color: fuscia;">
<tr><td width="200">one</td><td>two</td></tr>
<tr><td>three</td><td>four</td></tr>
</table>
'''
        ELEMSTR2 = '''<div>
<table width="600" style="color: fuscia;">
<tr><td width="200">one</td><td>two</td></tr>
<tr><td>three</td><td>four</td></tr>
</table>
</div>
'''
        testdata = [
            {'in_str' : ELEMSTR1,
             'tags' : ['table'],
             'attribs' : ['width', 'style'],
             'out_str' : '''<table>
<tr><td width="200">one</td><td>two</td></tr>
<tr><td>three</td><td>four</td></tr>
</table>
'''
             },
            {'in_str' : ELEMSTR2,
             'tags' : ['table'],
             'attribs' : ['width', 'style'],
             'out_str' : '''<div>
<table>
<tr><td width="200">one</td><td>two</td></tr>
<tr><td>three</td><td>four</td></tr>
</table>
</div>
'''
             },
            {'in_str' : ELEMSTR2,
             'tags' : ['table', 'td'],
             'attribs' : ['width', 'style'],
             'out_str' : '''<div>
<table>
<tr><td>one</td><td>two</td></tr>
<tr><td>three</td><td>four</td></tr>
</table>
</div>
'''
             },
            ]
        from mobilize.filters import noattribs
        for ii, td in enumerate(testdata):
            elem = html.fragment_fromstring(td['in_str'], create_parent=False)
            noattribs(elem, td['tags'], td['attribs'])
            expected = normxml(td['out_str'])
            actual = normxml(elem2str(elem))
            self.assertSequenceEqual(expected, actual)
Example #4
0
    def test_collapse(self):
        '''
        Test for collapsing filter application mode
        '''
        from mobilize.components import (
            XPath,
            FILT_EACHELEM,
            FILT_COLLAPSED,
            )
        def testfilter(elem):
            if elem.tag == 'a':
                elem.attrib['class'] = 'foo'
            for ii, child in enumerate(elem):
                if 'a' == child.tag:
                    child.attrib['id'] = 'child-%d' % ii
            
        htmlstr1 = '''<a href="/">a</a>
<a href="/">b</a>
<a href="/">c</a>
'''
        nocollapse = XPath('//a', postfilters=[testfilter], filtermode=FILT_EACHELEM)
        nocollapse.extract(html.fromstring(htmlstr1))
        actual = nocollapse.process('idname')
        actual_str = html.tostring(actual)
        expected_str = '''<div class="mwu-elem" id="idname">
<a href="/" class="foo">a</a>
<a href="/" class="foo">b</a>
<a href="/" class="foo">c</a>
</div>
'''
        self.assertSequenceEqual(normxml(expected_str), normxml(actual_str))
        
        expected_str = '''<div id="idname">
<a href="/">a</a>
<a href="/">b</a>
<a href="/">c</a>
</div>
'''
        collapse = XPath('//a', postfilters=[testfilter], filtermode=FILT_COLLAPSED)
        collapse.extract(html.fromstring(htmlstr1))
        actual = collapse.process('idname')
        actual_str = html.tostring(actual)
        expected_str = '''<div class="mwu-elem" id="idname">
<a href="/" id="child-0">a</a>
<a href="/" id="child-1">b</a>
<a href="/" id="child-2">c</a>
</div>
'''
        self.assertSequenceEqual(normxml(expected_str), normxml(actual_str))
Example #5
0
    def test_extract_csspath(self):
        '''
        CssPath components should extract the elements matching their CSS
        selector from each fixture document and wrap them in a div carrying
        the configured class and the id passed to process().
        '''
        from mobilize.components import CssPath

        testdata = [
            {'datafile' : 'a.xml',
             'components' : [CssPath('div#happy', classvalue='some-class')],
             'extracted' : ['<div class="some-class" id="some-id"><div id="happy">lucky</div></div>'],
             },
            {'datafile' : 'b.xml',
             'components' : [CssPath('div#joyful', classvalue='some-class')],
             'extracted' : ['<div class="some-class" id="some-id"><div id="joyful">fun</div></div>'],
             },
            {'datafile' : 'c.xml',
             'components' : [CssPath('p.graceful', classvalue='some-class')],
             'extracted' : ['<div class="some-class" id="some-id"><p class="graceful">laughing</p></div>'],
             },
            {'datafile' : 'd.xml',
             'components' : [CssPath('p.graceful', classvalue='some-class')],
             'extracted' : ['<div class="some-class" id="some-id"><p class="skipping graceful enthusiastic">laughing</p></div>'],
             },
            {'datafile' : 'e.xml',
             'components' : [CssPath('p.graceful', classvalue='some-class')],
             'extracted' : ['<div class="some-class" id="some-id"><p class="skipping graceful enthusiastic">laughing</p><p class="graceful">enthusiastic</p></div>'],
             },
            ]
        for ii, td in enumerate(testdata):
            # Close each fixture file promptly rather than leaking one
            # open handle per loop iteration.
            with open(data_file_path('extract_celems', td['datafile'])) as datafile:
                doc = html.fromstring(datafile.read())
            for sel in td['components']:
                sel.extract(doc)
                sel.process('some-id')
            expected = [normxml(markup) for markup in td['extracted']]
            actual = [normxml(sel.html()) for sel in td['components']]
            msg = 'e: %s, a: %s [%d %s]' % (expected, actual, ii, td['datafile'])
            self.assertEqual(expected, actual, msg)
Example #6
0
    def test_relhyperlinks(self):
        from mobilize.filters import (
            relhyperlinks,
            relhyperlinks_full,
            )
        htmlA='''
<div>
<ul>
  <li><a href="http://alpha.com">Alpha home page</a></li>
  <li><a href="http://www.alpha.com">Alt Alpha home page</a></li>
  <li><a href="http://beta.com">Beta home page</a></li>
</ul>

<p>The beautiful <a href="/about/birds/cranes">white cranes</a> of <a href="http://alpha.com/places/Lancashire">Lancashire</a> drink surprising amounts of <a href="https://alpha.com/secure/about/drinks/coffee">coffee</a>.</p> </div>
'''

        root1 = html.fromstring(htmlA)
        relhyperlinks(root1, 'alpha.com')
        actual1 = html.tostring(root1)
        expected1 = '''
<div>
<ul>
  <li><a href="/">Alpha home page</a></li>
  <li><a href="http://www.alpha.com">Alt Alpha home page</a></li>
  <li><a href="http://beta.com">Beta home page</a></li>
</ul>

<p>The beautiful <a href="/about/birds/cranes">white cranes</a> of <a href="/places/Lancashire">Lancashire</a> drink surprising amounts of <a href="https://alpha.com/secure/about/drinks/coffee">coffee</a>.</p> </div>
'''
        self.assertSequenceEqual(normxml(expected1), normxml(actual1))

        root2 = html.fromstring(htmlA)
        relhyperlinks_full(root2, ['alpha.com', 'www.alpha.com'], ['http', 'https'])
        actual2 = html.tostring(root2)
        expected2 = '''
<div>
<ul>
  <li><a href="/">Alpha home page</a></li>
  <li><a href="/">Alt Alpha home page</a></li>
  <li><a href="http://beta.com">Beta home page</a></li>
</ul>

<p>The beautiful <a href="/about/birds/cranes">white cranes</a> of <a href="/places/Lancashire">Lancashire</a> drink surprising amounts of <a href="/secure/about/drinks/coffee">coffee</a>.</p> </div>
'''
        self.assertSequenceEqual(normxml(expected2), normxml(actual2))
Example #7
0
    def test_style(self):
        '''test that style attribute is set properly'''
        style = 'background-color: red; font-size: large;'
        sourcestr = '''<ul>
  <li>Dre</li>
  <li>Snoop</li>
  <li>Thug Life</li>
</ul>'''
        extracted = DirectExtracted('', style=style)
        extracted._sourcestr = sourcestr
        extracted.extract(html.fromstring(sourcestr))
        extracted.process('foo')
        rendered = extracted.elem
        # verify that the first child is the source string...
        firstchild_elem = rendered[0]
        self.assertSequenceEqual(normxml(sourcestr), normxml(html.tostring(firstchild_elem)))
        # check the style attribute
        self.assertEqual(style, rendered.attrib['style'])
Example #8
0
    def test_select_multiple(self):
        '''
        Test that extracted components can accept multiple selectors
        '''
        from mobilize.components import CssPath, XPath
        selectors = [
            'nav',
            'section',
            ]
        src_html = '''<div>
<nav>
  <a href="/A">A</a>
  <a href="/B">B</a>
</nav>
<table><tr><td>&nbsp;</td><td>I'm using tables for layout!!! DUR</td></tr></table>
<section>
<p>Hello.</p>
</section>
</div>
'''
        expected_html = '''<div class="mwu-elem" id="foo">
<nav>
  <a href="/A">A</a>
  <a href="/B">B</a>
</nav>
<section>
<p>Hello.</p>
</section>
</div>'''
        # test for CssPath
        css_component = CssPath(selectors, idname='foo')
        css_component.extract(html.fromstring(src_html))
        extracted = css_component.process()
        extracted_str = html.tostring(extracted)
        self.assertSequenceEqual(normxml(expected_html), normxml(extracted_str))

        # test for XPath
        x_component = XPath(selectors, idname='foo')
        x_component.extract(html.fromstring(src_html))
        extracted = x_component.process()
        extracted_str = html.tostring(extracted)
        self.assertSequenceEqual(normxml(expected_html), normxml(extracted_str))
Example #9
0
    def test_squeezebr(self):
        from mobilize.filters import squeezebr
        testdata = [
            {'in_str' : '''<p>Hi.</p>''',
             'out_str' : '''<p>Hi.</p>''',
             },
            {'in_str' : '''<p>Hi.<br>Hey.</p>''',
             'out_str' : '''<p>Hi.<br>Hey.</p>''',
             },
            {'in_str' : '''<p>Hi.<br><br>Hey.</p>''',
             'out_str' : '''<p>Hi.<br>Hey.</p>''',
             },
            {'in_str' : '''<p>Hi.<br/><br/><br/><br/><br/><br/><br/><br/><br/>Hey.</p>''',
             'out_str' : '''<p>Hi.<br>Hey.</p>''',
             },
            {'in_str' : '''<div>
<p>Hi.<br><br>Hey.</p>
<p>This is some more text
<br><br><br><br><br><img src="foo.png" alt="foo"/>
</p>
</div>''',
             'out_str' : '''<div>
<p>Hi.<br>Hey.</p>
<p>This is some more text
<br><img src="foo.png" alt="foo">
</p>
</div>
''',
             },
            {'in_str' : '''<p>Hi.<br>    <br>Hey.</p>''',
             'out_str' : '''<p>Hi.<br>Hey.</p>''',
             },
            {'in_str' : '''<p>Hi.<br>How.<br>Hey.</p>''',
             'out_str' : '''<p>Hi.<br>How.<br>Hey.</p>''',
             },
            ]
        for ii, td in enumerate(testdata):
            elem = html.fragment_fromstring(td['in_str'], create_parent=False)
            squeezebr(elem)
            expected = normxml(td['out_str'])
            actual = normxml(elem2str(elem))
            self.assertSequenceEqual(expected, actual)
Example #10
0
    def test_abslinkfilesrc(self):
        from mobilize.filters import abslinkfilesrc
        html_in = '''<div>
    <p><a href="marketstudy.xls">Market Study</a></p>
    <p><a href="/whitepapers/fill-in-blank.doc">Make your own white paper!</a></p>
    <p><a href="/whitepapers/widgets.pdf">Widget White Paper</a></p>
    <p><a href="">HTML Pathology 101</a></p>
</div>'''
        html_out ='''<div>
    <p><a href="http://example.com/about/marketstudy.xls">Market Study</a></p>
    <p><a href="/whitepapers/fill-in-blank.doc">Make your own white paper!</a></p>
    <p><a href="http://example.com/whitepapers/widgets.pdf">Widget White Paper</a></p>
    <p><a href="">HTML Pathology 101</a></p>
</div>'''
        desktop_url = 'http://example.com/about/papers.html'
        extensions=['.xls', '.pdf']

        elem = html.fromstring(html_in)
        abslinkfilesrc(elem, desktop_url, extensions)
        result = html.tostring(elem)
        self.assertSequenceEqual(normxml(html_out), normxml(result))
Example #11
0
    def test_resizeiframe(self):
        from mobilize.filters import resizeiframe
        testdata = [
            {'iframe_str' : '''<p>
<iframe width="533" height="330" frameborder="0" allowfullscreen="" src="http://www.youtube.com/embed/HE6uqPPrVfo" title="YouTube video player"></iframe>
</p>''',
             'resized_str' : '''<p>
<iframe width="280" height="173" frameborder="0" allowfullscreen="" src="http://www.youtube.com/embed/HE6uqPPrVfo" title="YouTube video player"></iframe>
</p>''',
             },
            {'iframe_str' : '''<iframe width="533" height="330" frameborder="0" allowfullscreen="" src="http://www.youtube.com/embed/HE6uqPPrVfo" title="YouTube video player"></iframe>''',
             'resized_str' : '''<iframe width="280" height="173" frameborder="0" allowfullscreen="" src="http://www.youtube.com/embed/HE6uqPPrVfo" title="YouTube video player"></iframe>''',
             },
            {'iframe_str' : '''<p>Nothing to see here.</p>''',
             'resized_str' : '''<p>Nothing to see here.</p>''',
             },
            ]
        for ii, td in enumerate(testdata):
            iframe_elem = html.fragment_fromstring(td['iframe_str'], create_parent=False)
            resizeiframe(iframe_elem)
            self.assertSequenceEqual(normxml(td['resized_str']), normxml(elem2str(iframe_elem)))
Example #12
0
    def test_innerhtml(self):
        from mobilize.components import XPath
        html_str = '''<table><tr><td>Hello</td></tr></table>'''
        # test for innerhtml=False
        component_f = XPath('//td', idname='foo', innerhtml=False)
        component_f.extract(html.fromstring(html_str))
        extracted = component_f.process()
        extracted_str = html.tostring(extracted)
        expected = '<div class="mwu-elem" id="foo"><td>Hello</td></div>'
        e = normxml(expected)
        a = normxml(extracted_str)
        self.assertSequenceEqual(e, a)
        
        # test for innerhtml=True
        component_t = XPath('//td', idname='foo', innerhtml=True)
        component_t.extract(html.fromstring(html_str))
        extracted = component_t.process()
        extracted_str = html.tostring(extracted)
        expected = '<div class="mwu-elem" id="foo">Hello</div>'
        self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
        
        # test for ineffectiveness of innerhtml=True with multiple matching elements
        component_t = XPath('//td', idname='foo', innerhtml=True)
        component_t.extract(html.fromstring('''
<table><tr>
<td>Hello</td>
<td>Goodbye</td>
</tr></table>
'''))
        extracted = component_t.process()
        extracted_str = html.tostring(extracted)
        expected = '<div class="mwu-elem" id="foo"><td>Hello</td><td>Goodbye</td></div>'
        self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
Example #13
0
    def test_formcontroltypes(self):
        from mobilize.filters import formcontroltypes
        instr = '''<form>
<dl>
<dt>Name</dt>
<dd><input type="text" name="name"/></dd>
<dt>Email</dt>
<dd><input type="email" name="email"/></dd>
<dt>Favorite color</dt>
<dd>
  <ul>
    <li><input type="radio" name="color" value="red" class="nonstandard"/>Red</li>
    <li><input type="radio" name="color" value="blue" class="nonstandard"/>Blue</li>
    <li><input type="radio" name="color" value="green" class="nonstandard"/>Green</li>
  </ul>
</dd>
</dl>
</form>'''
        
        expected = '''<form>
<dl>
<dt>Name</dt>
<dd><input type="text" name="name" class="mwu-fc-input-text"/></dd>
<dt>Email</dt>
<dd><input type="email" name="email" class="mwu-fc-input-email"/></dd>
<dt>Favorite color</dt>
<dd>
  <ul>
    <li><input type="radio" name="color" value="red" class="nonstandard mwu-fc-input-radio"/>Red</li>
    <li><input type="radio" name="color" value="blue" class="nonstandard mwu-fc-input-radio"/>Blue</li>
    <li><input type="radio" name="color" value="green" class="nonstandard mwu-fc-input-radio"/>Green</li>
  </ul>
</dd>
</dl>
</form>'''
        root_elem = html.fromstring(instr)
        formcontroltypes(root_elem)
        actual = html.tostring(root_elem)
        self.assertSequenceEqual(normxml(expected), normxml(actual))
Example #14
0
 def test_formaction(self):
     '''
     formaction should rewrite absolute form action URLs.

     Per the fixtures: without a urlprefix the action becomes a
     site-relative path; with a urlprefix the path is re-rooted under that
     prefix; an action that is already relative is unchanged.
     '''
     from mobilize.filters import formaction
     testdata = [
         {'form_html_in'  : '''<div><form action="http://example.com/foo/" ><input type="text" name="bar"></form></div>''',
          'form_html_out' : '''<div><form action="/foo/" ><input type="text" name="bar"></form></div>''',
          },
         {'form_html_in'  : '''<div><form action="/foo/" ><input type="text" name="bar"></form></div>''',
          'form_html_out' : '''<div><form action="/foo/" ><input type="text" name="bar"></form></div>''',
          },
         {'form_html_in'  : '''<div><form action="http://example.com/foo/" ><input type="text" name="bar"></form></div>''',
          'urlprefix'     : 'https://mobilewebup.com/',
          'form_html_out' : '''<div><form action="https://mobilewebup.com/foo/" ><input type="text" name="bar"></form></div>''',
          },
         ]
     for ii, td in enumerate(testdata):
         elem = html.fromstring(td['form_html_in'])
         # Only pass urlprefix when the case defines one, so the filter's
         # own default is exercised otherwise.
         if 'urlprefix' in td:
             formaction(elem, td['urlprefix'])
         else:
             formaction(elem)
         expected = normxml(td['form_html_out'])
         actual = normxml(html.tostring(elem))
         # Report the entry index so a failure identifies the offending case.
         self.assertSequenceEqual(expected, actual, 'testdata[%d]' % ii)
Example #15
0
    def test__html_fromstring(self):
        from mobilize.handlers import _html_fromstring

        html_ref = '''<!doctype html>
<html>
  <head><title>Hey</title></head>
  <body>
    <h1>Test Page</h1>
    <p>Have a nice day!</p>
  </body>
</html>'''
        
        html_xml_encoding1 = '''<?xml version="1.0" encoding="UTF-8"?>
<!doctype html>
<html>
  <head><title>Hey</title></head>
  <body>
    <h1>Test Page</h1>
    <p>Have a nice day!</p>
  </body>
</html>'''
        
        testdata = [
        ('html_ref', html_ref),

        # with leading newlines
        ('html_plain1', '''


<!doctype html>
<html>
  <head><title>Hey</title></head>
  <body>
    <h1>Test Page</h1>
    <p>Have a nice day!</p>
  </body>
</html>'''),
        
        # no doctype
        ('html_plain2', '''<html>
  <head><title>Hey</title></head>
  <body>
    <h1>Test Page</h1>
    <p>Have a nice day!</p>
  </body>
</html>'''),
        
        # With XML encoding
        ('html_xml_encoding1', html_xml_encoding1),

        # With XML encoding, but with leading newline thrown in for good measure
        ('html_xml_encoding2', '''
<?xml version="1.0" encoding="UTF-8"?>
<!doctype html>
<html>
  <head><title>Hey</title></head>
  <body>
    <h1>Test Page</h1>
    <p>Have a nice day!</p>
  </body>
</html>'''),

        # With XML encoding, but with a truly disturbing number of leading newlines.  IT WILL HAPPEN
        ('html_xml_encoding3', '\n' * 1024 + html_xml_encoding1),
        
        # With XML encoding and generous newlines interspersed
        ('html_xml_encoding4', '''

<?xml version="1.0" encoding="UTF-8"?>


<!doctype html>


<html>
  <head><title>Hey</title></head>
  <body>
    <h1>Test Page</h1>
    <p>Have a nice day!</p>
  </body>
</html>'''),
        # mix case
        ('html_xml_mixcase1', '''<?XML version="1.0" encoding="UTF-8"?>
<!doctype html>
<html>
  <head><title>Hey</title></head>
  <body>
    <h1>Test Page</h1>
    <p>Have a nice day!</p>
  </body>
</html>'''),

        ('html_xml_mixcase2', '''<?Xml version="1.0" encoding="UTF-8"?>
<!doctype html>
<html>
  <head><title>Hey</title></head>
  <body>
    <h1>Test Page</h1>
    <p>Have a nice day!</p>
  </body>
</html>'''),
        ]
        
        expected_html = normxml(html_ref)
        for ii, td in enumerate(testdata):
            label, html_input = td
            actual = _html_fromstring(html_input)
            actual_html = normxml(html.tostring(actual))
            self.assertEqual(expected_html, actual_html, '{} [{}]'.format(label, ii))
Example #16
0
    def test_resizeobject(self):
        from mobilize.filters import resizeobject
        testdata = [
            {'object_str' : '''<div class="foobar"><ul><li><object width="800" height="344">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US"/>
<param name="allowFullScreen" value="true"/>
<param name="allowscriptaccess" value="always"/>
<embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800" height="344"/>
</object></li></ul></div>''',
             'resized_str' : '''<div class="foobar"><ul><li><object width="280" height="120">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US">
<param name="allowFullScreen" value="true">
<param name="allowscriptaccess" value="always">
<embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="280" height="120"></embed>
</object></li></ul></div>''',
             },
            {'object_str' : '''<object width="800" height="344">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US">
<param name="allowFullScreen" value="true">
<param name="allowscriptaccess" value="always">
<embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800" height="344"></embed>
</object>''',
             'resized_str' : '''<object width="280" height="120">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US">
<param name="allowFullScreen" value="true">
<param name="allowscriptaccess" value="always">
<embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="280" height="120"></embed>
</object>''',
             },
            {'object_str' : '''<OBJECT width="800" height="344">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US"/>
<param name="allowFullScreen" value="true"/>
<param name="allowscriptaccess" value="always"/>
<EMBED src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800" height="344"/>
</OBJECT>''',
             'resized_str' : '''<object width="280" height="120">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US">
<param name="allowFullScreen" value="true">
<param name="allowscriptaccess" value="always">
<embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="280" height="120"></embed>
</object>''',
             },
            # If not height defined, or otherwise can't calculate aspect ratio, just ignore that attribute
            {'object_str' : '''<OBJECT width="800">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US"/>
<param name="allowFullScreen" value="true"/>
<param name="allowscriptaccess" value="always"/>
<EMBED src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800"/>
</OBJECT>''',
             'resized_str' : '''<object width="280">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US">
<param name="allowFullScreen" value="true">
<param name="allowscriptaccess" value="always">
<embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="280"></embed>
</object>''',
             },
            {'object_str' : '''<OBJECT>
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US"/>
<param name="allowFullScreen" value="true"/>
<param name="allowscriptaccess" value="always"/>
<EMBED src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true"/>
</OBJECT>''',
             'resized_str' : '''<object width="280">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US">
<param name="allowFullScreen" value="true">
<param name="allowscriptaccess" value="always">
<embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="280"></embed>
</object>''',
             },
            {'object_str' : '''<OBJECT width="800" height="beer">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US"/>
<param name="allowFullScreen" value="true"/>
<param name="allowscriptaccess" value="always"/>
<EMBED src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800" height="beer"/>
</OBJECT>''',
             'resized_str' : '''<object width="280">
<param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US">
<param name="allowFullScreen" value="true">
<param name="allowscriptaccess" value="always">
<embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="280"></embed>
</object>''',
             },
            {'object_str' : '''<p>Nothing to see here.</p>''',
             'resized_str' : '''<p>Nothing to see here.</p>''',
             },
            ]
        for ii, td in enumerate(testdata):
            object_elem = html.fragment_fromstring(td['object_str'], create_parent=False)
            resizeobject(object_elem)
            self.assertSequenceEqual(normxml(td['resized_str']), normxml(elem2str(object_elem)))
Example #17
0
    def test_table2divrows(self):
        testdata = [
            {'in_str' : '''<div><table>
      <tr>
        <td>Eggs</td>
        <td>Ham</td>
      </tr>
      <tr>
        <td>Beer</td>
        <td>Milk</td>
      </tr>
    </table></div>
''',
             'out_str' : '''<div>
  <div class="mwu-table2divrows">
    <div class="mwu-table2divrows-row0">
      <div class="mwu-table2divrows-row0-col0 mwu-table2divrows-col0">Eggs</div>
      <div class="mwu-table2divrows-row0-col1 mwu-table2divrows-col1">Ham</div>
    </div>
    <div class="mwu-table2divrows-row1">
      <div class="mwu-table2divrows-row1-col0 mwu-table2divrows-col0">Beer</div>
      <div class="mwu-table2divrows-row1-col1 mwu-table2divrows-col1">Milk</div>
    </div>
  </div>
</div>
''',
             },
            #================
            {'in_str' : '''<div><table><tbody>
      <tr>
        <td>Eggs</td>
        <td>Ham</td>
      </tr>
      <tr>
        <td>Beer</td>
        <td>Milk</td>
      </tr>
    </tbody></table></div>
''',
             'out_str' : '''<div>
  <div class="mwu-table2divrows">
    <div class="mwu-table2divrows-row0">
      <div class="mwu-table2divrows-row0-col0 mwu-table2divrows-col0">Eggs</div>
      <div class="mwu-table2divrows-row0-col1 mwu-table2divrows-col1">Ham</div>
    </div>
    <div class="mwu-table2divrows-row1">
      <div class="mwu-table2divrows-row1-col0 mwu-table2divrows-col0">Beer</div>
      <div class="mwu-table2divrows-row1-col1 mwu-table2divrows-col1">Milk</div>
    </div>
  </div>
</div>
''',
             },
            {'in_str' : '''<div><p>Nothing here.</p></div>''',
             'out_str' : '''<div><p>Nothing here.</p></div>''',
             },
            {'in_str' : '''<div><table>
      <tr>
        <td><table id="foobar"><tr><td>Whoa</td><td>dude</td></tr></table></td>
        <td>Key Lime Pie</td>
      </tr>
    </table></div>''',
             'out_str' : '''<div><div class="mwu-table2divrows">
<div class="mwu-table2divrows-row0">
    <div class="mwu-table2divrows-row0-col0 mwu-table2divrows-col0">
      <table id="foobar"><tr><td>Whoa</td><td>dude</td></tr></table>
    </div>
    <div class="mwu-table2divrows-row0-col1 mwu-table2divrows-col1">Key Lime Pie</div>
    </div>
</div>
</div>''',
             },
            {'in_str' : '''<div><table>
      <tr>
        <td>
Does html like this exist somewhere in the wild?
<table id="foobar"><tr><td>Whoa</td><td>dude</td></tr></table>
<p>yeah, I bet somewhere it does</p>
(probably on some website that gets 10K hits on a slow day)
<table id="foobar"><tr><td>Game</td><td>Over Man</td></tr></table>
here's some extra trailing text for you too
</td>
        <td>Key Lime Pie</td>
      </tr>
    </table></div>''',
             'out_str' : '''<div><div class="mwu-table2divrows">
  <div class="mwu-table2divrows-row0">
    <div class="mwu-table2divrows-row0-col0 mwu-table2divrows-col0">
Does html like this exist somewhere in the wild?
<table id="foobar"><tr><td>Whoa</td><td>dude</td></tr></table>
<p>yeah, I bet somewhere it does</p>
(probably on some website that gets 10K hits on a slow day)
<table id="foobar"><tr><td>Game</td><td>Over Man</td></tr></table>
here's some extra trailing text for you too
    </div>
    <div class="mwu-table2divrows-row0-col1 mwu-table2divrows-col1">Key Lime Pie</div>
    </div>
  </div>
</div>''',
             },
            {'in_str' : '''<table>
      <tr>
        <td>Eggs</td>
        <td>Ham</td>
      </tr>
      <tr>
        <td>Beer</td>
        <td>Milk</td>
      </tr>
    </table>
''',
             'out_str' : '''<div class="mwu-table2divrows">
  <div class="mwu-table2divrows-row0">
    <div class="mwu-table2divrows-row0-col0 mwu-table2divrows-col0">Eggs</div>
    <div class="mwu-table2divrows-row0-col1 mwu-table2divrows-col1">Ham</div>
  </div>
  <div class="mwu-table2divrows-row1">
    <div class="mwu-table2divrows-row1-col0 mwu-table2divrows-col0">Beer</div>
    <div class="mwu-table2divrows-row1-col1 mwu-table2divrows-col1">Milk</div>
  </div>
</div>
''',
             },
            ]
        from mobilize.filters import table2divrows
        for ii, td in enumerate(testdata):
            in_elem = html.fragment_fromstring(td['in_str'], create_parent=False)
            table2divrows(in_elem)
            self.assertSequenceEqual(normxml(td['out_str']), normxml(elem2str(in_elem)))
Example #18
0
    def test_table2divgroups(self):
        from mobilize.filters.tables import Spec
        ELEMSTR1 = '''<div id="some-container">
<table>
      <tbody>
        <tr>
          <td>CONTACT US</td>
          <td>&nbsp;</td>
          <td>&nbsp;</td>
          <td>&nbsp;</td>
        <tr>
          <td>123 Main Str</td>
          <td>&nbsp;</td>
          <td>OUR TEAM</td>
          <td>&nbsp;</td>
        <tr>
          <td>Springfield, IL</td>
          <td>&nbsp;</td>
          <td>Mike Smith</td>
          <td><img src="/mike-smith.jpg"/></td>
        <tr>
          <td>1-800-BUY-DUFF</td>
          <td>&nbsp;</td>
          <td>Jen Jones</td>
          <td><img src="/jen-jones.jpg"/></td>
        <tr>
          <td>&nbsp;</td>
          <td>&nbsp;</td>
          <td>Scruffy</td>
          <td><img src="/scruffy-the-dog.jpg"/></td>
        <tr>
      </tbody>
    </table>
</div>
'''
        testdata = [
            {'elem_str' : ELEMSTR1,
             'specmap' : [],
             'out_str' : '''
<div id="some-container">
  <div class="mwu-elem-table2divgroups">
  </div>
</div>
''',
             },
            {'elem_str' : ELEMSTR1,
             'specmap' : [
                    (Spec('idname1', 0, 0, 0, 0)),
                    ],
             'out_str' : '''
<div id="some-container">
  <div class="mwu-elem-table2divgroups">
    <div class="mwu-elem-table2divgroups-group" id="idname1">
      <div>CONTACT US</div>
    </div>
  </div>
</div>
''',
             },
            {'elem_str' : ELEMSTR1,
             'specmap' : [
                    (Spec('idname1', 0, 0, 3, 0)),
                    ],
             'out_str' : '''
<div id="some-container">
  <div class="mwu-elem-table2divgroups">
    <div class="mwu-elem-table2divgroups-group" id="idname1">
      <div>CONTACT US</div>
      <div>123 Main Str</div>
      <div>Springfield, IL</div>
      <div>1-800-BUY-DUFF</div>
    </div>
  </div>
</div>
''',
             },
            {'elem_str' : ELEMSTR1,
             'specmap' : [
                    (Spec('idname1', 0, 0, 0, 0)),
                    (Spec('idname2', 0, 0, 3, 0)),
                    ],
             'out_str' : '''
<div id="some-container">
  <div class="mwu-elem-table2divgroups">
    <div class="mwu-elem-table2divgroups-group" id="idname1">
      <div>CONTACT US</div>
    </div>
    <div class="mwu-elem-table2divgroups-group" id="idname2">
      <div>CONTACT US</div>
      <div>123 Main Str</div>
      <div>Springfield, IL</div>
      <div>1-800-BUY-DUFF</div>
    </div>
  </div>
</div>
''',
             },
            {'elem_str' : ELEMSTR1,
             'specmap' : [
                    (Spec('idname2', 0, 0, 3, 0)),
                    (Spec('idname1', 0, 0, 0, 0)),
                    ],
             'out_str' : '''
<div id="some-container">
  <div class="mwu-elem-table2divgroups">
    <div class="mwu-elem-table2divgroups-group" id="idname2">
      <div>CONTACT US</div>
      <div>123 Main Str</div>
      <div>Springfield, IL</div>
      <div>1-800-BUY-DUFF</div>
    </div>
    <div class="mwu-elem-table2divgroups-group" id="idname1">
      <div>CONTACT US</div>
    </div>
  </div>
</div>
''',
             },
            {'elem_str' : ELEMSTR1,
             'specmap' : [
                    (Spec('idname2', 0, 0, 3, 0)),
                    (Spec('idname1', 0, 0, 0, 0)),
                    ],
             'out_str' : '''
<div id="some-container">
  <div class="mwu-elem-table2divgroups">
    <div class="mwu-elem-table2divgroups-group" id="idname2">
      <div>CONTACT US</div>
      <div>123 Main Str</div>
      <div>Springfield, IL</div>
      <div>1-800-BUY-DUFF</div>
    </div>
    <div class="mwu-elem-table2divgroups-group" id="idname1">
      <div>CONTACT US</div>
    </div>
  </div>
</div>
''',
             },
            {'elem_str' : ELEMSTR1,
             'specmap' : [
                    (Spec('idname1', 0, 0, 4, 0)),
                    ],
             'out_str' : '''
<div id="some-container">
  <div class="mwu-elem-table2divgroups">
    <div class="mwu-elem-table2divgroups-group" id="idname1">
      <div>CONTACT US</div>
      <div>123 Main Str</div>
      <div>Springfield, IL</div>
      <div>1-800-BUY-DUFF</div>
    </div>
  </div>
</div>
''',
             },
            {'elem_str' : ELEMSTR1,
             'omit_whitespace' : False,
             'specmap' : [
                    (Spec('idname1', 0, 0, 4, 0)),
                    ],
             'out_str' : '''
<div id="some-container">
  <div class="mwu-elem-table2divgroups">
    <div class="mwu-elem-table2divgroups-group" id="idname1">
      <div>CONTACT US</div>
      <div>123 Main Str</div>
      <div>Springfield, IL</div>
      <div>1-800-BUY-DUFF</div>
      <div>&#160;</div>
    </div>
  </div>
</div>
''',
             },
            {'elem_str' : ELEMSTR1,
             'specmap' : [
                    (Spec('idname1', 1, 2, 4, 3)),
                    ],
             'out_str' : '''
<div id="some-container">
  <div class="mwu-elem-table2divgroups">
    <div class="mwu-elem-table2divgroups-group" id="idname1">
      <div>
        <div>OUR TEAM</div>
      </div>
      <div>
        <div>Mike Smith</div>
        <div><img src="/mike-smith.jpg"></div>
      </div>
      <div>
        <div>Jen Jones</div>
        <div><img src="/jen-jones.jpg"></div>
      </div>
      <div>
        <div>Scruffy</div>
        <div><img src="/scruffy-the-dog.jpg"></div>
      </div>
    </div>
  </div>
</div>
''',
             },
            
            {'elem_str' : '''<div>
<table>
<tr><td colspan="3">a</td></tr>
<tr>
  <td>b</td>
  <td>c</td>
  <td>d</td>
</tr>
</table>
''',
             'specmap' : [
                    (Spec('idname1', 0, 0, 1, 1)),
                    ],
             'out_str' : '''
<div>
  <div class="mwu-elem-table2divgroups">
    <div class="mwu-elem-table2divgroups-group" id="idname1">
      <div><div>a</div></div>
      <div>
        <div>b</div>
        <div>c</div>
      </div>
    </div>
  </div>
</div>
''',
             },
            ]
        from mobilize.filters import table2divgroups
        for ii, td in enumerate(testdata):
            omit_whitespace = td.get('omit_whitespace', True)
            elem = html.fromstring(td['elem_str'])
            table2divgroups(elem, td['specmap'], omit_whitespace=omit_whitespace)
            expected = normxml(td['out_str'])
            actual = normxml(elem2str(elem))
            self.assertSequenceEqual(expected, actual)
Example #19
0
    def test_omit(self):
        from lxml import html
        from mobilize.filters import omit
        ELEMSTR1 = '''<div class="foo">
<div id="child1">Child Numero Uno</div>
<p id="child2">Child Numero Dos</p>
<div id="child3">Child Numero Tres</div>
</div>
'''
        testdata = [
            {'elem_str' : ELEMSTR1,
             'xpaths' : None,
             'csspaths' : ['div#child1'],
             'out_str' : '''<div class="foo">
<p id="child2">Child Numero Dos</p>
<div id="child3">Child Numero Tres</div>
</div>''',
             },
            {'elem_str' : ELEMSTR1,
             'xpaths' : None,
             'csspaths' : ['div#child2'],
             'out_str' : '''<div class="foo">
<div id="child1">Child Numero Uno</div>
<p id="child2">Child Numero Dos</p>
<div id="child3">Child Numero Tres</div>
</div>''',
             },
            {'elem_str' : ELEMSTR1,
             'xpaths' : None,
             'csspaths' : ['p#child2'],
             'out_str' : '''<div class="foo">
<div id="child1">Child Numero Uno</div>
<div id="child3">Child Numero Tres</div>
</div>''',
             },
            {'elem_str' : ELEMSTR1,
             'xpaths' : None,
             'csspaths' : ['p'],
             'out_str' : '''<div class="foo">
<div id="child1">Child Numero Uno</div>
<div id="child3">Child Numero Tres</div>
</div>''',
             },
            {'elem_str' : ELEMSTR1,
             'xpaths' : None,
             'csspaths' : ['p#child2', 'div#child3'],
             'out_str' : '''<div class="foo">
<div id="child1">Child Numero Uno</div>
</div>''',
             },
            {'elem_str' : ELEMSTR1,
             'xpaths' : None,
             'csspaths' : ['div#child3', 'p#child2'],
             'out_str' : '''<div class="foo">
<div id="child1">Child Numero Uno</div>
</div>''',
             },
            {'elem_str' : ELEMSTR1,
             'xpaths' : ['./p'],
             'csspaths' : None,
             'out_str' : '''<div class="foo">
<div id="child1">Child Numero Uno</div>
<div id="child3">Child Numero Tres</div>
</div>''',
             },
            {'elem_str' : ELEMSTR1,
             'xpaths' : ['.//p'],
             'csspaths' : None,
             'out_str' : '''<div class="foo">
<div id="child1">Child Numero Uno</div>
<div id="child3">Child Numero Tres</div>
</div>''',
             },
            {'elem_str' : ELEMSTR1,
             'xpaths' : ['./div'],
             'csspaths' : None,
             'out_str' : '''<div class="foo">
<p id="child2">Child Numero Dos</p>
</div>''',
             },
            {'elem_str' : ELEMSTR1,
             'xpaths' : ['./p'],
             'csspaths' : ['div#child3'],
             'out_str' : '''<div class="foo">
<div id="child1">Child Numero Uno</div>
</div>''',
             },
            ]
        for ii, td in enumerate(testdata):
            elem = html.fromstring(td['elem_str'])
            omit(elem, xpaths=td['xpaths'], csspaths=td['csspaths'])
            expected = normxml(td['out_str'])
            actual = normxml(html.tostring(elem))
            self.assertSequenceEqual(expected, actual)

        # check that an arg is required
        testelem = html.fromstring(ELEMSTR1)
        self.assertRaises(AssertionError, omit, testelem)
        self.assertRaises(AssertionError, omit, testelem, [], [])