def test_absimgsrc(self):
    '''
    Check absimgsrc() on a fragment mixing several kinds of img src.

    Per the expected markup, the "/fruitpics/..." and "standard/..."
    sources come back as absolute http://desktop.example.com/ URLs, while
    the already-absolute URL, the "/_mwu/..." path, both data: URLs and
    the empty src are expected to come back unchanged.
    '''
    from mobilize.filters.misc import absimgsrc

    desktop_url = 'http://desktop.example.com/articles/delicious.html'

    # Pieces shared verbatim by the input and the expected output.
    head = '''<div> <p>Hi there.</p> <img src="http://foo.example.com/path/to/bananas.jpg" alt="yellow fruit" width="23" height="480"> <img src="/_mwu/bananatree.jpg" alt="where they come from"> <p>Here's some more.</p> '''
    # The GIF src has no closing quote in the fixture; kept exactly as is.
    gif_img = '''<img src="data:image/gif;base64,R0lGODlhQgEDAJEAANTW2Pr06rO8yAAAACH5BAAAAAAALAAAAABCAQMAAAIilI+0Po5y02ouz3lyDDobiSJbmiXZA8KXuC8fyTDdrApy+QA7 alt="GIF data URL"/> '''
    png_attrs = '''src="DATA:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAB4AAAAkCAMAAACpD3pbAAAAYFBMVEVZbYftuWNoP0iYbFJDHkBcNEWAVk3JmFzVo17hrmG8jVmQk5+8sJOwgldse4xPKUN0S0r3yXdLJULcwpI4FD75x2f5yniSmJ07Fz/WpF6MYU/wyoShn5XTvZT84bL5xGZPqaqJAAAAdUlEQVQ4y+XTRw7EQAgEQJg8jmtvdOz//9IvaE57c19LQoBAvjuNiMgJmkdrMl69yXjbfNyFtfMN5zo4P0RlPCYAzjN+KoDsGIcKoKGcRkAjLb78JhcS7TyuJRc6WIrWWrqgFodsLrXc95j+yq3Jm/X+n7mXC9defIzz7p9PAAAAAElFTkSuQmCC" alt="PNG data URL"/> '''
    tail = '''<img src="" alt="Pathological HTML!"> </p> </div>'''

    source_markup = ''.join([
        head,
        '''<img src="/fruitpics/strawberry.jpg" alt="berry good"> ''',
        '''<p>and then: ''',
        '''<img src="standard/nrolling-kiwi.gif" alt="delicious but takes time to peel"> ''',
        gif_img,
        '''<img ''',
        png_attrs,
        tail,
    ])
    expected_markup = ''.join([
        head,
        '''<img src="http://desktop.example.com/fruitpics/strawberry.jpg" alt="berry good"> ''',
        '''<p>and then: ''',
        '''<img src="http://desktop.example.com/articles/standard/nrolling-kiwi.gif" alt="delicious but takes time to peel"> ''',
        gif_img,
        # The expected fixture has a line break between the tag name and
        # its src attribute here.
        '''<img \n''',
        png_attrs,
        tail,
    ])

    root = html.fromstring(source_markup)
    absimgsrc(root, desktop_url)
    result = elem2str(root)
    self.assertSequenceEqual(normxml(expected_markup), normxml(result))
def test_GoogleAnalytics_none(self):
    '''
    Negative case: a page where we expect to find no GA tracking codes.

    The component is run over the cnn.html fixture and its rendered HTML
    is expected to be just the empty "mwu-elem-ga" wrapper div.
    '''
    from mobilize.components import GoogleAnalytics

    # Use a context manager so the fixture's file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(data_file_path('whole-html', 'cnn.html')) as fixture:
        doc_str = fixture.read()
    doc = html.fromstring(doc_str)

    noga = GoogleAnalytics()
    noga.extract(doc)
    noga.process()

    actual = normxml(noga.html())
    expected = normxml('''<div class="mwu-elem" id="mwu-elem-ga"></div>''')
    self.assertSequenceEqual(expected, actual)
def test_noattribs(self):
    '''
    Run noattribs() over a bare table and over a table wrapped in a div.

    Each case gives the tag names and attribute names handed to the
    filter, plus the markup expected afterwards.  The first two cases
    target only 'table' and expect the td width to remain; the last also
    targets 'td' and expects that width to be gone as well.
    '''
    from mobilize.filters import noattribs

    bare_table = '''<table width="600" style="color: fuscia;"> <tr><td width="200">one</td><td>two</td></tr> <tr><td>three</td><td>four</td></tr> </table> '''
    wrapped_table = '''<div> <table width="600" style="color: fuscia;"> <tr><td width="200">one</td><td>two</td></tr> <tr><td>three</td><td>four</td></tr> </table> </div> '''
    testdata = [
        {
            'in_str': bare_table,
            'tags': ['table'],
            'attribs': ['width', 'style'],
            'out_str': '''<table> <tr><td width="200">one</td><td>two</td></tr> <tr><td>three</td><td>four</td></tr> </table> ''',
        },
        {
            'in_str': wrapped_table,
            'tags': ['table'],
            'attribs': ['width', 'style'],
            'out_str': '''<div> <table> <tr><td width="200">one</td><td>two</td></tr> <tr><td>three</td><td>four</td></tr> </table> </div> ''',
        },
        {
            'in_str': wrapped_table,
            'tags': ['table', 'td'],
            'attribs': ['width', 'style'],
            'out_str': '''<div> <table> <tr><td>one</td><td>two</td></tr> <tr><td>three</td><td>four</td></tr> </table> </div> ''',
        },
    ]

    for case in testdata:
        fragment = html.fragment_fromstring(case['in_str'], create_parent=False)
        noattribs(fragment, case['tags'], case['attribs'])
        want = normxml(case['out_str'])
        got = normxml(elem2str(fragment))
        self.assertSequenceEqual(want, got)
def test_collapse(self):
    '''
    Test for collapsing filter application mode

    The same post-filter is run under both filter modes.  With
    FILT_EACHELEM the expected output has class="foo" on every matched
    anchor; with FILT_COLLAPSED the expected output instead has
    id="child-N" on each anchor and no class.
    '''
    from mobilize.components import (
        XPath,
        FILT_EACHELEM,
        FILT_COLLAPSED,
        )

    def testfilter(elem):
        # Tags the element itself when it is an anchor...
        if elem.tag == 'a':
            elem.attrib['class'] = 'foo'
        # ...and numbers any anchors found among its direct children.
        for ii, child in enumerate(elem):
            if 'a' == child.tag:
                child.attrib['id'] = 'child-%d' % ii

    htmlstr1 = '''<a href="/">a</a> <a href="/">b</a> <a href="/">c</a> '''

    # Filter applied to each matched element.
    nocollapse = XPath('//a', postfilters=[testfilter], filtermode=FILT_EACHELEM)
    nocollapse.extract(html.fromstring(htmlstr1))
    actual = nocollapse.process('idname')
    actual_str = html.tostring(actual)
    expected_str = '''<div class="mwu-elem" id="idname"> <a href="/" class="foo">a</a> <a href="/" class="foo">b</a> <a href="/" class="foo">c</a> </div> '''
    self.assertSequenceEqual(normxml(expected_str), normxml(actual_str))

    # Filter applied in collapsed mode.  (An earlier, never-used
    # expected_str assignment that was overwritten before any assertion
    # has been dropped.)
    collapse = XPath('//a', postfilters=[testfilter], filtermode=FILT_COLLAPSED)
    collapse.extract(html.fromstring(htmlstr1))
    actual = collapse.process('idname')
    actual_str = html.tostring(actual)
    expected_str = '''<div class="mwu-elem" id="idname"> <a href="/" id="child-0">a</a> <a href="/" id="child-1">b</a> <a href="/" id="child-2">c</a> </div> '''
    self.assertSequenceEqual(normxml(expected_str), normxml(actual_str))
def test_extract_csspath(self):
    '''
    Extract CssPath components from the extract_celems fixture files.

    Each case names a fixture file, the components to run against it and
    the HTML each component is expected to render after being processed
    with the id 'some-id'.
    '''
    from mobilize.components import CssPath

    testdata = [
        {
            'datafile': 'a.xml',
            'components': [CssPath('div#happy', classvalue='some-class')],
            'extracted': ['<div class="some-class" id="some-id"><div id="happy">lucky</div></div>'],
        },
        {
            'datafile': 'b.xml',
            'components': [CssPath('div#joyful', classvalue='some-class')],
            'extracted': ['<div class="some-class" id="some-id"><div id="joyful">fun</div></div>'],
        },
        {
            'datafile': 'c.xml',
            'components': [CssPath('p.graceful', classvalue='some-class')],
            'extracted': ['<div class="some-class" id="some-id"><p class="graceful">laughing</p></div>'],
        },
        {
            'datafile': 'd.xml',
            'components': [CssPath('p.graceful', classvalue='some-class')],
            'extracted': ['<div class="some-class" id="some-id"><p class="skipping graceful enthusiastic">laughing</p></div>'],
        },
        {
            'datafile': 'e.xml',
            'components': [CssPath('p.graceful', classvalue='some-class')],
            'extracted': ['<div class="some-class" id="some-id"><p class="skipping graceful enthusiastic">laughing</p><p class="graceful">enthusiastic</p></div>'],
        },
    ]

    for ii, td in enumerate(testdata):
        # Close every fixture file promptly rather than leaking one
        # open handle per iteration.
        with open(data_file_path('extract_celems', td['datafile'])) as fixture:
            doc = html.fromstring(fixture.read())
        for sel in td['components']:
            sel.extract(doc)
            sel.process('some-id')
        expected = list(map(normxml, td['extracted']))
        actual = [normxml(sel.html()) for sel in td['components']]
        # The message identifies which fixture failed.
        msg = 'e: %s, a: %s [%d %s]' % (expected, actual, ii, td['datafile'])
        self.assertEqual(expected, actual, msg)
def test_relhyperlinks(self):
    '''
    Compare relhyperlinks() and relhyperlinks_full() on the same markup.

    relhyperlinks() is called with the single domain 'alpha.com'; its
    expected output turns only the http://alpha.com links into
    root-relative paths.  relhyperlinks_full() is called with both
    alpha.com host names and both http and https; its expected output
    also rewrites the www and https links.  The beta.com link stays
    absolute in both.
    '''
    from mobilize.filters import (
        relhyperlinks,
        relhyperlinks_full,
        )

    htmlA = ''' <div> <ul> <li><a href="http://alpha.com">Alpha home page</a></li> <li><a href="http://www.alpha.com">Alt Alpha home page</a></li> <li><a href="http://beta.com">Beta home page</a></li> </ul> <p>The beautiful <a href="/about/birds/cranes">white cranes</a> of <a href="http://alpha.com/places/Lancashire">Lancashire</a> drink surprising amounts of <a href="https://alpha.com/secure/about/drinks/coffee">coffee</a>.</p> </div> '''

    # Single-domain variant.
    simple_root = html.fromstring(htmlA)
    relhyperlinks(simple_root, 'alpha.com')
    simple_actual = html.tostring(simple_root)
    simple_expected = ''' <div> <ul> <li><a href="/">Alpha home page</a></li> <li><a href="http://www.alpha.com">Alt Alpha home page</a></li> <li><a href="http://beta.com">Beta home page</a></li> </ul> <p>The beautiful <a href="/about/birds/cranes">white cranes</a> of <a href="/places/Lancashire">Lancashire</a> drink surprising amounts of <a href="https://alpha.com/secure/about/drinks/coffee">coffee</a>.</p> </div> '''
    self.assertSequenceEqual(normxml(simple_expected), normxml(simple_actual))

    # Multi-domain, multi-protocol variant.
    full_root = html.fromstring(htmlA)
    relhyperlinks_full(full_root, ['alpha.com', 'www.alpha.com'], ['http', 'https'])
    full_actual = html.tostring(full_root)
    full_expected = ''' <div> <ul> <li><a href="/">Alpha home page</a></li> <li><a href="/">Alt Alpha home page</a></li> <li><a href="http://beta.com">Beta home page</a></li> </ul> <p>The beautiful <a href="/about/birds/cranes">white cranes</a> of <a href="/places/Lancashire">Lancashire</a> drink surprising amounts of <a href="/secure/about/drinks/coffee">coffee</a>.</p> </div> '''
    self.assertSequenceEqual(normxml(full_expected), normxml(full_actual))
def test_style(self):
    '''
    Check that the style passed to the component is set on the rendered
    element, and that the rendered element's first child is the source.
    '''
    css_text = 'background-color: red; font-size: large;'
    sourcestr = '''<ul> <li>Dre</li> <li>Snoop</li> <li>Thug Life</li> </ul>'''

    component = DirectExtracted('', style=css_text)
    component._sourcestr = sourcestr
    component.extract(html.fromstring(sourcestr))
    component.process('foo')
    wrapper = component.elem

    # The wrapper's first child should serialise back to the source string.
    first_child = wrapper[0]
    self.assertSequenceEqual(
        normxml(sourcestr),
        normxml(html.tostring(first_child)),
    )

    # The wrapper itself should carry the requested style attribute.
    self.assertEqual(css_text, wrapper.attrib['style'])
def test_select_multiple(self):
    '''
    Test that extracted components can accept multiple selectors

    The same two-selector list is handed first to CssPath and then to
    XPath; both are expected to produce the nav and section elements
    inside one wrapper, without the layout table that sits between them.
    '''
    from mobilize.components import CssPath, XPath

    selectors = [
        'nav',
        'section',
        ]
    src_html = '''<div> <nav> <a href="/A">A</a> <a href="/B">B</a> </nav> <table><tr><td> </td><td>I'm using tables for layout!!! DUR</td></tr></table> <section> <p>Hello.</p> </section> </div> '''
    expected_html = '''<div class="mwu-elem" id="foo"> <nav> <a href="/A">A</a> <a href="/B">B</a> </nav> <section> <p>Hello.</p> </section> </div>'''

    # CssPath is exercised first, then XPath, with identical arguments.
    for component_class in (CssPath, XPath):
        component = component_class(selectors, idname='foo')
        component.extract(html.fromstring(src_html))
        extracted = component.process()
        extracted_str = html.tostring(extracted)
        self.assertSequenceEqual(normxml(expected_html), normxml(extracted_str))
def test_squeezebr(self):
    '''
    Run squeezebr() over fragments with zero, one or many <br> tags.

    In the expected output every run of adjacent <br> tags (including a
    run separated only by whitespace) appears as a single <br>, while
    <br> tags separated by text are all kept.
    '''
    from mobilize.filters import squeezebr

    testdata = [
        {
            'in_str': '''<p>Hi.</p>''',
            'out_str': '''<p>Hi.</p>''',
        },
        {
            'in_str': '''<p>Hi.<br>Hey.</p>''',
            'out_str': '''<p>Hi.<br>Hey.</p>''',
        },
        {
            'in_str': '''<p>Hi.<br><br>Hey.</p>''',
            'out_str': '''<p>Hi.<br>Hey.</p>''',
        },
        {
            'in_str': '''<p>Hi.<br/><br/><br/><br/><br/><br/><br/><br/><br/>Hey.</p>''',
            'out_str': '''<p>Hi.<br>Hey.</p>''',
        },
        {
            'in_str': '''<div> <p>Hi.<br><br>Hey.</p> <p>This is some more text <br><br><br><br><br><img src="foo.png" alt="foo"/> </p> </div>''',
            'out_str': '''<div> <p>Hi.<br>Hey.</p> <p>This is some more text <br><img src="foo.png" alt="foo"> </p> </div> ''',
        },
        {
            'in_str': '''<p>Hi.<br> <br>Hey.</p>''',
            'out_str': '''<p>Hi.<br>Hey.</p>''',
        },
        {
            'in_str': '''<p>Hi.<br>How.<br>Hey.</p>''',
            'out_str': '''<p>Hi.<br>How.<br>Hey.</p>''',
        },
    ]

    for case in testdata:
        fragment = html.fragment_fromstring(case['in_str'], create_parent=False)
        squeezebr(fragment)
        want = normxml(case['out_str'])
        got = normxml(elem2str(fragment))
        self.assertSequenceEqual(want, got)
def test_abslinkfilesrc(self):
    '''
    Check abslinkfilesrc() with the extensions '.xls' and '.pdf'.

    In the expected markup the .xls and .pdf hrefs are absolute
    http://example.com/ URLs, while the .doc link and the empty href are
    unchanged.
    '''
    from mobilize.filters import abslinkfilesrc

    desktop_url = 'http://example.com/about/papers.html'
    extensions = ['.xls', '.pdf']
    markup_in = '''<div> <p><a href="marketstudy.xls">Market Study</a></p> <p><a href="/whitepapers/fill-in-blank.doc">Make your own white paper!</a></p> <p><a href="/whitepapers/widgets.pdf">Widget White Paper</a></p> <p><a href="">HTML Pathology 101</a></p> </div>'''
    markup_out = '''<div> <p><a href="http://example.com/about/marketstudy.xls">Market Study</a></p> <p><a href="/whitepapers/fill-in-blank.doc">Make your own white paper!</a></p> <p><a href="http://example.com/whitepapers/widgets.pdf">Widget White Paper</a></p> <p><a href="">HTML Pathology 101</a></p> </div>'''

    root = html.fromstring(markup_in)
    abslinkfilesrc(root, desktop_url, extensions)
    result = html.tostring(root)
    self.assertSequenceEqual(normxml(markup_out), normxml(result))
def test_resizeiframe(self):
    '''
    Check resizeiframe() on a nested iframe, a bare iframe and no iframe.

    The 533x330 iframes are expected to come back as 280x173; markup
    without an iframe is expected to come back unchanged.
    '''
    from mobilize.filters import resizeiframe

    testdata = [
        {
            'iframe_str': '''<p> <iframe width="533" height="330" frameborder="0" allowfullscreen="" src="http://www.youtube.com/embed/HE6uqPPrVfo" title="YouTube video player"></iframe> </p>''',
            'resized_str': '''<p> <iframe width="280" height="173" frameborder="0" allowfullscreen="" src="http://www.youtube.com/embed/HE6uqPPrVfo" title="YouTube video player"></iframe> </p>''',
        },
        {
            'iframe_str': '''<iframe width="533" height="330" frameborder="0" allowfullscreen="" src="http://www.youtube.com/embed/HE6uqPPrVfo" title="YouTube video player"></iframe>''',
            'resized_str': '''<iframe width="280" height="173" frameborder="0" allowfullscreen="" src="http://www.youtube.com/embed/HE6uqPPrVfo" title="YouTube video player"></iframe>''',
        },
        {
            'iframe_str': '''<p>Nothing to see here.</p>''',
            'resized_str': '''<p>Nothing to see here.</p>''',
        },
    ]

    for case in testdata:
        fragment = html.fragment_fromstring(case['iframe_str'], create_parent=False)
        resizeiframe(fragment)
        self.assertSequenceEqual(
            normxml(case['resized_str']),
            normxml(elem2str(fragment)),
        )
def test_innerhtml(self):
    '''
    Check the innerhtml flag of XPath components.

    With innerhtml=False the matched <td> is expected inside the wrapper;
    with innerhtml=True and a single match only the td's content is
    expected; with innerhtml=True and two matches the flag is expected to
    have no effect, so both <td> elements appear.
    '''
    from mobilize.components import XPath

    single_cell = '''<table><tr><td>Hello</td></tr></table>'''
    two_cells = ''' <table><tr> <td>Hello</td> <td>Goodbye</td> </tr></table> '''

    # (innerhtml flag, source markup, expected rendering)
    cases = [
        # test for innerhtml=False
        (False, single_cell,
         '<div class="mwu-elem" id="foo"><td>Hello</td></div>'),
        # test for innerhtml=True
        (True, single_cell,
         '<div class="mwu-elem" id="foo">Hello</div>'),
        # test for ineffectiveness of innerhtml=True with multiple
        # matching elements
        (True, two_cells,
         '<div class="mwu-elem" id="foo"><td>Hello</td><td>Goodbye</td></div>'),
    ]

    for flag, source, expected in cases:
        component = XPath('//td', idname='foo', innerhtml=flag)
        component.extract(html.fromstring(source))
        extracted = component.process()
        extracted_str = html.tostring(extracted)
        self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
def test_formcontroltypes(self):
    '''
    Check formcontroltypes() on a form with text, email and radio inputs.

    Every input in the expected markup carries an extra
    "mwu-fc-input-<type>" class; inputs that already had a class keep it,
    with the new class following it.
    '''
    from mobilize.filters import formcontroltypes

    form_in = '''<form> <dl> <dt>Name</dt> <dd><input type="text" name="name"/></dd> <dt>Email</dt> <dd><input type="email" name="email"/></dd> <dt>Favorite color</dt> <dd> <ul> <li><input type="radio" name="color" value="red" class="nonstandard"/>Red</li> <li><input type="radio" name="color" value="blue" class="nonstandard"/>Blue</li> <li><input type="radio" name="color" value="green" class="nonstandard"/>Green</li> </ul> </dd> </dl> </form>'''
    form_out = '''<form> <dl> <dt>Name</dt> <dd><input type="text" name="name" class="mwu-fc-input-text"/></dd> <dt>Email</dt> <dd><input type="email" name="email" class="mwu-fc-input-email"/></dd> <dt>Favorite color</dt> <dd> <ul> <li><input type="radio" name="color" value="red" class="nonstandard mwu-fc-input-radio"/>Red</li> <li><input type="radio" name="color" value="blue" class="nonstandard mwu-fc-input-radio"/>Blue</li> <li><input type="radio" name="color" value="green" class="nonstandard mwu-fc-input-radio"/>Green</li> </ul> </dd> </dl> </form>'''

    form_root = html.fromstring(form_in)
    formcontroltypes(form_root)
    rendered = html.tostring(form_root)
    self.assertSequenceEqual(normxml(form_out), normxml(rendered))
def test_formaction(self):
    '''
    Check formaction() with and without an explicit URL prefix.

    Without a prefix, the absolute action is expected to become "/foo/"
    and an action that is already "/foo/" is expected to stay put.  With
    the prefix 'https://mobilewebup.com/', the expected action is that
    prefix followed by "foo/".
    '''
    from mobilize.filters import formaction

    testdata = [
        {
            'form_html_in': '''<div><form action="http://example.com/foo/" ><input type="text" name="bar"></form></div>''',
            'form_html_out': '''<div><form action="/foo/" ><input type="text" name="bar"></form></div>''',
        },
        {
            'form_html_in': '''<div><form action="/foo/" ><input type="text" name="bar"></form></div>''',
            'form_html_out': '''<div><form action="/foo/" ><input type="text" name="bar"></form></div>''',
        },
        {
            'form_html_in': '''<div><form action="http://example.com/foo/" ><input type="text" name="bar"></form></div>''',
            'urlprefix': 'https://mobilewebup.com/',
            'form_html_out': '''<div><form action="https://mobilewebup.com/foo/" ><input type="text" name="bar"></form></div>''',
        },
    ]

    for case in testdata:
        root = html.fromstring(case['form_html_in'])
        # Pass the prefix positionally only when the case supplies one,
        # so the filter's own default is used otherwise.
        prefix_args = (case['urlprefix'],) if 'urlprefix' in case else ()
        formaction(root, *prefix_args)
        want = normxml(case['form_html_out'])
        got = normxml(html.tostring(root))
        self.assertSequenceEqual(want, got)
def test__html_fromstring(self):
    '''
    Check _html_fromstring() on variants of one small document.

    The variants differ in leading whitespace, the presence of a doctype,
    and the presence and letter case of an XML declaration.  Every one of
    them is expected to serialise to the same normalised markup as the
    plain reference document.
    '''
    from mobilize.handlers import _html_fromstring

    html_ref = '''<!doctype html> <html> <head><title>Hey</title></head> <body> <h1>Test Page</h1> <p>Have a nice day!</p> </body> </html>'''
    html_xml_encoding1 = '''<?xml version="1.0" encoding="UTF-8"?> <!doctype html> <html> <head><title>Hey</title></head> <body> <h1>Test Page</h1> <p>Have a nice day!</p> </body> </html>'''

    # (label, input markup) pairs; the label only appears in failure output.
    testdata = [
        ('html_ref', html_ref),
        # with leading newlines
        ('html_plain1', ''' <!doctype html> <html> <head><title>Hey</title></head> <body> <h1>Test Page</h1> <p>Have a nice day!</p> </body> </html>'''),
        # no doctype
        ('html_plain2', '''<html> <head><title>Hey</title></head> <body> <h1>Test Page</h1> <p>Have a nice day!</p> </body> </html>'''),
        # With XML encoding
        ('html_xml_encoding1', html_xml_encoding1),
        # With XML encoding, but with leading newline thrown in for good measure
        ('html_xml_encoding2', ''' <?xml version="1.0" encoding="UTF-8"?> <!doctype html> <html> <head><title>Hey</title></head> <body> <h1>Test Page</h1> <p>Have a nice day!</p> </body> </html>'''),
        # With XML encoding, but with a truly disturbing number of
        # leading newlines.  IT WILL HAPPEN
        ('html_xml_encoding3', '\n' * 1024 + html_xml_encoding1),
        # With XML encoding and generous newlines interspersed
        ('html_xml_encoding4', ''' <?xml version="1.0" encoding="UTF-8"?> <!doctype html> <html> <head><title>Hey</title></head> <body> <h1>Test Page</h1> <p>Have a nice day!</p> </body> </html>'''),
        # mix case
        ('html_xml_mixcase1', '''<?XML version="1.0" encoding="UTF-8"?> <!doctype html> <html> <head><title>Hey</title></head> <body> <h1>Test Page</h1> <p>Have a nice day!</p> </body> </html>'''),
        ('html_xml_mixcase2', '''<?Xml version="1.0" encoding="UTF-8"?> <!doctype html> <html> <head><title>Hey</title></head> <body> <h1>Test Page</h1> <p>Have a nice day!</p> </body> </html>'''),
    ]

    expected_html = normxml(html_ref)
    for ii, (label, html_input) in enumerate(testdata):
        parsed = _html_fromstring(html_input)
        actual_html = normxml(html.tostring(parsed))
        self.assertEqual(expected_html, actual_html, '{} [{}]'.format(label, ii))
def test_resizeobject(self):
    '''
    Check resizeobject() on <object>/<embed> markup in several shapes.

    Cases cover a nested object, a bare object, upper-case tags, a
    missing height, missing width and height, a non-numeric height, and
    markup with no object at all.  Where both dimensions are numeric the
    expected size is 280x120; otherwise the expected markup has only
    width="280"; markup without an object is expected back unchanged.
    '''
    from mobilize.filters import resizeobject

    # Expected strings shared by several cases.
    resized_both = '''<object width="280" height="120"> <param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US"> <param name="allowFullScreen" value="true"> <param name="allowscriptaccess" value="always"> <embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="280" height="120"></embed> </object>'''
    resized_width_only = '''<object width="280"> <param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US"> <param name="allowFullScreen" value="true"> <param name="allowscriptaccess" value="always"> <embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="280"></embed> </object>'''

    testdata = [
        {
            'object_str': '''<div class="foobar"><ul><li><object width="800" height="344"> <param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US"/> <param name="allowFullScreen" value="true"/> <param name="allowscriptaccess" value="always"/> <embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800" height="344"/> </object></li></ul></div>''',
            'resized_str': '''<div class="foobar"><ul><li><object width="280" height="120"> <param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US"> <param name="allowFullScreen" value="true"> <param name="allowscriptaccess" value="always"> <embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="280" height="120"></embed> </object></li></ul></div>''',
        },
        {
            'object_str': '''<object width="800" height="344"> <param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US"> <param name="allowFullScreen" value="true"> <param name="allowscriptaccess" value="always"> <embed src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800" height="344"></embed> </object>''',
            'resized_str': resized_both,
        },
        {
            'object_str': '''<OBJECT width="800" height="344"> <param name="movie" \nvalue="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US"/> <param name="allowFullScreen" value="true"/> <param name="allowscriptaccess" value="always"/> <EMBED src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800" height="344"/> </OBJECT>''',
            'resized_str': resized_both,
        },
        # If not height defined, or otherwise can't calculate aspect
        # ratio, just ignore that attribute
        {
            'object_str': '''<OBJECT width="800"> <param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US"/> <param name="allowFullScreen" value="true"/> <param name="allowscriptaccess" value="always"/> <EMBED src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800"/> </OBJECT>''',
            'resized_str': resized_width_only,
        },
        {
            'object_str': '''<OBJECT> <param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US"/> <param name="allowFullScreen" value="true"/> <param name="allowscriptaccess" value="always"/> <EMBED src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" \nallowfullscreen="true"/> </OBJECT>''',
            'resized_str': resized_width_only,
        },
        {
            'object_str': '''<OBJECT width="800" height="beer"> <param name="movie" value="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US"/> <param name="allowFullScreen" value="true"/> <param name="allowscriptaccess" value="always"/> <EMBED src="http://www.youtube.com/v/fJ8FGIQG8gM?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="800" height="beer"/> </OBJECT>''',
            'resized_str': resized_width_only,
        },
        {
            'object_str': '''<p>Nothing to see here.</p>''',
            'resized_str': '''<p>Nothing to see here.</p>''',
        },
    ]

    for case in testdata:
        fragment = html.fragment_fromstring(case['object_str'], create_parent=False)
        resizeobject(fragment)
        self.assertSequenceEqual(
            normxml(case['resized_str']),
            normxml(elem2str(fragment)),
        )
def test_table2divrows(self):
    '''
    Check table2divrows() on tables in several shapes.

    Cases: a plain table inside a div, the same table with an explicit
    <tbody>, markup with no table, a table whose cell holds a nested
    table, a cell mixing text with nested tables, and a table that is
    itself the root element.  The expected output replaces the outer
    table with "mwu-table2divrows" divs whose rows and cells carry
    row/column index classes; nested tables are expected to stay tables.
    '''
    from mobilize.filters import table2divrows

    # Expected output shared by the first two cases.
    eggs_ham_divs = '''<div> <div class="mwu-table2divrows"> <div class="mwu-table2divrows-row0"> <div class="mwu-table2divrows-row0-col0 mwu-table2divrows-col0">Eggs</div> <div class="mwu-table2divrows-row0-col1 mwu-table2divrows-col1">Ham</div> </div> <div class="mwu-table2divrows-row1"> <div class="mwu-table2divrows-row1-col0 mwu-table2divrows-col0">Beer</div> <div class="mwu-table2divrows-row1-col1 mwu-table2divrows-col1">Milk</div> </div> </div> </div> '''

    testdata = [
        {
            'in_str': '''<div><table> <tr> <td>Eggs</td> <td>Ham</td> </tr> <tr> <td>Beer</td> <td>Milk</td> </tr> </table></div> ''',
            'out_str': eggs_ham_divs,
        },
        # Same table, but with an explicit tbody.
        {
            'in_str': '''<div><table><tbody> <tr> <td>Eggs</td> <td>Ham</td> </tr> <tr> <td>Beer</td> <td>Milk</td> </tr> </tbody></table></div> ''',
            'out_str': eggs_ham_divs,
        },
        {
            'in_str': '''<div><p>Nothing here.</p></div>''',
            'out_str': '''<div><p>Nothing here.</p></div>''',
        },
        {
            'in_str': '''<div><table> <tr> <td><table id="foobar"><tr><td>Whoa</td><td>dude</td></tr></table></td> <td>Key Lime Pie</td> </tr> </table></div>''',
            'out_str': '''<div><div class="mwu-table2divrows"> <div class="mwu-table2divrows-row0"> <div class="mwu-table2divrows-row0-col0 mwu-table2divrows-col0"> <table id="foobar"><tr><td>Whoa</td><td>dude</td></tr></table> </div> <div class="mwu-table2divrows-row0-col1 mwu-table2divrows-col1">Key Lime Pie</div> </div> </div> </div>''',
        },
        {
            'in_str': '''<div><table> <tr> <td> Does html like this exist somewhere in the wild? \n<table id="foobar"><tr><td>Whoa</td><td>dude</td></tr></table> <p>yeah, I bet somewhere it does</p> (probably on some website that gets 10K hits on a slow day) <table id="foobar"><tr><td>Game</td><td>Over Man</td></tr></table> here's some extra trailing text for you too </td> <td>Key Lime Pie</td> </tr> </table></div>''',
            'out_str': '''<div><div class="mwu-table2divrows"> <div class="mwu-table2divrows-row0"> <div class="mwu-table2divrows-row0-col0 mwu-table2divrows-col0"> Does html like this exist somewhere in the wild? <table id="foobar"><tr><td>Whoa</td><td>dude</td></tr></table> <p>yeah, I bet somewhere it does</p> (probably on some website that gets 10K hits on a slow day) <table id="foobar"><tr><td>Game</td><td>Over Man</td></tr></table> here's some extra trailing text for you too </div> <div class="mwu-table2divrows-row0-col1 mwu-table2divrows-col1">Key Lime Pie</div> </div> </div> </div>''',
        },
        {
            'in_str': '''<table> <tr> <td>Eggs</td> <td>Ham</td> </tr> <tr> <td>Beer</td> <td>Milk</td> </tr> </table> ''',
            'out_str': '''<div class="mwu-table2divrows"> <div class="mwu-table2divrows-row0"> <div class="mwu-table2divrows-row0-col0 mwu-table2divrows-col0">Eggs</div> <div class="mwu-table2divrows-row0-col1 mwu-table2divrows-col1">Ham</div> </div> <div class="mwu-table2divrows-row1"> <div class="mwu-table2divrows-row1-col0 mwu-table2divrows-col0">Beer</div> <div class="mwu-table2divrows-row1-col1 mwu-table2divrows-col1">Milk</div> </div> </div> ''',
        },
    ]

    for case in testdata:
        fragment = html.fragment_fromstring(case['in_str'], create_parent=False)
        table2divrows(fragment)
        self.assertSequenceEqual(
            normxml(case['out_str']),
            normxml(elem2str(fragment)),
        )
def test_table2divgroups(self):
    '''
    Check table2divgroups() with a range of Spec lists.

    Each case supplies source markup, a list of Spec objects (an id name
    followed by four integers) and the markup expected afterwards: one
    "mwu-elem-table2divgroups-group" div per Spec, in the order given,
    inside a "mwu-elem-table2divgroups" wrapper.  A case may set
    'omit_whitespace' (True when absent); the case that sets it to False
    expects an extra whitespace-only div in the group.
    '''
    from mobilize.filters.tables import Spec
    from mobilize.filters import table2divgroups

    ELEMSTR1 = '''<div id="some-container"> <table> <tbody> <tr> <td>CONTACT US</td> <td> </td> <td> </td> <td> </td> <tr> <td>123 Main Str</td> <td> </td> <td>OUR TEAM</td> <td> </td> <tr> <td>Springfield, IL</td> <td> </td> <td>Mike Smith</td> <td><img src="/mike-smith.jpg"/></td> <tr> <td>1-800-BUY-DUFF</td> <td> </td> <td>Jen Jones</td> <td><img src="/jen-jones.jpg"/></td> <tr> <td> </td> <td> </td> <td>Scruffy</td> <td><img src="/scruffy-the-dog.jpg"/></td> <tr> </tbody> </table> </div> '''

    testdata = [
        {
            'elem_str': ELEMSTR1,
            'specmap': [],
            'out_str': ''' <div id="some-container"> <div class="mwu-elem-table2divgroups"> </div> </div> ''',
        },
        {
            'elem_str': ELEMSTR1,
            'specmap': [
                Spec('idname1', 0, 0, 0, 0),
            ],
            'out_str': ''' <div id="some-container"> <div class="mwu-elem-table2divgroups"> <div class="mwu-elem-table2divgroups-group" id="idname1"> <div>CONTACT US</div> </div> </div> </div> ''',
        },
        {
            'elem_str': ELEMSTR1,
            'specmap': [
                Spec('idname1', 0, 0, 3, 0),
            ],
            'out_str': ''' <div id="some-container"> <div class="mwu-elem-table2divgroups"> <div class="mwu-elem-table2divgroups-group" id="idname1"> <div>CONTACT US</div> <div>123 Main Str</div> <div>Springfield, IL</div> <div>1-800-BUY-DUFF</div> </div> </div> </div> ''',
        },
        {
            'elem_str': ELEMSTR1,
            'specmap': [
                Spec('idname1', 0, 0, 0, 0),
                Spec('idname2', 0, 0, 3, 0),
            ],
            'out_str': ''' <div id="some-container"> <div class="mwu-elem-table2divgroups"> <div class="mwu-elem-table2divgroups-group" id="idname1"> <div>CONTACT US</div> </div> <div class="mwu-elem-table2divgroups-group" id="idname2"> <div>CONTACT US</div> <div>123 Main Str</div> <div>Springfield, IL</div> <div>1-800-BUY-DUFF</div> </div> </div> </div> ''',
        },
        {
            'elem_str': ELEMSTR1,
            'specmap': [
                Spec('idname2', 0, 0, 3, 0),
                Spec('idname1', 0, 0, 0, 0),
            ],
            'out_str': ''' <div id="some-container"> <div class="mwu-elem-table2divgroups"> <div \nclass="mwu-elem-table2divgroups-group" id="idname2"> <div>CONTACT US</div> <div>123 Main Str</div> <div>Springfield, IL</div> <div>1-800-BUY-DUFF</div> </div> <div class="mwu-elem-table2divgroups-group" id="idname1"> <div>CONTACT US</div> </div> </div> </div> ''',
        },
        {
            'elem_str': ELEMSTR1,
            'specmap': [
                Spec('idname2', 0, 0, 3, 0),
                Spec('idname1', 0, 0, 0, 0),
            ],
            'out_str': ''' <div id="some-container"> <div class="mwu-elem-table2divgroups"> <div class="mwu-elem-table2divgroups-group" id="idname2"> <div>CONTACT US</div> <div>123 Main Str</div> <div>Springfield, IL</div> <div>1-800-BUY-DUFF</div> </div> <div class="mwu-elem-table2divgroups-group" id="idname1"> <div>CONTACT US</div> </div> </div> </div> ''',
        },
        {
            'elem_str': ELEMSTR1,
            'specmap': [
                Spec('idname1', 0, 0, 4, 0),
            ],
            'out_str': ''' <div id="some-container"> <div class="mwu-elem-table2divgroups"> <div class="mwu-elem-table2divgroups-group" id="idname1"> <div>CONTACT US</div> <div>123 Main Str</div> <div>Springfield, IL</div> <div>1-800-BUY-DUFF</div> </div> </div> </div> ''',
        },
        {
            'elem_str': ELEMSTR1,
            'omit_whitespace': False,
            'specmap': [
                Spec('idname1', 0, 0, 4, 0),
            ],
            'out_str': ''' <div id="some-container"> <div class="mwu-elem-table2divgroups"> <div class="mwu-elem-table2divgroups-group" id="idname1"> <div>CONTACT US</div> <div>123 Main Str</div> <div>Springfield, IL</div> <div>1-800-BUY-DUFF</div> <div> </div> </div> </div> </div> ''',
        },
        {
            'elem_str': ELEMSTR1,
            'specmap': [
                Spec('idname1', 1, 2, 4, 3),
            ],
            'out_str': ''' <div id="some-container"> <div class="mwu-elem-table2divgroups"> <div class="mwu-elem-table2divgroups-group" id="idname1"> <div> <div>OUR TEAM</div> </div> <div> <div>Mike Smith</div> <div><img src="/mike-smith.jpg"></div> </div> <div> <div>Jen Jones</div> <div><img src="/jen-jones.jpg"></div> </div> <div> <div>Scruffy</div> <div><img src="/scruffy-the-dog.jpg"></div> </div> </div> </div> </div> ''',
        },
        {
            'elem_str': '''<div> <table> <tr><td \ncolspan="3">a</td></tr> <tr> <td>b</td> <td>c</td> <td>d</td> </tr> </table> ''',
            'specmap': [
                Spec('idname1', 0, 0, 1, 1),
            ],
            'out_str': ''' <div> <div class="mwu-elem-table2divgroups"> <div class="mwu-elem-table2divgroups-group" id="idname1"> <div><div>a</div></div> <div> <div>b</div> <div>c</div> </div> </div> </div> </div> ''',
        },
    ]

    for case in testdata:
        # Cases that do not mention omit_whitespace run with it enabled.
        omit_whitespace = case.get('omit_whitespace', True)
        root = html.fromstring(case['elem_str'])
        table2divgroups(root, case['specmap'], omit_whitespace=omit_whitespace)
        want = normxml(case['out_str'])
        got = normxml(elem2str(root))
        self.assertSequenceEqual(want, got)
def test_omit(self):
    '''
    Check omit() with CSS selectors, XPath expressions and both together.

    Each case passes 'xpaths' and/or 'csspaths' (None when unused) and
    gives the markup expected once the matching children are gone; a
    selector matching nothing is expected to leave the markup unchanged.
    Finally, omit() is expected to raise AssertionError when called with
    no selectors, or with two empty lists.
    '''
    from lxml import html
    from mobilize.filters import omit

    ELEMSTR1 = '''<div class="foo"> <div id="child1">Child Numero Uno</div> <p id="child2">Child Numero Dos</p> <div id="child3">Child Numero Tres</div> </div> '''

    # Expected strings shared by several cases.
    without_p = '''<div class="foo"> <div id="child1">Child Numero Uno</div> <div id="child3">Child Numero Tres</div> </div>'''
    only_child1 = '''<div class="foo"> <div id="child1">Child Numero Uno</div> </div>'''

    testdata = [
        {
            'elem_str': ELEMSTR1,
            'xpaths': None,
            'csspaths': ['div#child1'],
            'out_str': '''<div class="foo"> <p id="child2">Child Numero Dos</p> <div id="child3">Child Numero Tres</div> </div>''',
        },
        {
            'elem_str': ELEMSTR1,
            'xpaths': None,
            'csspaths': ['div#child2'],
            'out_str': '''<div class="foo"> <div id="child1">Child Numero Uno</div> <p id="child2">Child Numero Dos</p> <div id="child3">Child Numero Tres</div> </div>''',
        },
        {
            'elem_str': ELEMSTR1,
            'xpaths': None,
            'csspaths': ['p#child2'],
            'out_str': without_p,
        },
        {
            'elem_str': ELEMSTR1,
            'xpaths': None,
            'csspaths': ['p'],
            'out_str': without_p,
        },
        {
            'elem_str': ELEMSTR1,
            'xpaths': None,
            'csspaths': ['p#child2', 'div#child3'],
            'out_str': only_child1,
        },
        {
            'elem_str': ELEMSTR1,
            'xpaths': None,
            'csspaths': ['div#child3', 'p#child2'],
            'out_str': only_child1,
        },
        {
            'elem_str': ELEMSTR1,
            'xpaths': ['./p'],
            'csspaths': None,
            'out_str': without_p,
        },
        {
            'elem_str': ELEMSTR1,
            'xpaths': ['.//p'],
            'csspaths': None,
            'out_str': without_p,
        },
        {
            'elem_str': ELEMSTR1,
            'xpaths': ['./div'],
            'csspaths': None,
            'out_str': '''<div class="foo"> <p id="child2">Child Numero Dos</p> </div>''',
        },
        {
            'elem_str': ELEMSTR1,
            'xpaths': ['./p'],
            'csspaths': ['div#child3'],
            'out_str': only_child1,
        },
    ]

    for case in testdata:
        root = html.fromstring(case['elem_str'])
        omit(root, xpaths=case['xpaths'], csspaths=case['csspaths'])
        want = normxml(case['out_str'])
        got = normxml(html.tostring(root))
        self.assertSequenceEqual(want, got)

    # check that an arg is required
    testelem = html.fromstring(ELEMSTR1)
    self.assertRaises(AssertionError, omit, testelem)
    self.assertRaises(AssertionError, omit, testelem, [], [])