def test_omit_dupe_extractions(self):
    '''
    Selectors that resolve to the same element yield it only once

    A list of N selectors (xpath strings or css path strings) can
    produce a list of extracted elements containing duplicates.

    Real-world example: a client inconsistently used
    <div id="content1"> and <div class="content1"> in their templates,
    intending both to mean the same thing.  The component catching both
    was CssPath(['div#content1', 'div.content1']).  That worked until
    the client started writing <div class="content1" id="content1">,
    which both selectors match.

    What we normally want is to keep only the first matching element,
    and perhaps log a warning.
    '''
    from mobilize.components.extracted import CssPath
    markup = (
        '<html> <body> <div id="hello" class="blargh hello"> '
        'Hello, earthling! Are you... CRUNCHY </div> </body> </html>'
    )
    parsed = html.fromstring(markup)
    # Both selectors match the one and only div in the document.
    component = CssPath(['div.hello', 'div#hello'])
    extracted = component.extract(parsed)
    self.assertEqual(1, len(extracted))
def test_keep_if(self):
    '''
    The keep_if predicate narrows down which matched elements are kept
    '''
    from mobilize.components import CssPath
    from lxml import html
    page = html.fromstring(
        '<!doctype html> <html> <body> <div id="foo"> '
        '<a href="/beta">Read Beta</a> '
        '<a href="/alpha">Download Alpha</a> '
        '<a href="/gamma/">Experience Gamma</a> '
        '</div> </body> </html> '
    )

    # without keep_if, all three links are expected back
    everything = CssPath('div#foo a').extract(page)
    self.assertEqual(3, len(everything))

    # with keep_if, only the "download" link should remain
    def mentions_download(elem):
        return 'download' in elem.text.lower()

    downloads = CssPath('div#foo a', keep_if=mentions_download).extract(page)
    self.assertEqual(1, len(downloads))
    self.assertEqual('/alpha', downloads[0].attrib['href'])
def test_relevant_filter(self):
    '''
    Tests for Filter.relevant

    Three filters each stamp their own attribute on the element.  The
    expected markup carries the tf1 and tf3 attributes but not tf2,
    whose relevant() returns False.
    '''
    from mobilize.filters import filterapi, Filter
    from mobilize.components import CssPath

    # tf1: plain function filter wrapped by filterapi
    @filterapi
    def tf1(elem):
        elem.attrib['tf1'] = '1'

    # tf2: declares itself not relevant
    class TF2(Filter):
        def relevant(self, reqinfo):
            return False

        def __call__(self, elem):
            elem.attrib['tf2'] = '2'

    # tf3: declares itself relevant
    class TF3(Filter):
        def relevant(self, reqinfo):
            return True

        def __call__(self, elem):
            elem.attrib['tf3'] = '3'

    filters = [tf1, TF2(), TF3()]
    expected = '<div class="mwu-elem" id="idname"><div id="foo" tf1="1" tf3="3">Hello.</div></div>'

    source_elem = html.fromstring('<div id="foo">Hello.</div>')
    component = CssPath('div#foo')
    component.extract(source_elem)
    processed = component.process('idname', extra_filters=filters)
    rendered = html.tostring(processed).decode('utf-8')
    self.assertSequenceEqual(expected, rendered)
def test_select_multiple(self):
    '''
    Extracted components accept a list of selectors, not just one

    The same selector list is fed to CssPath and then to XPath; both
    are expected to render the nav and section elements, and nothing
    else, inside the wrapper div.
    '''
    from mobilize.components import CssPath, XPath
    selectors = ['nav', 'section']
    src_html = (
        '<div> <nav> <a href="/A">A</a> <a href="/B">B</a> </nav> '
        "<table><tr><td> </td><td>I'm using tables for layout!!! DUR</td></tr></table> "
        '<section> <p>Hello.</p> </section> </div> '
    )
    expected_html = (
        '<div class="mwu-elem" id="foo"> '
        '<nav> <a href="/A">A</a> <a href="/B">B</a> </nav> '
        '<section> <p>Hello.</p> </section> </div>'
    )
    # CssPath first, then XPath, each against a freshly parsed document
    for component_class in (CssPath, XPath):
        component = component_class(selectors, idname='foo')
        component.extract(html.fromstring(src_html))
        rendered = html.tostring(component.process())
        self.assertSequenceEqual(normxml(expected_html), normxml(rendered))
def test_process_idname(self):
    '''
    process() needs an idname, from the constructor or from its argument

    Three cases are exercised on a CssPath('nav') component:
      1. no idname anywhere: process() raises AssertionError
      2. idname passed to the constructor: process() succeeds
      3. idname passed to process() itself: process() succeeds
    '''
    from mobilize.components import CssPath
    src_html = '''<div> <nav> <a href="/A">A</a> <a href="/B">B</a> </nav> '''

    # check that a default idname is required if the component has no idname
    c1 = CssPath('nav')
    c1.extract(html.fromstring(src_html))
    with self.assertRaises(AssertionError):
        c1.process()

    # check that the idname constructor argument is sufficient
    c2 = CssPath('nav', idname='foo')
    c2.extract(html.fromstring(src_html))
    c2.process()  # no AssertionError on this line means the case passes

    # check that a default idname passed to process() suppresses the error
    c3 = CssPath('nav')
    c3.extract(html.fromstring(src_html))
    c3.process('foo')  # no AssertionError on this line means the case passes