コード例 #1
0
ファイル: test_components.py プロジェクト: redsymbol/mobilize
    def test_omit_dupe_extractions(self):
        '''
        Verify that selectors resolving to duplicate elements are discarded

        It's entirely possible for a list of N selectors (either xpath
        strings, or css path strings) to produce a list of extracted
        elements that have some duplicates.  A real world example is
        when one client inconsistendly used <div id="content1"> and
        <div class="content1"> in their templates, while intending
        them to mean the same thing.  A component catching both was
        CssPath(['div#content1', 'div.content1']).  This worked fine
        until the client started creating divs like: <div
        class="content1" id="content1">.

        What we normally want to do is to keep only the first matching
        element, and perhaps log a warning.
        '''
        from mobilize.components.extracted import CssPath
        doc_str = '''<html>
  <body>
    <div id="hello" class="blargh hello">
      Hello, earthling! Are you... CRUNCHY
    </div>
  </body>
</html>'''
        css_selectors = [
            'div.hello',
            'div#hello',
            ]
        doc = html.fromstring(doc_str)
        component = CssPath(css_selectors)
        extracted = component.extract(doc)
        self.assertEqual(1, len(extracted))
コード例 #2
0
ファイル: test_components.py プロジェクト: redsymbol/mobilize
    def test_keep_if(self):
        from mobilize.components import CssPath
        from lxml import html
        html = html.fromstring('''<!doctype html>
<html>
<body>
<div id="foo">
  <a href="/beta">Read Beta</a>
  <a href="/alpha">Download Alpha</a>
  <a href="/gamma/">Experience Gamma</a>
</div>
</body>
</html>
''')
        # verify default is to keep everything
        component_all = CssPath('div#foo a')
        extracted_all = component_all.extract(html)
        self.assertEqual(3, len(extracted_all))

        # keep only the "download" link
        def pred(elem):
            return 'download' in elem.text.lower()
        component = CssPath('div#foo a', keep_if=pred)
        extracted = component.extract(html)
        self.assertEqual(1, len(extracted))
        self.assertEqual('/alpha', extracted[0].attrib['href'])
コード例 #3
0
ファイル: test_filters.py プロジェクト: redsymbol/mobilize
 def test_relevant_filter(self):
     '''
     Tests for Filter.relevant

     Three filters each stamp the element with their own attribute.
     TF2.relevant() returns False, so the expected output carries the
     tf1 and tf3 attributes but not tf2.
     '''
     from mobilize.filters import filterapi, Filter
     from mobilize.components import CssPath

     # Plain function filter wrapped by filterapi; expected to be applied.
     @filterapi
     def tf1(elem):
         elem.attrib['tf1'] = '1'

     # Declares itself irrelevant; its attribute must not show up.
     class TF2(Filter):
         def relevant(self, reqinfo):
             return False
         def __call__(self, elem):
             elem.attrib['tf2'] = '2'
     skipped_filter = TF2()

     # Declares itself relevant; its attribute must show up.
     class TF3(Filter):
         def relevant(self, reqinfo):
             return True
         def __call__(self, elem):
             elem.attrib['tf3'] = '3'
     applied_filter = TF3()

     expected_str = '<div class="mwu-elem" id="idname"><div id="foo" tf1="1" tf3="3">Hello.</div></div>'
     doc = html.fromstring('<div id="foo">Hello.</div>')
     component = CssPath('div#foo')
     component.extract(doc)
     processed = component.process(
         'idname',
         extra_filters=[tf1, skipped_filter, applied_filter],
     )
     processed_str = html.tostring(processed).decode('utf-8')
     self.assertSequenceEqual(expected_str, processed_str)
コード例 #4
0
ファイル: test_components.py プロジェクト: redsymbol/mobilize
    def test_select_multiple(self):
        '''
        Test that extracted components can accept multiple selectors
        '''
        from mobilize.components import CssPath, XPath
        selectors = [
            'nav',
            'section',
            ]
        src_html = '''<div>
<nav>
  <a href="/A">A</a>
  <a href="/B">B</a>
</nav>
<table><tr><td>&nbsp;</td><td>I'm using tables for layout!!! DUR</td></tr></table>
<section>
<p>Hello.</p>
</section>
</div>
'''
        expected_html = '''<div class="mwu-elem" id="foo">
<nav>
  <a href="/A">A</a>
  <a href="/B">B</a>
</nav>
<section>
<p>Hello.</p>
</section>
</div>'''
        # test for CssPath
        css_component = CssPath(selectors, idname='foo')
        css_component.extract(html.fromstring(src_html))
        extracted = css_component.process()
        extracted_str = html.tostring(extracted)
        self.assertSequenceEqual(normxml(expected_html), normxml(extracted_str))

        # test for XPath
        x_component = XPath(selectors, idname='foo')
        x_component.extract(html.fromstring(src_html))
        extracted = x_component.process()
        extracted_str = html.tostring(extracted)
        self.assertSequenceEqual(normxml(expected_html), normxml(extracted_str))
コード例 #5
0
ファイル: test_components.py プロジェクト: redsymbol/mobilize
    def test_process_idname(self):
        from mobilize.components import CssPath, XPath
        src_html = '''<div>
<nav>
  <a href="/A">A</a>
  <a href="/B">B</a>
</nav>
'''
        def prep_component(**kw):
            return c
        # check that default_idname is required if self.idname not defined
        c1 = CssPath('nav')
        c1.extract(html.fromstring(src_html))
        with self.assertRaises(AssertionError):
            c1.process()

        # check that idname argument 
        c2 = CssPath('nav', idname='foo')
        c2.extract(html.fromstring(src_html))
        c2.process() # no AssertionError on this line meanst the test passes

        # check that default_idname supresses the error
        c3 = CssPath('nav')
        c3.extract(html.fromstring(src_html))
        c3.process('foo') # no AssertionError on this line meanst the test passes