예제 #1
0
 def test_apply(self):
     """Apply a td/span path rule to example01 and check the first result.

     The first extracted item must equal the volume/issue string exactly,
     including its leading space.
     """
     rule = PathRule(['.*', {}, [u'td', {u'colspan': u'2',
                               u'class': u'small-text'}, 1],
                      [u'span', {u'class': u'small-text'}, 5]
                     ])
     result = rule.apply(self.example01.content)
     # assertEqual replaces the deprecated failUnless alias (removed in
     # Python 3.12) and shows both values when the comparison fails.
     self.assertEqual(result[0], u' Volume 70 , Issue 16-18 (October '
                      '2007)')
예제 #2
0
 def test_apply(self):
     """Check that the td/span rule extracts the volume/issue text.

     The rule is applied to ``self.example01.content`` and only the first
     element of the result is inspected.
     """
     rule = PathRule([
         '.*', {}, [u'td', {
             u'colspan': u'2',
             u'class': u'small-text'
         }, 1], [u'span', {
             u'class': u'small-text'
         }, 5]
     ])
     expected = u' Volume 70 , Issue 16-18 (October ' '2007)'
     result = rule.apply(self.example01.content)
     # failUnless is a deprecated alias that no longer exists in
     # Python 3.12; assertEqual also prints the mismatching values.
     self.assertEqual(result[0], expected)
예제 #3
0
 def test_rule(self):
     """Derive rules from example01 and example03 and compare them.

     The two expected rules share the same leading td step and differ only
     in their final step (a span step ending in 2 versus a div step ending
     in 7).
     """
     rules = self.ruler.rule(set([self.example01, self.example03]))
     expected = [
         PathRule([
             '.*', {},
             [u'td', {
                 u'colspan': u'2',
                 u'class': u'small-text'
             }, 1], [u'span', {
                 u'class': u'small-text'
             }, 2]
         ]),
         PathRule([
             '.*', {},
             [u'td', {
                 u'colspan': u'2',
                 u'class': u'small-text'
             }, 1], [u'div', {
                 u'class': u'small-text'
             }, 7]
         ])
     ]
     # assertEqual supersedes the deprecated failUnless alias (removed in
     # Python 3.12) and reports the differing values on failure.
     self.assertEqual(rules, expected)
예제 #4
0
 def setUp(self):
     """Store a default-constructed PathRule on ``self.rule``."""
     self.rule = PathRule()
예제 #5
0
class TestPathRule(unittest.TestCase):
    """Exercise ``PathRule.apply`` against parsed HTML documents.

    Every test assigns a pattern list to ``self.rule.pattern`` and checks
    what ``apply`` returns for the document. Assertions use ``assertTrue`` /
    ``assertEqual`` rather than the deprecated ``failUnless`` / ``failIf``
    aliases, which were removed from unittest in Python 3.12.
    """

    def setUp(self):
        """Give each test its own default-constructed PathRule."""
        self.rule = PathRule()

    def test_apply_standard_path(self):
        """A td/span pattern ending in 5 yields the volume text first."""
        html = get_soup('acm01.html')

        path = ['.*', {},
                [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                [u'span', {u'class': u'small-text'}, 5]]
        self.rule.pattern = path

        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Volume 70'))

    def test_apply_wildcard_path(self):
        """Same pattern with -1 in the last step still finds the volume text.

        NOTE(review): -1 presumably acts as a wildcard position; confirm
        against PathRule.
        """
        html = get_soup('acm01.html')

        path = ['.*', {},
                [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                [u'span', {u'class': u'small-text'}, -1]]
        self.rule.pattern = path

        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Volume 70'))

    def test_middle_wildcard_path(self):
        """A longer pattern with -1 in intermediate steps returns a result."""
        html = get_soup('acm01.html')
        path = ['.*', {},
                [u'div', {u'class':'authors'}, -1],
                [u'table', {u'cellpadding':u'0', u'cellspacing':u'0'}, 0],
                [u'tbody', {}, 0],
                [u'tr', {}, -1],
                [u'td', {u'class': u'small-text'}, 1],
                [u'small', {}, 0]]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)

    def test_regex_guided_wildcard(self):
        """A specific leading regex selects the 'Year of' text first."""
        html = get_soup('acm01.html')
        path = ['.*Year of Publication.*', {},
                [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                [u'div', {u'class':u'small-text'}, -1]]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Year of'))

    def test_apply_no_sibling(self):
        """Steps given as tuples extract the paragraph from inline HTML."""
        html = BeautifulSoup('<html><body><div id="01" class="div01"><span>'
                             'Some text</span><p>Paragraph</p></div>'
                             '</body></html>')

        path = ['.*', {}, (u'div', {u'class': u'div01'}, 1),
                (u'p', {}, -1)]

        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertEqual(result[0], "Paragraph")

    def test_choose_element(self):
        """A pattern with a non-empty second element picks the journal name."""
        html = get_soup('springer01.html')

        path = ['.*', {u"Journal":1},
                ["table", {"cellpadding": "0", "cellspacing": "0", "class": "MPReader_Profiles_SpringerLink_Content_PrimitiveHeadingControl"}, 0],
                ["tbody", {}, -1],
                ["tr", {"valign": "top"}, 0],
                ["td", {}, 1],
                ["table", {"cellpadding": "0", "cellspacing": "0"}, 1],
                ["tbody", {}, -1],
                ["tr", {}, 1],
                ["td", {"class": "labelValue"}, 1]]
        self.rule.pattern = path

        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertEqual(result[0], u'Catalysis Letters')
예제 #6
0
 def setUp(self):
     """Assign a PathRule built with no arguments to ``self.rule``."""
     self.rule = PathRule()
예제 #7
0
class TestPathRule(unittest.TestCase):
    """Tests for applying ``PathRule`` patterns to HTML documents.

    The deprecated ``failUnless`` / ``failIf`` aliases (removed from unittest
    in Python 3.12) are replaced by ``assertTrue`` / ``assertEqual``, which
    also give clearer failure messages.
    """

    def setUp(self):
        """Create the default-constructed PathRule used by each test."""
        self.rule = PathRule()

    def test_apply_standard_path(self):
        """The td/span pattern ending in 5 returns the volume text first."""
        html = get_soup('acm01.html')

        path = [
            '.*', {}, [u'td', {
                u'colspan': u'2',
                u'class': u'small-text'
            }, 1], [u'span', {
                u'class': u'small-text'
            }, 5]
        ]
        self.rule.pattern = path

        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Volume 70'))

    def test_apply_wildcard_path(self):
        """With -1 in the final step the volume text is still found first.

        NOTE(review): -1 looks like a wildcard position; verify against
        PathRule.
        """
        html = get_soup('acm01.html')

        path = [
            '.*', {}, [u'td', {
                u'colspan': u'2',
                u'class': u'small-text'
            }, 1], [u'span', {
                u'class': u'small-text'
            }, -1]
        ]
        self.rule.pattern = path

        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Volume 70'))

    def test_middle_wildcard_path(self):
        """A six-step pattern with -1 in middle steps yields a result."""
        html = get_soup('acm01.html')
        path = [
            '.*', {}, [u'div', {
                u'class': 'authors'
            }, -1],
            [u'table', {
                u'cellpadding': u'0',
                u'cellspacing': u'0'
            }, 0], [u'tbody', {}, 0], [u'tr', {}, -1],
            [u'td', {
                u'class': u'small-text'
            }, 1], [u'small', {}, 0]
        ]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)

    def test_regex_guided_wildcard(self):
        """A specific leading regex makes the 'Year of' text come first."""
        html = get_soup('acm01.html')
        path = [
            '.*Year of Publication.*', {},
            [u'td', {
                u'colspan': u'2',
                u'class': u'small-text'
            }, 1], [u'div', {
                u'class': u'small-text'
            }, -1]
        ]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Year of'))

    def test_apply_no_sibling(self):
        """Tuple-form steps extract the paragraph text from inline HTML."""
        html = BeautifulSoup('<html><body><div id="01" class="div01"><span>'
                             'Some text</span><p>Paragraph</p></div>'
                             '</body></html>')

        path = ['.*', {}, (u'div', {u'class': u'div01'}, 1), (u'p', {}, -1)]

        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertEqual(result[0], "Paragraph")

    def test_choose_element(self):
        """A pattern whose second element is non-empty picks the journal."""
        html = get_soup('springer01.html')

        path = [
            '.*', {
                u"Journal": 1
            },
            [
                "table", {
                    "cellpadding":
                    "0",
                    "cellspacing":
                    "0",
                    "class":
                    "MPReader_Profiles_SpringerLink_Content_PrimitiveHeadingControl"
                }, 0
            ], ["tbody", {}, -1], ["tr", {
                "valign": "top"
            }, 0], ["td", {}, 1],
            ["table", {
                "cellpadding": "0",
                "cellspacing": "0"
            }, 1], ["tbody", {}, -1], ["tr", {}, 1],
            ["td", {
                "class": "labelValue"
            }, 1]
        ]
        self.rule.pattern = path

        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertEqual(result[0], u'Catalysis Letters')