예제 #1
0
class TestPathRuler(TestRuler):
    """Tests for ``PathRuler`` and for applying a ``PathRule``.

    The fixtures referenced here (``self.element01``, ``self.soup01``,
    ``self.example01``, ...) are not created in this class; presumably they
    are provided by ``TestRuler.setUp`` -- confirm against the base class.
    """

    def setUp(self):
        """Run the base class setup, then create the ruler under test."""
        super(TestPathRuler, self).setUp()
        self.ruler = PathRuler()

    def test_get_element_attrs(self):
        """``element01`` yields exactly one attribute, its ``class``."""
        attrs = self.ruler._get_element_attrs(self.element01)
        self.assertEqual(len(attrs), 1)
        self.assertEqual(attrs['class'], 'mediumb-text')

    def test_is_unique(self):
        """``element01``'s description is unique in ``soup01``; 02's is not."""
        description01 = self.ruler._get_element_description(self.element01)
        self.assertTrue(self.ruler._is_unique(self.soup01, description01))

        description02 = self.ruler._get_element_description(self.element02)
        self.assertFalse(self.ruler._is_unique(self.soup01, description02))

    def test_get_sibling_number(self):
        """``element03`` is reported with sibling number 2."""
        number = self.ruler._get_sibling_number(self.element03)
        self.assertEqual(number, 2)

    def test_get_element_path(self):
        """The path computed for ``element02`` in ``soup01`` has 3 entries."""
        path = self.ruler._get_element_path(self.soup01, self.element02)
        self.assertEqual(len(path), 3)

    def test_rule_element(self):
        """Ruling the second small-text span gives the expected pattern."""
        elements = self.example01.content.findAll('span',
                                                  {u'class': u'small-text'})
        rule = self.ruler._rule_element(self.example01, elements[1])
        expected = [{u'table of contents': 1},
                    [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1]]
        self.assertEqual(rule.pattern, expected)

    def test_should_merge(self):
        """Check ``_should_merge`` on three hand-built rule pairs.

        The results are compared against the literal booleans (rather than
        with ``assertTrue``/``assertFalse``) so that a ``None`` return still
        fails, exactly as it did with the original ``== False`` checks.
        """
        # Same length, but the attribute dicts of both steps differ.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {}, 1]])
        rule02 = Rule([[u'a', {}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        # Different number of steps (three versus two).
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {}, 1],
                       [u'c', {}, 2]])
        rule02 = Rule([[u'a', {}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        # Same tags and attributes; only the trailing numbers differ.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {u'b02': 'x'}, 1]])
        rule02 = Rule([[u'a', {u'a01': u'x'}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, True)

    def test_rule_example(self):
        """``example01`` produces two rules with the expected patterns."""
        rules = self.ruler._rule_example(self.example01)
        pattern01 = [{},
                     [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                     [u'span', {u'class': u'small-text'}, 2]]
        pattern02 = [{},
                     [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                     [u'div', {u'class': u'small-text'}, 7]]
        self.assertEqual(len(rules), 2)
        self.assertEqual(rules[0].pattern, pattern01)
        self.assertEqual(rules[1].pattern, pattern02)

    def test_merge_patterns(self):
        """Check ``_merge_patterns`` against one general pattern."""
        general = [{},
                   [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                   [u'span', {u'class': u'small-text'}, 5]]

        # Merge patterns with different length paths
        pattern = [{}, [u'span', {u'class': u'big-text'}, 5]]
        self.assertRaises(ValueError, self.ruler._merge_patterns, general,
                          pattern)

        # Merge patterns with different attributes list
        pattern = [{}, [u'td', {u'class': u'small-text'}, 1],
                   [u'span', {u'class': u'small-text'}, 5]]
        result = self.ruler._merge_patterns(general, pattern)
        expected = [{}, [u'td', {u'class': u'small-text'}, 1],
                    [u'span', {u'class': u'small-text'}, 5]]
        self.assertEqual(result, expected, "Different attributes")

        # Merge patterns with different attribute values
        pattern = [{}, [u'td', {u'class': u'small-text'}, 1],
                   [u'span', {u'class': u'big-text'}, 5]]
        result = self.ruler._merge_patterns(general, pattern)
        expected = [{}, ['td', {u'class': u'small-text'}, 1],
                    [u'span', {}, 5]]
        self.assertEqual(result, expected, "Different attribute values")

    def test_rule(self):
        """``rule`` over two examples returns the two expected PathRules."""
        rules = self.ruler.rule(set([self.example01, self.example03]))
        expected = [PathRule(['.*', {}, [u'td', {u'colspan': u'2',
                                       u'class': u'small-text'}, 1],
                              [u'span', {u'class': u'small-text'}, 2]]),
                    PathRule(['.*', {}, [u'td', {u'colspan': u'2',
                                       u'class': u'small-text'}, 1],
                              [u'div', {u'class': u'small-text'}, 7]])]
        self.assertEqual(rules, expected)

    def test_get_content_elements(self):
        """Content elements are found for ``example01`` and ``example05``."""
        elements = self.ruler._get_content_elements(self.example01.value,
                                                    self.example01.content)
        expected = [u' Volume 70 , Issue 16-18 (October 2007)',
                    u' Year of Publication: 2007 ']
        self.assertEqual(len(elements), 2)
        self.assertEqual(elements, expected)

        elements = self.ruler._get_content_elements(self.example05.value,
                                                    self.example05.content)
        expected = [u'149-154']
        self.assertEqual(len(elements), 1)
        self.assertEqual(elements, expected)

    def test_get_invalid_content_element(self):
        """A value searched for in empty content gives a falsy result."""
        example = Example(value='random text', content=BeautifulSoup(''))
        elements = self.ruler._get_content_elements(example.value,
                                                    example.content)
        self.assertFalse(elements)

    def test_apply(self):
        """Applying a hand-built PathRule extracts the volume/issue text."""
        rule = PathRule(['.*', {}, [u'td', {u'colspan': u'2',
                                  u'class': u'small-text'}, 1],
                         [u'span', {u'class': u'small-text'}, 5]
                        ])
        result = rule.apply(self.example01.content)
        self.assertEqual(result[0], u' Volume 70 , Issue 16-18 (October '
                         '2007)')
예제 #2
0
 def setUp(self):
     """Run the parent class's setUp, then attach a new PathRuler."""
     # Base setup runs first; the ruler is created only after it returns.
     parent = super(TestPathRuler, self)
     parent.setUp()
     self.ruler = PathRuler()
예제 #3
0
class TestPathRuler(TestRuler):
    """Tests for ``PathRuler`` and for applying a ``PathRule``.

    The fixtures referenced here (``self.element01``, ``self.soup01``,
    ``self.example01``, ...) are not created in this class; presumably they
    are provided by ``TestRuler.setUp`` -- confirm against the base class.
    """

    def setUp(self):
        """Run the base class setup, then create the ruler under test."""
        super(TestPathRuler, self).setUp()
        self.ruler = PathRuler()

    def test_get_element_attrs(self):
        """``element01`` yields exactly one attribute, its ``class``."""
        attrs = self.ruler._get_element_attrs(self.element01)
        self.assertEqual(len(attrs), 1)
        self.assertEqual(attrs['class'], 'mediumb-text')

    def test_is_unique(self):
        """``element01``'s description is unique in ``soup01``; 02's is not."""
        description01 = self.ruler._get_element_description(self.element01)
        self.assertTrue(self.ruler._is_unique(self.soup01, description01))

        description02 = self.ruler._get_element_description(self.element02)
        self.assertFalse(self.ruler._is_unique(self.soup01, description02))

    def test_get_sibling_number(self):
        """``element03`` is reported with sibling number 2."""
        number = self.ruler._get_sibling_number(self.element03)
        self.assertEqual(number, 2)

    def test_get_element_path(self):
        """The path computed for ``element02`` in ``soup01`` has 3 entries."""
        path = self.ruler._get_element_path(self.soup01, self.element02)
        self.assertEqual(len(path), 3)

    def test_rule_element(self):
        """Ruling the second small-text span gives the expected pattern."""
        elements = self.example01.content.findAll('span',
                                                  {u'class': u'small-text'})
        rule = self.ruler._rule_element(self.example01, elements[1])
        expected = [
            {u'table of contents': 1},
            [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
        ]
        self.assertEqual(rule.pattern, expected)

    def test_should_merge(self):
        """Check ``_should_merge`` on three hand-built rule pairs.

        The results are compared against the literal booleans (rather than
        with ``assertTrue``/``assertFalse``) so that a ``None`` return still
        fails, exactly as it did with the original ``== False`` checks.
        """
        # Same length, but the attribute dicts of both steps differ.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {}, 1]])
        rule02 = Rule([[u'a', {}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        # Different number of steps (three versus two).
        rule01 = Rule([
            [u'a', {u'a01': u'x'}, 0],
            [u'b', {}, 1],
            [u'c', {}, 2],
        ])
        rule02 = Rule([[u'a', {}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        # Same tags and attributes; only the trailing numbers differ.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {u'b02': 'x'}, 1]])
        rule02 = Rule([[u'a', {u'a01': u'x'}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, True)

    def test_rule_example(self):
        """``example01`` produces two rules with the expected patterns."""
        rules = self.ruler._rule_example(self.example01)
        pattern01 = [
            {},
            [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
            [u'span', {u'class': u'small-text'}, 2],
        ]
        pattern02 = [
            {},
            [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
            [u'div', {u'class': u'small-text'}, 7],
        ]
        self.assertEqual(len(rules), 2)
        self.assertEqual(rules[0].pattern, pattern01)
        self.assertEqual(rules[1].pattern, pattern02)

    def test_merge_patterns(self):
        """Check ``_merge_patterns`` against one general pattern."""
        general = [
            {},
            [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
            [u'span', {u'class': u'small-text'}, 5],
        ]

        # Merge patterns with different length paths
        pattern = [{}, [u'span', {u'class': u'big-text'}, 5]]
        self.assertRaises(ValueError, self.ruler._merge_patterns, general,
                          pattern)

        # Merge patterns with different attributes list
        pattern = [
            {},
            [u'td', {u'class': u'small-text'}, 1],
            [u'span', {u'class': u'small-text'}, 5],
        ]
        result = self.ruler._merge_patterns(general, pattern)
        expected = [
            {},
            [u'td', {u'class': u'small-text'}, 1],
            [u'span', {u'class': u'small-text'}, 5],
        ]
        self.assertEqual(result, expected, "Different attributes")

        # Merge patterns with different attribute values
        pattern = [
            {},
            [u'td', {u'class': u'small-text'}, 1],
            [u'span', {u'class': u'big-text'}, 5],
        ]
        result = self.ruler._merge_patterns(general, pattern)
        expected = [{}, ['td', {u'class': u'small-text'}, 1], [u'span', {}, 5]]
        self.assertEqual(result, expected, "Different attribute values")

    def test_rule(self):
        """``rule`` over two examples returns the two expected PathRules."""
        rules = self.ruler.rule(set([self.example01, self.example03]))
        expected = [
            PathRule([
                '.*', {},
                [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                [u'span', {u'class': u'small-text'}, 2],
            ]),
            PathRule([
                '.*', {},
                [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                [u'div', {u'class': u'small-text'}, 7],
            ]),
        ]
        self.assertEqual(rules, expected)

    def test_get_content_elements(self):
        """Content elements are found for ``example01`` and ``example05``."""
        elements = self.ruler._get_content_elements(self.example01.value,
                                                    self.example01.content)
        expected = [
            u' Volume 70 , Issue 16-18 (October 2007)',
            u' Year of Publication: 2007 ',
        ]
        self.assertEqual(len(elements), 2)
        self.assertEqual(elements, expected)

        elements = self.ruler._get_content_elements(self.example05.value,
                                                    self.example05.content)
        expected = [u'149-154']
        self.assertEqual(len(elements), 1)
        self.assertEqual(elements, expected)

    def test_get_invalid_content_element(self):
        """A value searched for in empty content gives a falsy result."""
        example = Example(value='random text', content=BeautifulSoup(''))
        elements = self.ruler._get_content_elements(example.value,
                                                    example.content)
        self.assertFalse(elements)

    def test_apply(self):
        """Applying a hand-built PathRule extracts the volume/issue text."""
        rule = PathRule([
            '.*', {},
            [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
            [u'span', {u'class': u'small-text'}, 5],
        ])
        result = rule.apply(self.example01.content)
        self.assertEqual(result[0], u' Volume 70 , Issue 16-18 (October '
                         '2007)')