class TestPathRuler(TestRuler):
    """Unit tests for PathRuler and for the PathRule objects it works with.

    The fixtures used below (``soup01``, ``element01``..``element03``,
    ``example01``, ``example03``, ``example05``) are not created in this
    class; presumably TestRuler.setUp provides them -- confirm against the
    base class.

    The assertions use assertEqual/assertTrue/assertFalse/assertRaises
    rather than the fail* aliases, which were removed from unittest in
    Python 3.12, and so that failures report both compared values.
    """

    def setUp(self):
        """Run the base-class setUp, then create the PathRuler under test."""
        super(TestPathRuler, self).setUp()
        self.ruler = PathRuler()

    def test_get_element_attrs(self):
        """element01 exposes exactly one attribute: class='mediumb-text'."""
        attrs = self.ruler._get_element_attrs(self.element01)
        self.assertEqual(len(attrs), 1)
        self.assertEqual(attrs['class'], 'mediumb-text')

    def test_is_unique(self):
        """element01's description is unique in soup01; element02's is not."""
        description01 = self.ruler._get_element_description(self.element01)
        self.assertTrue(self.ruler._is_unique(self.soup01, description01))

        description02 = self.ruler._get_element_description(self.element02)
        self.assertFalse(self.ruler._is_unique(self.soup01, description02))

    def test_get_sibling_number(self):
        """element03 is reported with sibling number 2."""
        number = self.ruler._get_sibling_number(self.element03)
        self.assertEqual(number, 2)

    def test_get_element_path(self):
        """The path computed for element02 within soup01 has three entries."""
        path = self.ruler._get_element_path(self.soup01, self.element02)
        self.assertEqual(len(path), 3)

    def test_rule_element(self):
        """Ruling the second small-text span of example01 gives the pattern."""
        elements = self.example01.content.findAll('span',
                                                  {u'class': u'small-text'})
        rule = self.ruler._rule_element(self.example01, elements[1])
        expected = [{u'table of contents': 1},
                    [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1]]
        self.assertEqual(rule.pattern, expected)

    def test_should_merge(self):
        """_should_merge is True only for the third pair of rules below."""
        # assertEqual against the literal (not assertFalse/assertTrue) so
        # that a None or other non-boolean return is still a failure.

        # Same length, different attributes.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {}, 1]])
        rule02 = Rule([[u'a', {}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        # Different lengths.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {}, 1],
                       [u'c', {}, 2]])
        rule02 = Rule([[u'a', {}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        # Same tags and attributes; only the trailing numbers differ.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {u'b02': 'x'}, 1]])
        rule02 = Rule([[u'a', {u'a01': u'x'}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, True)

    def test_rule_example(self):
        """example01 yields two rules, in order: the span, then the div."""
        rules = self.ruler._rule_example(self.example01)
        pattern01 = [{},
                     [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                     [u'span', {u'class': u'small-text'}, 2]]
        pattern02 = [{},
                     [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                     [u'div', {u'class': u'small-text'}, 7]]
        self.assertEqual(len(rules), 2)
        self.assertEqual(rules[0].pattern, pattern01)
        self.assertEqual(rules[1].pattern, pattern02)

    def test_merge_patterns(self):
        """_merge_patterns rejects unequal lengths and drops differing attrs."""
        general = [{},
                   [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                   [u'span', {u'class': u'small-text'}, 5]]

        # Merge patterns with different length paths
        pattern = [{}, [u'span', {u'class': u'big-text'}, 5]]
        self.assertRaises(ValueError, self.ruler._merge_patterns,
                          general, pattern)

        # Merge patterns with different attributes list
        pattern = [{},
                   [u'td', {u'class': u'small-text'}, 1],
                   [u'span', {u'class': u'small-text'}, 5]]
        result = self.ruler._merge_patterns(general, pattern)
        expected = [{},
                    [u'td', {u'class': u'small-text'}, 1],
                    [u'span', {u'class': u'small-text'}, 5]]
        self.assertEqual(result, expected, "Different attributes")

        # Merge patterns with different attribute values
        pattern = [{},
                   [u'td', {u'class': u'small-text'}, 1],
                   [u'span', {u'class': u'big-text'}, 5]]
        result = self.ruler._merge_patterns(general, pattern)
        expected = [{},
                    ['td', {u'class': u'small-text'}, 1],
                    [u'span', {}, 5]]
        self.assertEqual(result, expected, "Different attribute values")

    def test_rule(self):
        """rule() over example01 and example03 gives the two PathRules."""
        rules = self.ruler.rule(set([self.example01, self.example03]))
        expected = [
            PathRule(['.*', {},
                      [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                      [u'span', {u'class': u'small-text'}, 2]]),
            PathRule(['.*', {},
                      [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                      [u'div', {u'class': u'small-text'}, 7]]),
        ]
        self.assertEqual(rules, expected)

    def test_get_content_elements(self):
        """_get_content_elements returns the expected strings per example."""
        elements = self.ruler._get_content_elements(self.example01.value,
                                                    self.example01.content)
        expected = [u' Volume 70 , Issue 16-18 (October 2007)',
                    u' Year of Publication: 2007 ']
        self.assertEqual(len(elements), 2)
        self.assertEqual(elements, expected)

        elements = self.ruler._get_content_elements(self.example05.value,
                                                    self.example05.content)
        expected = [u'149-154']
        self.assertEqual(len(elements), 1)
        self.assertEqual(elements, expected)

    def test_get_invalid_content_element(self):
        """Searching an empty document for a value gives a falsy result."""
        example = Example(value='random text', content=BeautifulSoup(''))
        elements = self.ruler._get_content_elements(example.value,
                                                    example.content)
        self.assertFalse(elements)

    def test_apply(self):
        """Applying a PathRule to example01's content gives the volume text."""
        rule = PathRule(['.*', {},
                         [u'td', {u'colspan': u'2', u'class': u'small-text'},
                          1],
                         [u'span', {u'class': u'small-text'}, 5]])
        result = rule.apply(self.example01.content)
        self.assertEqual(result[0],
                         u' Volume 70 , Issue 16-18 (October 2007)')
def setUp(self):
    """Run the parent setUp, then store a new PathRuler on ``self.ruler``."""
    parent = super(TestPathRuler, self)
    parent.setUp()
    # Created after the parent call, so this assignment is the one that
    # remains on the instance.
    self.ruler = PathRuler()
class TestPathRuler(TestRuler):
    """Unit tests for PathRuler and for the PathRule objects it works with.

    The fixtures used below (``soup01``, ``element01``..``element03``,
    ``example01``, ``example03``, ``example05``) are not created in this
    class; presumably TestRuler.setUp provides them -- confirm against the
    base class.

    The assertions use assertEqual/assertTrue/assertFalse/assertRaises
    rather than the fail* aliases, which were removed from unittest in
    Python 3.12, and so that failures report both compared values.
    """

    def setUp(self):
        """Run the base-class setUp, then create the PathRuler under test."""
        super(TestPathRuler, self).setUp()
        self.ruler = PathRuler()

    def test_get_element_attrs(self):
        """element01 exposes exactly one attribute: class='mediumb-text'."""
        attrs = self.ruler._get_element_attrs(self.element01)
        self.assertEqual(len(attrs), 1)
        self.assertEqual(attrs['class'], 'mediumb-text')

    def test_is_unique(self):
        """element01's description is unique in soup01; element02's is not."""
        description01 = self.ruler._get_element_description(self.element01)
        self.assertTrue(self.ruler._is_unique(self.soup01, description01))

        description02 = self.ruler._get_element_description(self.element02)
        self.assertFalse(self.ruler._is_unique(self.soup01, description02))

    def test_get_sibling_number(self):
        """element03 is reported with sibling number 2."""
        number = self.ruler._get_sibling_number(self.element03)
        self.assertEqual(number, 2)

    def test_get_element_path(self):
        """The path computed for element02 within soup01 has three entries."""
        path = self.ruler._get_element_path(self.soup01, self.element02)
        self.assertEqual(len(path), 3)

    def test_rule_element(self):
        """Ruling the second small-text span of example01 gives the pattern."""
        elements = self.example01.content.findAll('span',
                                                  {u'class': u'small-text'})
        rule = self.ruler._rule_element(self.example01, elements[1])
        expected = [{u'table of contents': 1},
                    [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1]]
        self.assertEqual(rule.pattern, expected)

    def test_should_merge(self):
        """_should_merge is True only for the third pair of rules below."""
        # assertEqual against the literal (not assertFalse/assertTrue) so
        # that a None or other non-boolean return is still a failure.

        # Same length, different attributes.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {}, 1]])
        rule02 = Rule([[u'a', {}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        # Different lengths.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {}, 1],
                       [u'c', {}, 2]])
        rule02 = Rule([[u'a', {}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        # Same tags and attributes; only the trailing numbers differ.
        rule01 = Rule([[u'a', {u'a01': u'x'}, 0], [u'b', {u'b02': 'x'}, 1]])
        rule02 = Rule([[u'a', {u'a01': u'x'}, 1], [u'b', {u'b02': 'x'}, 2]])
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, True)

    def test_rule_example(self):
        """example01 yields two rules, in order: the span, then the div."""
        rules = self.ruler._rule_example(self.example01)
        pattern01 = [{},
                     [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                     [u'span', {u'class': u'small-text'}, 2]]
        pattern02 = [{},
                     [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                     [u'div', {u'class': u'small-text'}, 7]]
        self.assertEqual(len(rules), 2)
        self.assertEqual(rules[0].pattern, pattern01)
        self.assertEqual(rules[1].pattern, pattern02)

    def test_merge_patterns(self):
        """_merge_patterns rejects unequal lengths and drops differing attrs."""
        general = [{},
                   [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                   [u'span', {u'class': u'small-text'}, 5]]

        # Merge patterns with different length paths
        pattern = [{}, [u'span', {u'class': u'big-text'}, 5]]
        self.assertRaises(ValueError, self.ruler._merge_patterns,
                          general, pattern)

        # Merge patterns with different attributes list
        pattern = [{},
                   [u'td', {u'class': u'small-text'}, 1],
                   [u'span', {u'class': u'small-text'}, 5]]
        result = self.ruler._merge_patterns(general, pattern)
        expected = [{},
                    [u'td', {u'class': u'small-text'}, 1],
                    [u'span', {u'class': u'small-text'}, 5]]
        self.assertEqual(result, expected, "Different attributes")

        # Merge patterns with different attribute values
        pattern = [{},
                   [u'td', {u'class': u'small-text'}, 1],
                   [u'span', {u'class': u'big-text'}, 5]]
        result = self.ruler._merge_patterns(general, pattern)
        expected = [{},
                    ['td', {u'class': u'small-text'}, 1],
                    [u'span', {}, 5]]
        self.assertEqual(result, expected, "Different attribute values")

    def test_rule(self):
        """rule() over example01 and example03 gives the two PathRules."""
        rules = self.ruler.rule(set([self.example01, self.example03]))
        expected = [
            PathRule(['.*', {},
                      [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                      [u'span', {u'class': u'small-text'}, 2]]),
            PathRule(['.*', {},
                      [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                      [u'div', {u'class': u'small-text'}, 7]]),
        ]
        self.assertEqual(rules, expected)

    def test_get_content_elements(self):
        """_get_content_elements returns the expected strings per example."""
        elements = self.ruler._get_content_elements(self.example01.value,
                                                    self.example01.content)
        expected = [u' Volume 70 , Issue 16-18 (October 2007)',
                    u' Year of Publication: 2007 ']
        self.assertEqual(len(elements), 2)
        self.assertEqual(elements, expected)

        elements = self.ruler._get_content_elements(self.example05.value,
                                                    self.example05.content)
        expected = [u'149-154']
        self.assertEqual(len(elements), 1)
        self.assertEqual(elements, expected)

    def test_get_invalid_content_element(self):
        """Searching an empty document for a value gives a falsy result."""
        example = Example(value='random text', content=BeautifulSoup(''))
        elements = self.ruler._get_content_elements(example.value,
                                                    example.content)
        self.assertFalse(elements)

    def test_apply(self):
        """Applying a PathRule to example01's content gives the volume text."""
        rule = PathRule(['.*', {},
                         [u'td', {u'colspan': u'2', u'class': u'small-text'},
                          1],
                         [u'span', {u'class': u'small-text'}, 5]])
        result = rule.apply(self.example01.content)
        self.assertEqual(result[0],
                         u' Volume 70 , Issue 16-18 (October 2007)')