def test_apply(self):
    """Apply a literal path pattern to example01 and check the first result.

    Builds a ``PathRule`` from a hard-coded pattern, applies it to
    ``self.example01.content`` and compares the first returned element
    against the expected text.
    """
    rule = PathRule([
        '.*', {},
        [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
        [u'span', {u'class': u'small-text'}, 5],
    ])
    result = rule.apply(self.example01.content)
    # assertEqual replaces the failUnless alias (removed in Python 3.12)
    # and shows both values when the comparison fails.
    self.assertEqual(
        result[0],
        u' Volume 70 , Issue 16-18 (October ' '2007)')
def test_apply(self):
    """Check the first value extracted from example01 by a literal pattern.

    The pattern is passed straight to the ``PathRule`` constructor; the
    rule is then applied to ``self.example01.content``.
    """
    pattern = [
        '.*',
        {},
        [u'td', {
            u'colspan': u'2',
            u'class': u'small-text'
        }, 1],
        [u'span', {
            u'class': u'small-text'
        }, 5],
    ]
    rule = PathRule(pattern)
    result = rule.apply(self.example01.content)
    # failUnless(a == b) only reports "False is not true" and the alias is
    # gone in Python 3.12; assertEqual reports the differing values.
    self.assertEqual(
        result[0],
        u' Volume 70 , Issue 16-18 (October ' '2007)')
def test_rule(self):
    """Check the rules produced by ``self.ruler`` for two examples.

    Passes a set containing ``self.example01`` and ``self.example03`` to
    ``self.ruler.rule`` and compares the returned value with a list of
    two expected ``PathRule`` objects.
    """
    rules = self.ruler.rule(set([self.example01, self.example03]))
    expected = [
        PathRule([
            '.*', {},
            [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
            [u'span', {u'class': u'small-text'}, 2],
        ]),
        PathRule([
            '.*', {},
            [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
            [u'div', {u'class': u'small-text'}, 7],
        ]),
    ]
    # assertEqual still compares with ==, but unlike the removed
    # failUnless alias it prints both sides when they differ.
    self.assertEqual(rules, expected)
def setUp(self):
    """Create a fresh default-constructed ``PathRule`` as ``self.rule``."""
    self.rule = PathRule()
class TestPathRule(unittest.TestCase):
    """Tests for ``PathRule.apply`` against stored and inline HTML."""

    def setUp(self):
        """Give every test its own default-constructed ``PathRule``."""
        self.rule = PathRule()

    def test_apply_standard_path(self):
        """A pattern with explicit indexes yields the volume text."""
        html = get_soup('acm01.html')
        path = ['.*', {},
                [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                [u'span', {u'class': u'small-text'}, 5]]
        self.rule.pattern = path
        result = self.rule.apply(html)
        # assertTrue replaces failIf(not ...): same check, without the
        # double negation and without the alias removed in Python 3.12.
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Volume 70'))

    def test_apply_wildcard_path(self):
        """Same pattern, but the last step uses index -1.

        NOTE(review): -1 presumably acts as a wildcard index, going by the
        test name; confirm against ``PathRule``.
        """
        html = get_soup('acm01.html')
        path = ['.*', {},
                [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                [u'span', {u'class': u'small-text'}, -1]]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Volume 70'))

    def test_middle_wildcard_path(self):
        """A pattern with -1 indexes on non-final steps returns something."""
        html = get_soup('acm01.html')
        path = ['.*', {},
                [u'div', {u'class':'authors'}, -1],
                [u'table', {u'cellpadding':u'0', u'cellspacing':u'0'}, 0],
                [u'tbody', {}, 0],
                [u'tr', {}, -1],
                [u'td', {u'class': u'small-text'}, 1],
                [u'small', {}, 0]]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)

    def test_regex_guided_wildcard(self):
        """A non-trivial leading regex combined with a -1 final index."""
        html = get_soup('acm01.html')
        path = ['.*Year of Publication.*', {},
                [u'td', {u'colspan': u'2', u'class': u'small-text'}, 1],
                [u'div', {u'class':u'small-text'}, -1]]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Year of'))

    def test_apply_no_sibling(self):
        """Apply a two-step tuple pattern to a small inline document."""
        html = BeautifulSoup('<html><body><div id="01" class="div01"><span>'
                             'Some text</span><p>Paragraph</p></div>'
                             '</body></html>')
        path = ['.*', {},
                (u'div', {u'class': u'div01'}, 1),
                (u'p', {}, -1)]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        # assertEqual reports both strings on mismatch.
        self.assertEqual(result[0], "Paragraph")

    def test_choose_element(self):
        """A pattern whose second item is a non-empty dict picks one value."""
        html = get_soup('springer01.html')
        path = ['.*', {u"Journal":1},
                ["table",
                 {"cellpadding": "0",
                  "cellspacing": "0",
                  "class": "MPReader_Profiles_SpringerLink_Content_PrimitiveHeadingControl"},
                 0],
                ["tbody", {}, -1],
                ["tr", {"valign": "top"}, 0],
                ["td", {}, 1],
                ["table", {"cellpadding": "0", "cellspacing": "0"}, 1],
                ["tbody", {}, -1],
                ["tr", {}, 1],
                ["td", {"class": "labelValue"}, 1]]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertEqual(result[0], u'Catalysis Letters')
class TestPathRule(unittest.TestCase):
    """Exercise ``PathRule.apply`` with several literal patterns."""

    def setUp(self):
        """Create the ``PathRule`` instance shared by one test."""
        self.rule = PathRule()

    def test_apply_standard_path(self):
        """Explicit indexes on every step; expects the volume text first."""
        html = get_soup('acm01.html')
        path = [
            '.*',
            {},
            [u'td', {
                u'colspan': u'2',
                u'class': u'small-text'
            }, 1],
            [u'span', {
                u'class': u'small-text'
            }, 5],
        ]
        self.rule.pattern = path
        result = self.rule.apply(html)
        # The failIf/failUnless aliases were removed in Python 3.12;
        # assertTrue(result) is the direct form of failIf(not result).
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Volume 70'))

    def test_apply_wildcard_path(self):
        """Final step uses index -1 instead of a fixed position.

        NOTE(review): the test name suggests -1 is a wildcard index;
        verify against ``PathRule``.
        """
        html = get_soup('acm01.html')
        path = [
            '.*',
            {},
            [u'td', {
                u'colspan': u'2',
                u'class': u'small-text'
            }, 1],
            [u'span', {
                u'class': u'small-text'
            }, -1],
        ]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Volume 70'))

    def test_middle_wildcard_path(self):
        """Index -1 on intermediate steps still produces a result."""
        html = get_soup('acm01.html')
        path = [
            '.*',
            {},
            [u'div', {
                u'class': 'authors'
            }, -1],
            [u'table', {
                u'cellpadding': u'0',
                u'cellspacing': u'0'
            }, 0],
            [u'tbody', {}, 0],
            [u'tr', {}, -1],
            [u'td', {
                u'class': u'small-text'
            }, 1],
            [u'small', {}, 0],
        ]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)

    def test_regex_guided_wildcard(self):
        """Leading regex is 'Year of Publication'; last step index is -1."""
        html = get_soup('acm01.html')
        path = [
            '.*Year of Publication.*',
            {},
            [u'td', {
                u'colspan': u'2',
                u'class': u'small-text'
            }, 1],
            [u'div', {
                u'class': u'small-text'
            }, -1],
        ]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertTrue(result[0].startswith(' Year of'))

    def test_apply_no_sibling(self):
        """Two-step tuple pattern applied to a small inline document."""
        html = BeautifulSoup('<html><body><div id="01" class="div01"><span>'
                             'Some text</span><p>Paragraph</p></div>'
                             '</body></html>')
        path = ['.*', {}, (u'div', {u'class': u'div01'}, 1), (u'p', {}, -1)]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        # assertEqual shows the actual value if the text differs.
        self.assertEqual(result[0], "Paragraph")

    def test_choose_element(self):
        """Pattern with a non-empty second item; expects the journal name."""
        html = get_soup('springer01.html')
        path = [
            '.*',
            {
                u"Journal": 1
            },
            [
                "table", {
                    "cellpadding": "0",
                    "cellspacing": "0",
                    "class":
                    "MPReader_Profiles_SpringerLink_Content_PrimitiveHeadingControl"
                }, 0
            ],
            ["tbody", {}, -1],
            ["tr", {
                "valign": "top"
            }, 0],
            ["td", {}, 1],
            ["table", {
                "cellpadding": "0",
                "cellspacing": "0"
            }, 1],
            ["tbody", {}, -1],
            ["tr", {}, 1],
            ["td", {
                "class": "labelValue"
            }, 1],
        ]
        self.rule.pattern = path
        result = self.rule.apply(html)
        self.assertTrue(result)
        self.assertEqual(result[0], u'Catalysis Letters')