def test_no_attr(self):
    """Requesting an attribute the element lacks should yield ``None``."""
    markup = "<a href=\"http://www.example.com\">Test</a>"
    rule_spec = {"name": "url", "attr": "src", "type": "string"}
    anchor = html.fragment_fromstring(markup)
    # The anchor only carries ``href``; the rule asks for ``src``.
    extracted = Rule.from_json(rule_spec).data(anchor)
    self.assertIsNone(extracted)
def test_bad_float(self):
    """A float rule applied to text without a number should give ``None``."""
    float_rule = Rule.from_json({"name": "url", "attr": "text", "type": "float"})
    element = html.fragment_fromstring("<div>Nothing to see here</div>")
    self.assertIsNone(float_rule.data(element))
def test_type_string(self):
    """A string rule on ``href`` returns the attribute value as ``str``."""
    link = html.fragment_fromstring("<a href=\"http://www.example.com\">Test</a>")
    href_rule = Rule.from_json({"name": "url", "attr": "href", "type": "string"})
    result = href_rule.data(link)
    self.assertIsInstance(result, str)
    self.assertEqual(result, "http://www.example.com")
def test_from_json(self):
    """``Rule.from_json`` builds a ``Rule`` carrying the given name and attr.

    Each rule dict runs in its own ``subTest`` so that a failure names the
    offending input and the remaining cases are still exercised.
    """
    rules = [
        {"name": "title", "attr": "text", "type": "string"},
        {"name": "url", "attr": "href", "type": "string"},
        {"name": "img", "attr": "src", "type": "string"},
        {"name": "description", "attr": "text", "type": "string"},
    ]
    for rule_json in rules:
        with self.subTest(rule=rule_json):
            a = Rule.from_json(rule_json)
            self.assertIsInstance(a, Rule)
            self.assertEqual(a.name, rule_json["name"])
            self.assertEqual(a.attr, rule_json["attr"])
def test_bad_from_json(self):
    """``Rule.from_json`` must raise ``ValueError`` for incomplete rule dicts.

    Every dict below supplies at most one of the ``name``, ``attr`` and
    ``type`` keys.  Cases run under ``subTest`` so a missing exception is
    reported together with the dict that failed to trigger it.
    """
    bad_rules = [
        {"name": "foo"},
        {"attr": "bar"},
        {"type": "float"},
        {},
    ]
    for rule_json in bad_rules:
        with self.subTest(rule=rule_json):
            with self.assertRaises(ValueError):
                Rule.from_json(rule_json)
def test_type_float(self):
    """A float rule extracts a ``float`` from an attribute or from text.

    Each (markup, attribute, expected value) triple runs in its own
    ``subTest`` so one failing example does not mask the others.
    """
    examples = [
        ("<p data-num=\"3.14159\">Test</p>", "data-num", 3.14159),
        ("<p>26.2 miles</p>", "text", 26.2),
        ("<p>In the 98.325th percentile</p>", "text", 98.325),
    ]
    for html_string, attr, expected in examples:
        with self.subTest(markup=html_string, attr=attr):
            r = Rule.from_json({"name": "url", "attr": attr, "type": "float"})
            ele = html.fragment_fromstring(html_string)
            val = r.data(ele)
            self.assertIsInstance(val, float)
            self.assertEqual(val, expected)
def test_type_string(self):
    """Check that a ``string`` rule hands back the ``href`` value as ``str``."""
    spec = {"name": "url", "attr": "href", "type": "string"}
    source = "<a href=\"http://www.example.com\">Test</a>"
    node = html.fragment_fromstring(source)
    value = Rule.from_json(spec).data(node)
    self.assertIsInstance(value, str)
    self.assertEqual(value, "http://www.example.com")
def test_type_int(self):
    """An int rule extracts an ``int`` from an attribute or from text.

    Each (markup, attribute, expected value) triple runs in its own
    ``subTest`` so one failing example does not mask the others.
    """
    examples = [
        ("<p data-index=\"3\">Test</p>", "data-index", 3),
        ("<p>15 miles</p>", "text", 15),
        ("<p>The 18th of July</p>", "text", 18),
    ]
    for html_string, attr, expected in examples:
        with self.subTest(markup=html_string, attr=attr):
            r = Rule.from_json({"name": "url", "attr": attr, "type": "int"})
            ele = html.fragment_fromstring(html_string)
            val = r.data(ele)
            self.assertIsInstance(val, int)
            self.assertEqual(val, expected)
def test_no_attr(self):
    """A rule targeting an absent attribute is expected to return ``None``."""
    node = html.fragment_fromstring("<a href=\"http://www.example.com\">Test</a>")
    # ``src`` is not present on the anchor built above.
    src_rule = Rule.from_json({"name": "url", "attr": "src", "type": "string"})
    self.assertIsNone(src_rule.data(node))
def test_bad_int(self):
    """An int rule applied to text without a number should give ``None``."""
    int_rule = Rule.from_json({"name": "url", "attr": "text", "type": "int"})
    element = html.fragment_fromstring("<div>Nothing to see here</div>")
    # The assertion pins ``None`` as the "no integer found" result.
    self.assertIsNone(int_rule.data(element))
def test_bad_spec_name(self):
    """``AllElement`` must raise ``ValueError`` for a missing or invalid name.

    The cases cover an absent ``name``, a non-string ``name`` and an empty
    ``name``.  Each runs in its own ``subTest`` so a missing exception is
    reported together with the spec that failed to trigger it.
    """
    bad_specs = [
        {"type": "all"},
        {"type": "all", "name": 0},
        {"type": "all", "name": ""},
    ]
    for bs in bad_specs:
        # Build the rule outside the assertRaises block so that only the
        # AllElement constructor can satisfy the expected ValueError.
        rules = [Rule("url", "href", "string")]
        with self.subTest(spec=bs):
            with self.assertRaises(ValueError):
                AllElement("a", bs, [], rules)
def test_bad_spec_values(self):
    """``RangeElement`` must raise ``ValueError`` for malformed range specs.

    Each spec runs in its own ``subTest`` so a missing exception is reported
    together with the spec that failed to trigger it, and the remaining
    specs are still checked.
    """
    bad_specs = [
        {"type": "range", "low": 0, "high": 3},  # no name
        {"type": "range", "name": 0, "low": 0, "high": 3},  # bad name type
        {"type": "range", "name": "", "low": 0, "high": 3},  # bad name value
        {"type": "range", "name": "foo", "high": 3},  # no low
        {"type": "range", "name": "foo", "low": "0", "high": 3},  # bad low value
        {"type": "range", "name": "foo", "low": 0},  # no high
        {"type": "range", "name": "foo", "low": 0, "high": "3"},  # bad high value
        {"type": "range", "name": "foo", "low": 3, "high": 0},  # low > high
    ]
    for bs in bad_specs:
        # Build the rule outside the assertRaises block so that only the
        # RangeElement constructor can satisfy the expected ValueError.
        rules = [Rule("url", "href", "string")]
        with self.subTest(spec=bs):
            with self.assertRaises(ValueError):
                RangeElement("a", bs, [], rules)
def test_good_spec_values(self):
    """``RangeElement`` accepts well-formed range specs.

    Covers a spec with integer ``low``/``high`` bounds and one whose
    ``high`` is ``None``.  Each spec runs in its own ``subTest`` so a
    failure identifies the spec that was rejected.
    """
    good_specs = [
        {"type": "range", "name": "Foo", "low": 0, "high": 3},
        {"type": "range", "name": "Foo", "low": 0, "high": None},
    ]
    for spec in good_specs:
        with self.subTest(spec=spec):
            ele = RangeElement("a", spec, [], [Rule("url", "href", "string")])
            self.assertIsInstance(ele, RangeElement)
def test_type_float(self):
    """Verify ``float`` rules parse numbers out of attributes and text.

    The examples are run as individual ``subTest`` cases so that every
    example is evaluated and a failure reports which markup caused it.
    """
    examples = [
        ("<p data-num=\"3.14159\">Test</p>", "data-num", 3.14159),
        ("<p>26.2 miles</p>", "text", 26.2),
        ("<p>In the 98.325th percentile</p>", "text", 98.325),
    ]
    for html_string, attr, expected in examples:
        with self.subTest(markup=html_string, attr=attr):
            r = Rule.from_json({
                "name": "url",
                "attr": attr,
                "type": "float",
            })
            ele = html.fragment_fromstring(html_string)
            val = r.data(ele)
            self.assertIsInstance(val, float)
            self.assertEqual(val, expected)
def test_type_int(self):
    """Verify ``int`` rules parse integers out of attributes and text.

    The examples are run as individual ``subTest`` cases so that every
    example is evaluated and a failure reports which markup caused it.
    """
    examples = [
        ("<p data-index=\"3\">Test</p>", "data-index", 3),
        ("<p>15 miles</p>", "text", 15),
        ("<p>The 18th of July</p>", "text", 18),
    ]
    for html_string, attr, expected in examples:
        with self.subTest(markup=html_string, attr=attr):
            r = Rule.from_json({
                "name": "url",
                "attr": attr,
                "type": "int",
            })
            ele = html.fragment_fromstring(html_string)
            val = r.data(ele)
            self.assertIsInstance(val, int)
            self.assertEqual(val, expected)
def test_from_json(self):
    """Check that ``Rule.from_json`` preserves ``name`` and ``attr``.

    Every input dict is checked inside its own ``subTest`` so a failure
    points at the specific rule and later rules are still verified.
    """
    rules = [
        {"name": "title", "attr": "text", "type": "string"},
        {"name": "url", "attr": "href", "type": "string"},
        {"name": "img", "attr": "src", "type": "string"},
        {"name": "description", "attr": "text", "type": "string"},
    ]
    for rule_json in rules:
        with self.subTest(rule=rule_json):
            a = Rule.from_json(rule_json)
            self.assertIsInstance(a, Rule)
            self.assertEqual(a.name, rule_json["name"])
            self.assertEqual(a.attr, rule_json["attr"])
def test_good_spec_index(self):
    """A ``single`` spec with an integer index constructs a ``SingleElement``."""
    spec = {"type": "single", "index": 0}
    url_rule = Rule("url", "href", "string")
    element = SingleElement("a", spec, [], [url_rule])
    self.assertIsInstance(element, SingleElement)
def test_bad_from_json(self):
    """Expect ``ValueError`` from ``Rule.from_json`` when keys are missing.

    Each dict provides at most one of ``name``, ``attr`` and ``type``.  The
    cases are wrapped in ``subTest`` so an input that fails to raise is
    named in the report and the other inputs are still tried.
    """
    bad_rules = [{"name": "foo"}, {"attr": "bar"}, {"type": "float"}, {}]
    for rule_json in bad_rules:
        with self.subTest(rule=rule_json):
            with self.assertRaises(ValueError):
                Rule.from_json(rule_json)
def test_good_spec_name(self):
    """An ``all`` spec with a non-empty string name builds an ``AllElement``."""
    spec = {"type": "all", "name": "links"}
    url_rule = Rule("url", "href", "string")
    element = AllElement("a", spec, [], [url_rule])
    self.assertIsInstance(element, AllElement)
def test_bad_spec_index(self):
    """``SingleElement`` must raise ``ValueError`` for a bad ``index``.

    Covers a spec with no ``index`` and one whose ``index`` is a string.
    Each runs in its own ``subTest`` so a missing exception is reported
    together with the spec that failed to trigger it.
    """
    bad_specs = [
        {"type": "single"},
        {"type": "single", "index": "0"},
    ]
    for bs in bad_specs:
        # Build the rule outside the assertRaises block so that only the
        # SingleElement constructor can satisfy the expected ValueError.
        rules = [Rule("url", "href", "string")]
        with self.subTest(spec=bs):
            with self.assertRaises(ValueError):
                SingleElement("a", bs, [], rules)