def testExtractTag(self):
    """Check that extractTag returns the attribute values bound to `$` placeholders."""

    def first_node(markup):
        # Parse the markup and hand back its first top-level node.
        return BeautifulSoup(markup).contents[0]

    template = "<a name='$name'></a>"
    # One scraper instance serves every case below, even after the template
    # changes; extractTag always receives the template tag explicitly.
    scraper = Scraper(template)
    template_tag = first_node(template)

    # Document has exactly the attribute the template names.
    document_tag = first_node("<a name='abc'></a>")
    extracted = scraper.extractTag(template_tag, document_tag)
    self.assertEqual('abc', extracted['name'])

    # Document has an additional attribute the template does not mention.
    document_tag = first_node("<a name='abc' age='27'></a>")
    extracted = scraper.extractTag(template_tag, document_tag)
    self.assertEqual('abc', extracted['name'])

    # Template names two attributes; both values are returned.
    template = "<a name='$name' age='$age'></a>"
    template_tag = first_node(template)
    document_tag = first_node("<a name='abc' age='27'></a>")
    extracted = scraper.extractTag(template_tag, document_tag)
    self.assertEqual(2, len(extracted))
    self.assertEqual('abc', extracted['name'])
    self.assertEqual('27', extracted['age'])

    # Placeholder sits on a nested tag rather than on the outer one.
    template = "<a><b name='$name'></b></a>"
    template_tag = first_node(template)
    document_tag = first_node("<a><b name='abc'></b></a>")
    extracted = scraper.extractTag(template_tag, document_tag)
    self.assertEqual('abc', extracted['name'])
def testExtractAsteriskValue(self):
    """Check matchTag/extractTag for templates that use the `*` wildcard with `$content`."""

    def first_node(markup):
        # Parse the markup and hand back its first top-level node.
        return BeautifulSoup(markup).contents[0]

    def extract_after_match(scraper, template_tag, markup):
        # Assert that the document matches the template, then return what
        # extractTag yields for the same pair of tags.
        document_tag = first_node(markup)
        self.assertTrue(scraper.matchTag(template_tag, document_tag))
        return scraper.extractTag(template_tag, document_tag)

    # Bare asterisk: plain text is captured as the first 'content' item.
    template = "<a>*$content</a>"
    scraper = Scraper(template)
    template_tag = first_node(template)
    extracted = extract_after_match(scraper, template_tag, "<a>hello world</a>")
    self.assertEqual('hello world', extracted['content'][0])

    # Asterisk qualified with (b).
    template = "<a>*(b)$content</a>"
    scraper = Scraper(template)
    template_tag = first_node(template)

    # Plain text still matches and is captured despite the (b) qualifier.
    extracted = extract_after_match(scraper, template_tag, "<a>hello world</a>")
    self.assertEqual('hello world', extracted['content'][0])

    # A child tag other than <b> is rejected.
    self.assertFalse(
        scraper.matchTag(template_tag, first_node("<a><c></c></a>")))

    # A single <b> child matches and is captured as a tag.
    extracted = extract_after_match(
        scraper, template_tag, "<a><b>hello world</b></a>")
    self.assertEqual(first_node('<b>hello world</b>'), extracted['content'][0])

    # Two <b> children match and are captured in document order.
    extracted = extract_after_match(
        scraper, template_tag, "<a><b>hello</b><b>world</b></a>")
    self.assertEqual(first_node('<b>hello</b>'), extracted['content'][0])
    self.assertEqual(first_node('<b>world</b>'), extracted['content'][1])

    # Asterisk prefix before a fixed <c>; $content has no asterisk of its own,
    # and the expected value is the bare string rather than a sequence item.
    template = "<a>*(b)<c></c>$content</a>"
    scraper = Scraper(template)
    template_tag = first_node(template)
    extracted = extract_after_match(
        scraper, template_tag, "<a><b></b><b></b><c></c>hello world</a>")
    self.assertEqual('hello world', extracted['content'])

    # Asterisk prefix before <c>, followed by an asterisk capture.
    template = "<a>*(b)<c></c>*$content</a>"
    scraper = Scraper(template)
    template_tag = first_node(template)

    extracted = extract_after_match(
        scraper, template_tag, "<a><b></b><c></c><d>hello world</d></a>")
    self.assertEqual(first_node('<d>hello world</d>'), extracted['content'][0])

    extracted = extract_after_match(
        scraper, template_tag,
        "<a><b></b><c></c><d>hello</d><d>world</d></a>")
    self.assertEqual(first_node('<d>hello</d>'), extracted['content'][0])
    self.assertEqual(first_node('<d>world</d>'), extracted['content'][1])

    # Unqualified asterisk prefix: a tag and text both precede <c>.
    template = "<a>*<c></c>*$content</a>"
    scraper = Scraper(template)
    template_tag = first_node(template)
    extracted = extract_after_match(
        scraper, template_tag,
        "<a><b></b>some text<c></c><d>hello world</d></a>")
    self.assertEqual(first_node('<d>hello world</d>'), extracted['content'][0])