Example #1
0
 def testExtractTag(self):
     """Verify that extractTag returns the values bound to ``$`` placeholders.

     Each case parses a pattern and a concrete document, hands the first
     node of each to ``extractTag``, and checks the returned mapping.
     """
     name_pattern = "<a name='$name'></a>"
     # NOTE: this scraper, built from the first pattern, is reused for every
     # case below even though the expected tag changes.
     scraper = Scraper(name_pattern)
     expected_tag = BeautifulSoup(name_pattern).contents[0]

     # actual tag carries exactly the attribute named in the pattern
     actual_tag = BeautifulSoup("<a name='abc'></a>").contents[0]
     extracted = scraper.extractTag(expected_tag, actual_tag)
     self.assertEqual('abc', extracted['name'])

     # actual tag carries an additional attribute the pattern does not mention
     actual_tag = BeautifulSoup("<a name='abc' age='27'></a>").contents[0]
     extracted = scraper.extractTag(expected_tag, actual_tag)
     self.assertEqual('abc', extracted['name'])

     # pattern with two placeholders: both values must come back
     two_attr_pattern = "<a name='$name' age='$age'></a>"
     expected_tag = BeautifulSoup(two_attr_pattern).contents[0]
     actual_tag = BeautifulSoup("<a name='abc' age='27'></a>").contents[0]
     extracted = scraper.extractTag(expected_tag, actual_tag)
     self.assertEqual(2, len(extracted))
     self.assertEqual('abc', extracted['name'])
     self.assertEqual('27', extracted['age'])

     # placeholder sits on a nested tag rather than on the outer one
     nested_pattern = "<a><b name='$name'></b></a>"
     expected_tag = BeautifulSoup(nested_pattern).contents[0]
     actual_tag = BeautifulSoup("<a><b name='abc'></b></a>").contents[0]
     extracted = scraper.extractTag(expected_tag, actual_tag)
     self.assertEqual('abc', extracted['name'])
Example #2
0
    def testExtractAsteriskValue(self):
        """Exercise matchTag/extractTag on patterns that use the ``*`` wildcard.

        Every case builds a scraper from a pattern, parses a concrete
        document, and checks whether the document matches and what ends up
        under the ``content`` key of the extraction result.
        """
        def first_node(markup):
            # Parse the markup and hand back its first top-level node.
            return BeautifulSoup(markup).contents[0]

        # ---- "*$content": unrestricted wildcard captured into content ----
        pattern = "<a>*$content</a>"
        scraper = Scraper(pattern)
        expected_tag = first_node(pattern)

        # plain text inside the tag is captured
        actual_tag = first_node("<a>hello world</a>")
        self.assertTrue(scraper.matchTag(expected_tag, actual_tag))
        captured = scraper.extractTag(expected_tag, actual_tag)['content']
        self.assertEqual('hello world', captured[0])

        # ---- "*(b)$content": wildcard restricted to <b> tags ----
        pattern = "<a>*(b)$content</a>"
        scraper = Scraper(pattern)
        expected_tag = first_node(pattern)

        # the restriction applies to tags only; bare text still matches
        actual_tag = first_node("<a>hello world</a>")
        self.assertTrue(scraper.matchTag(expected_tag, actual_tag))
        captured = scraper.extractTag(expected_tag, actual_tag)['content']
        self.assertEqual('hello world', captured[0])

        # a tag other than <b> must be rejected
        actual_tag = first_node("<a><c></c></a>")
        self.assertFalse(scraper.matchTag(expected_tag, actual_tag))

        # a single <b> tag is accepted and returned as a node
        actual_tag = first_node("<a><b>hello world</b></a>")
        self.assertTrue(scraper.matchTag(expected_tag, actual_tag))
        wanted = first_node('<b>hello world</b>')
        captured = scraper.extractTag(expected_tag, actual_tag)['content']
        self.assertEqual(wanted, captured[0])

        # several <b> tags are accepted and returned in document order
        actual_tag = first_node("<a><b>hello</b><b>world</b></a>")
        self.assertTrue(scraper.matchTag(expected_tag, actual_tag))
        result = scraper.extractTag(expected_tag, actual_tag)
        self.assertEqual(first_node('<b>hello</b>'), result['content'][0])
        self.assertEqual(first_node('<b>world</b>'), result['content'][1])

        # ---- "*(b)<c></c>$content": wildcard as a prefix, plain placeholder ----
        pattern = "<a>*(b)<c></c>$content</a>"
        scraper = Scraper(pattern)
        expected_tag = first_node(pattern)

        # content is compared directly here, not through an index
        actual_tag = first_node("<a><b></b><b></b><c></c>hello world</a>")
        self.assertTrue(scraper.matchTag(expected_tag, actual_tag))
        result = scraper.extractTag(expected_tag, actual_tag)
        self.assertEqual('hello world', result['content'])

        # ---- "*(b)<c></c>*$content": prefix wildcard plus captured wildcard ----
        pattern = "<a>*(b)<c></c>*$content</a>"
        scraper = Scraper(pattern)
        expected_tag = first_node(pattern)

        actual_tag = first_node("<a><b></b><c></c><d>hello world</d></a>")
        self.assertTrue(scraper.matchTag(expected_tag, actual_tag))
        result = scraper.extractTag(expected_tag, actual_tag)
        self.assertEqual(first_node('<d>hello world</d>'), result['content'][0])

        actual_tag = first_node("<a><b></b><c></c><d>hello</d><d>world</d></a>")
        self.assertTrue(scraper.matchTag(expected_tag, actual_tag))
        result = scraper.extractTag(expected_tag, actual_tag)
        self.assertEqual(first_node('<d>hello</d>'), result['content'][0])
        self.assertEqual(first_node('<d>world</d>'), result['content'][1])

        # ---- "*<c></c>*$content": unrestricted prefix wildcard ----
        pattern = "<a>*<c></c>*$content</a>"
        scraper = Scraper(pattern)
        expected_tag = first_node(pattern)

        # a tag and text before <c> are both present in the document
        actual_tag = first_node("<a><b></b>some text<c></c><d>hello world</d></a>")
        self.assertTrue(scraper.matchTag(expected_tag, actual_tag))
        result = scraper.extractTag(expected_tag, actual_tag)

        self.assertEqual(first_node('<d>hello world</d>'), result['content'][0])