Example #1
0
 def testMatchTagWithMoreThenOneAsterisk(self):
     # Patterns holding two asterisks: one inside <b> and one trailing it.
     open_pattern = "<a><b>*</b>*</a>"
     scraper = Scraper(open_pattern)
     template = BeautifulSoup(open_pattern)

     # Both documents are expected to match: the pattern text itself, and
     # one where the trailing asterisk has to cover the remaining <c> tag.
     for html in ("<a><b>*</b>*</a>", "<a><b>*</b><c></c></a>"):
         document = BeautifulSoup(html)
         self.assertTrue(
             scraper.matchTag(template.contents[0], document.contents[0]))

     # Trailing asterisk restricted with "(c)": a <c> tag is expected to
     # match while a <d> tag is expected to be rejected.  The scraper and
     # the template are rebuilt for each case.
     restricted_pattern = "<a><b>*</b>*(c)</a>"
     cases = (
         ("<a><b>*</b><c></c></a>", self.assertTrue),
         ("<a><b>*</b><d></d></a>", self.assertFalse),
     )
     for html, check in cases:
         scraper = Scraper(restricted_pattern)
         template = BeautifulSoup(restricted_pattern)
         document = BeautifulSoup(html)
         check(scraper.matchTag(template.contents[0], document.contents[0]))
Example #2
0
 def testMatchTagWithSubTag(self):
     # Checks documents against a pattern whose <a> wraps one empty <b>.
     source = "<a><b></b></a>"
     scraper = Scraper(source)
     template = BeautifulSoup(source)

     expectations = (
         # identical markup
         ("<a><b></b></a>", self.assertTrue),
         # child tag has a different name
         ("<a><c></c></a>", self.assertFalse),
         # child tag carries a nested tag of its own
         ("<a><b><c></c></b></a>", self.assertFalse),
     )
     for html, check in expectations:
         document = BeautifulSoup(html)
         check(scraper.matchTag(template.contents[0], document.contents[0]))
Example #3
0
 def testMatchTagWithText(self):
     # Checks documents against a pattern whose <a> holds literal text.
     source = "<a>text</a>"
     scraper = Scraper(source)
     template = BeautifulSoup(source)

     expectations = (
         # identical markup
         ("<a>text</a>", self.assertTrue),
         # no text content at all
         ("<a></a>", self.assertFalse),
         # a child tag where the text is expected
         ("<a><b></b></a>", self.assertFalse),
     )
     for html, check in expectations:
         document = BeautifulSoup(html)
         check(scraper.matchTag(template.contents[0], document.contents[0]))
Example #4
0
 def testMatchTagWithoutAttribute(self):
     # Simplest pattern: an empty <a> tag without attributes.
     source = "<a></a>"
     scraper = Scraper(source)
     template = BeautifulSoup(source)

     expectations = (
         # identical markup
         ("<a></a>", self.assertTrue),
         # tag name differs
         ("<b></b>", self.assertFalse),
         # number of children differs
         ("<a><b></b></a>", self.assertFalse),
     )
     for html, check in expectations:
         document = BeautifulSoup(html)
         check(scraper.matchTag(template.contents[0], document.contents[0]))
Example #5
0
 def testMatchTagWithAttribute(self):
     # Pattern is an <a> tag carrying the attribute name='abc'.
     source = "<a name='abc'></a>"
     scraper = Scraper(source)
     template = BeautifulSoup(source)

     expectations = (
         # same attribute, written with double quotes
         ('''<a name="abc"></a>''', self.assertTrue),
         # attribute name differs
         ("<a age='abc'></a>", self.assertFalse),
         # attribute value differs
         ("<a name='abcd'></a>", self.assertFalse),
         # an additional attribute on the document is expected to be accepted
         ("<a name='abc' address='111'></a>", self.assertTrue),
     )
     for html, check in expectations:
         document = BeautifulSoup(html)
         check(scraper.matchTag(template.contents[0], document.contents[0]))
Example #6
0
    def testExtractAsteriskValue(self):
        # Verifies what extractTag returns under the 'content' key for
        # patterns that use the $content marker together with asterisks.
        def prepare(text):
            # Build the scraper and the parsed pattern for one pattern string.
            return Scraper(text), BeautifulSoup(text)

        def first_node(html):
            # Root node of a freshly parsed snippet, used as the expected value.
            return BeautifulSoup(html).contents[0]

        def match(scraper, template, page):
            return scraper.matchTag(template.contents[0], page.contents[0])

        def extract(scraper, template, page):
            return scraper.extractTag(template.contents[0], page.contents[0])

        # --- unrestricted asterisk bound to $content ---
        scraper, template = prepare("<a>*$content</a>")

        # plain text is expected as the first captured item
        page = BeautifulSoup("<a>hello world</a>")
        self.assertTrue(match(scraper, template, page))
        self.assertEqual('hello world',
                         extract(scraper, template, page)['content'][0])

        # --- asterisk restricted to <b> ---
        scraper, template = prepare("<a>*(b)$content</a>")

        # the restriction is expected to apply to tags only, not to text
        page = BeautifulSoup("<a>hello world</a>")
        self.assertTrue(match(scraper, template, page))
        self.assertEqual('hello world',
                         extract(scraper, template, page)['content'][0])

        # a tag outside the restriction is expected to be rejected
        page = BeautifulSoup("<a><c></c></a>")
        self.assertFalse(match(scraper, template, page))

        # a single permitted tag is expected to be captured as a tag
        page = BeautifulSoup("<a><b>hello world</b></a>")
        self.assertTrue(match(scraper, template, page))
        self.assertEqual(first_node('<b>hello world</b>'),
                         extract(scraper, template, page)['content'][0])

        # two permitted tags are expected to be captured in document order
        page = BeautifulSoup("<a><b>hello</b><b>world</b></a>")
        self.assertTrue(match(scraper, template, page))
        captured = extract(scraper, template, page)
        self.assertEqual(first_node('<b>hello</b>'), captured['content'][0])
        self.assertEqual(first_node('<b>world</b>'), captured['content'][1])

        # --- restricted asterisk as a prefix, $content after <c> ---
        scraper, template = prepare("<a>*(b)<c></c>$content</a>")

        page = BeautifulSoup("<a><b></b><b></b><c></c>hello world</a>")
        self.assertTrue(match(scraper, template, page))
        captured = extract(scraper, template, page)
        # here the expected value is compared with 'content' itself
        self.assertEqual('hello world', captured['content'])

        # --- restricted prefix asterisk, second asterisk bound to $content ---
        scraper, template = prepare("<a>*(b)<c></c>*$content</a>")

        page = BeautifulSoup("<a><b></b><c></c><d>hello world</d></a>")
        self.assertTrue(match(scraper, template, page))
        captured = extract(scraper, template, page)
        self.assertEqual(first_node('<d>hello world</d>'),
                         captured['content'][0])

        page = BeautifulSoup("<a><b></b><c></c><d>hello</d><d>world</d></a>")
        self.assertTrue(match(scraper, template, page))
        captured = extract(scraper, template, page)
        self.assertEqual(first_node('<d>hello</d>'), captured['content'][0])
        self.assertEqual(first_node('<d>world</d>'), captured['content'][1])

        # --- unrestricted prefix asterisk, second asterisk bound to $content ---
        scraper, template = prepare("<a>*<c></c>*$content</a>")

        page = BeautifulSoup(
            "<a><b></b>some text<c></c><d>hello world</d></a>")
        self.assertTrue(match(scraper, template, page))
        captured = extract(scraper, template, page)

        self.assertEqual(first_node('<d>hello world</d>'),
                         captured['content'][0])
        
Example #7
0
 def testMatchTagWithAsterisk(self):
     # Exercises matchTag with a single asterisk: unrestricted, restricted to
     # a tag list via "(...)", and used as a prefix in front of a fixed tag.
     # The scraper is rebuilt every time the pattern changes so that it is
     # always constructed from the same pattern as the parsed template.
     pattern = "<a>*</a>"
     _scraper = Scraper(pattern)
     exp = BeautifulSoup(pattern)

     # same tag
     actual = BeautifulSoup("<a>*</a>")
     self.assertTrue(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     # asterisk can match anything
     actual = BeautifulSoup("<a><b></b></a>")
     self.assertTrue(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     # asterisk can match null
     actual = BeautifulSoup("<a></a>")
     self.assertTrue(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     # restricted asterisk, only accept tag b, text or null
     pattern = "<a>*(b)</a>"
     _scraper = Scraper(pattern)
     exp = BeautifulSoup(pattern)

     actual = BeautifulSoup("<a></a>")
     self.assertTrue(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     actual = BeautifulSoup("<a>text</a>")
     self.assertTrue(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     actual = BeautifulSoup("<a><b></b></a>")
     self.assertTrue(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     actual = BeautifulSoup("<a><b></b><b></b></a>")
     self.assertTrue(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     # a tag outside the restriction must make the match fail
     actual = BeautifulSoup("<a><c></c><b></b></a>")
     self.assertFalse(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     # restricted asterisk, only accept tag b, tag c, text or null
     pattern = "<a>*(b,c)</a>"
     _scraper = Scraper(pattern)
     exp = BeautifulSoup(pattern)

     actual = BeautifulSoup("<a><c></c><b></b></a>")
     self.assertTrue(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     actual = BeautifulSoup("<a><c></c><b></b><d></d></a>")
     self.assertFalse(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     # restricted prefix asterisk
     pattern = "<a>*<b></b></a>"
     _scraper = Scraper(pattern)
     exp = BeautifulSoup(pattern)

     actual = BeautifulSoup("<a><c></c><b></b></a>")
     self.assertTrue(_scraper.matchTag(exp.contents[0], actual.contents[0]))

     # no tag should appear after tag b
     actual = BeautifulSoup("<a><c></c><b></b><d></d></a>")
     self.assertFalse(_scraper.matchTag(exp.contents[0], actual.contents[0]))