def parse_link(self, base_url, html):
    """Extract hyperlinks from *html* and enqueue those that pass the URL rule.

    The parsed soup is first handed to the processing pipeline, then every
    anchor matching ``self.rule`` that carries an ``href`` is resolved to an
    absolute URL; accepted URLs are queued one level deeper than *base_url*.
    """
    soup = BeautifulSoup(html)
    self.pipe.process(self, base_url, soup)
    child_depth = base_url.depth + 1
    for anchor in soup.findAll(self.rule, href=True):
        absolute = urlrule.get_abs_url(base_url.url, anchor["href"])
        if urlrule.match(absolute):
            self.queue.add_link(Link(absolute, child_depth))
def test_match_can_block_uncomplete_url(self):
    """urlrule.match must reject URLs without a complete scheme/host."""
    url = "www.google.com"
    url2 = "/testhttp://.html"
    url3 = "test.html"
    # BUG FIX: the original wrote
    #   self.assertFalse(urlrule.match(url), urlrule.match(url2))
    # which passes match(url2) as assertFalse's *msg* argument, so url2
    # was never actually asserted. Assert each URL independently.
    self.assertFalse(urlrule.match(url))
    self.assertFalse(urlrule.match(url2))
    self.assertFalse(urlrule.match(url3))
def test_match_can_block_image_file(self):
    """urlrule.match must reject links that point at image files."""
    image_url = "http://www.google.com.tw/image.jpg"
    self.assertFalse(urlrule.match(image_url))