Exemple #1
0
	def parse_link(self, base_url, html):
		"""Parse *html*, run the pipe on it, and enqueue the links it contains.

		The parsed document is first passed to ``self.pipe.process`` together
		with this object and *base_url*.  Every element selected by
		``self.rule`` that carries an ``href`` attribute is then resolved
		against ``base_url.url``; URLs accepted by ``urlrule.match`` are added
		to ``self.queue`` as ``Link`` objects one level deeper than *base_url*.
		"""
		document = BeautifulSoup(html)
		self.pipe.process(self, base_url, document)
		# Links discovered on this page sit one level below the page itself.
		child_depth = base_url.depth + 1
		for anchor in document.findAll(self.rule, href=True):
			absolute = urlrule.get_abs_url(base_url.url, anchor["href"])
			if not urlrule.match(absolute):
				continue
			self.queue.add_link(Link(absolute, child_depth))
Exemple #2
0
	def test_match_can_block_uncomplete_url(self):
		"""Check that ``urlrule.match`` rejects each incomplete URL.

		Every candidate is asserted on its own.  Passing two match results to
		a single ``assertFalse`` call would treat the second one as the
		failure message, leaving that URL unchecked.
		"""
		incomplete_urls = (
			"www.google.com",
			"/testhttp://.html",
			"test.html",
		)
		for candidate in incomplete_urls:
			self.assertFalse(
				urlrule.match(candidate),
				"incomplete url was not blocked: %r" % (candidate,),
			)
Exemple #3
0
	def test_match_can_block_image_file(self):
		"""Check that ``urlrule.match`` rejects a URL pointing at a .jpg file."""
		self.assertFalse(urlrule.match("http://www.google.com.tw/image.jpg"))