Example #1
0
from WebCrawler import WebCrawler

startLink = raw_input("Please enter a starting web address: ")
keyword = raw_input("Please enter a keyword to search for: ")
crawler = WebCrawler(keyword, startLink)

while True:
	print "Getting a web page, please wait: ", crawler.currentWebAddress
	crawler.getCurrentPage()
	if crawler.findKeyword():
		break
	crawler.nextPage()

print "Keyword found on the following page:", crawler.currentWebAddress
class TestWebCrawler(unittest.TestCase):
	"""Tests for WebCrawler, driven against Wikipedia URLs.

	NOTE(review): several tests call getCurrentPage() and check for an HTTP
	200 status, so they appear to need live network access. The expected
	link counts (98 / 97) are hard-coded and will break if the page changes.
	"""

	# Keyword and start address shared by every test's default crawler.
	KEYWORD = "robot"
	START_URL = "http://en.wikipedia.org/wiki/Robots_exclusion_standard"

	def setUp(self):
		"""Create a fresh crawler for each test."""
		self.spider = WebCrawler(self.KEYWORD, self.START_URL)

	def tearDown(self):
		"""Drop the crawler created in setUp."""
		del self.spider

	def testInstantiateKeyWord(self):
		"""The constructor stores the keyword on `keyWord`."""
		self.assertEqual(self.spider.keyWord, self.KEYWORD)

	def testInstantiateWebAddress(self):
		"""The constructor stores the start address on `currentWebAddress`."""
		self.assertEqual(self.spider.currentWebAddress, self.START_URL)

	def testGetCurrentPage(self):
		"""Fetching the start page yields a response with status code 200."""
		self.spider.getCurrentPage()
		self.assertEqual(self.spider.currentPage.getcode(), 200)

	def testFindKeyWord(self):
		"""The keyword is found on the start page."""
		self.spider.getCurrentPage()
		self.assertEqual(self.spider.findKeyword(), True)

	def testParseLinks(self):
		"""After fetching and searching the start page, 98 links are queued."""
		self.spider.getCurrentPage()
		self.spider.findKeyword()
		self.assertEqual(len(self.spider.links), 98)

	def testGetNextWebpage(self):
		"""nextPage() moves to the next address and that page can be fetched."""
		self.spider.getCurrentPage()
		self.spider.findKeyword()
		self.spider.nextPage()
		self.assertEqual(self.spider.currentWebAddress, "http://en.wikipedia.org/wiki/MediaWiki:Robots.txt")
		# One link fewer than in testParseLinks once the crawler has advanced.
		self.assertEqual(len(self.spider.links), 97)
		self.spider.getCurrentPage()
		self.assertEqual(self.spider.currentPage.getcode(), 200)

	def testParstInternetArchive(self):
		"""Smoke test: fetch and search the internet_archive page.

		There is no assertion; the test passes as long as nothing raises.
		"""
		self.spider = WebCrawler(self.KEYWORD, "http://en.wikipedia.org/wiki/internet_archive")
		self.spider.getCurrentPage()
		self.spider.findKeyword()

	def testFollowRobotDotTxt(self):
		"""nextPage() passes over /wiki/Special:Search and lands on the next link.

		NOTE(review): going by the test name, the first link is presumably
		skipped because robots.txt disallows it -- confirm in WebCrawler.
		"""
		self.spider.links.extend(["/wiki/Special:Search", "/wiki/computers"])
		self.spider.nextPage()
		self.assertEqual(self.spider.currentWebAddress, "http://en.wikipedia.org/wiki/computers")

	def testDontParseDuplicatPage(self):
		"""A link queued twice is only visited once."""
		self.spider.links.extend([
			"/wiki/computers",
			"/wiki/computers",
			"/wiki/computers_hard_drives",
		])
		self.spider.nextPage()
		# The second call is expected to skip the duplicate entry.
		self.spider.nextPage()
		self.assertEqual(self.spider.currentWebAddress, "http://en.wikipedia.org/wiki/computers_hard_drives")

	def testParseTheUnready(self):
		"""A percent-encoded link is kept as-is in the resulting address."""
		self.spider.links.extend(["/wiki/%C3%86thelred_the_Unready"])
		self.spider.nextPage()
		self.assertEqual(self.spider.currentWebAddress, "http://en.wikipedia.org/wiki/%C3%86thelred_the_Unready")