Example #1
0
# Seed the crawl frontier from the command line (expects exactly three
# start URLs in sys.argv[1..3]).
tocrawl = []
tocrawl.append(sys.argv[1])
tocrawl.append(sys.argv[2])
tocrawl.append(sys.argv[3])

# URLs already fetched, to avoid re-visiting the same page.
crawled = set()

# Raw strings so the \s sequences are unambiguous regex tokens.
# Captures the content attribute of a <meta name="keywords" .../> tag.
keywordregex = re.compile(r'<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
# Captures the href target of an anchor tag.
linkregex = re.compile(r'<a\s*href=[\'|"](.*?)[\'"].*?>')
# FIXME(review): re.compile is missing its pattern argument here, so
# crawlregex is bound to the re.compile function itself, not a compiled
# pattern. Supply the intended pattern or delete this line.
crawlregex = re.compile

#call webVisit class -- used to keep track of visited websites
visit = webVisit()
#DB manager
mdb = DB_comms()
# robots.txt parser, reused for every site we crawl.
rp = rbp.RobotFileParser()

while 1:
	#rp = rbp.RobotFileParser()
	try:
		print 'doing something'
		crawling = tocrawl.pop(random.randrange(len(tocrawl)))
		print 'something finished'
		print crawling
	except KeyError:
		raise StopIteration
	url = urlparse.urlparse(crawling)
Example #2
0
	def __init__(self, host="192.168.0.6", user="******", passwd="password", db="crawlbot"):
		"""Open a MySQL connection to the crawler database and a cursor on it.

		Connection settings default to the previously hard-coded values so
		existing no-argument callers behave identically; they can now be
		overridden per instance.

		SECURITY(review): credentials live in the source as defaults --
		move them to configuration or environment variables before
		deployment.
		"""
		self.connection = mdb.connect(host=host, user=user, passwd=passwd, db=db)
		self.cursor = self.connection.cursor()
		# Tracker for visited websites (project-local helper class).
		self.visit = webVisit()