コード例 #1
0
ファイル: main.py プロジェクト: spsu/forager
def main(url):
	"""Spider outward from `url` until the request queue is empty.

	A Document is created for `url` and pushed onto the module-level
	queue RQ. Each dequeued document is skipped if its url is already a
	key in the module-level DB; otherwise it is recorded in DB and
	downloaded. Links are only followed for documents whose url is on
	'spsu.edu' and which are not missing: every extracted url is wired to
	the current document through `linksIn` / `linksOut`, and urls not yet
	in DB are queued as new Documents.

	save_database() is called after every SAVE_EVERY documents whose
	links were extracted.

	On KeyboardInterrupt the process exits via sys.exit(). Any other
	exception is reported on stdout with its traceback and the function
	then returns normally.
	"""
	global DB
	global RQ

	# Imported before the try block on purpose: a function-level import
	# makes `sys` local to the whole function, so importing it only in the
	# generic handler left it unbound when the KeyboardInterrupt handler
	# called sys.exit().
	import sys
	import traceback

	doc = Document(url)
	RQ.push(doc)

	try:
		count = 0
		while not RQ.empty():
			doc = RQ.pop()
			url = doc.url

			print("Url '%s' dequeued." % doc.url)

			# Don't fetch again if in database.
			if doc.url in DB:
				continue

			DB[url] = doc

			print("Downloading...")
			doc.download()

			# If we just downloaded an external domain, we
			# don't continue to spider it.
			if not url.isOnDomain('spsu.edu'):
				continue

			if doc.isMissing():
				continue

			urls = doc.getUrls()
			print("%d urls parsed from page" % len(urls))

			for u in urls:
				known = u in DB
				d = DB[u] if known else Document(u)
				# Record the back-link before queueing, as the original did.
				d.linksIn.append(doc)
				if not known:
					RQ.push(d, 1) # TODO: priority heuristic
				doc.linksOut.append(d)

			# The counter is never reset, so the modulus fires on every
			# SAVE_EVERY-th spidered document (resetting it to 1 made all
			# saves after the first come one document early).
			count += 1
			if count % SAVE_EVERY == 0:
				save_database()

	except KeyboardInterrupt:
		print("Keybord Interrupt, spider terminating.")
		# NOTE(review): the original called save_queue() only *after*
		# sys.exit(), so it never ran, and flagged it "XXX This should be
		# fixed". It is left disabled here; re-enable once save_queue()
		# is confirmed to work.
		sys.exit()

	except Exception as e:
		# Top-level boundary: report the failure instead of crashing.
		print('\n---------------------')
		print("Exception occurred in mainloop")
		print('Exception: %s' % e)
		print('- - - - - - - - - - -')
		traceback.print_tb(sys.exc_info()[2])
		print("\n")