def main(url):
    """Spider main loop: crawl breadth-first starting from `url`.

    Pops Documents off the global request queue RQ, downloads each one,
    records it in the global database DB, and enqueues newly discovered
    URLs.  Pages outside spsu.edu are fetched but not spidered further.
    The database is checkpointed every SAVE_EVERY documents.

    A KeyboardInterrupt saves the queue and exits the process; any other
    exception is logged with a traceback and the function returns.
    """
    global DB
    global RQ
    RQ.push(Document(url))
    try:
        count = 0
        while not RQ.empty():
            doc = RQ.pop()
            url = doc.url
            print("Url '%s' dequeued." % doc.url)

            # Don't fetch again if already in the database.
            if doc.url in DB:
                continue
            DB[url] = doc

            print("Downloading...")
            doc.download()

            # If we just downloaded an external domain, we don't
            # continue to spider it.
            if not url.isOnDomain('spsu.edu'):
                continue
            if doc.isMissing():
                continue

            urls = doc.getUrls()
            print("%d urls parsed from page" % len(urls))
            for u in urls:
                if u not in DB:
                    d = Document(u)
                    d.linksIn.append(doc)
                    RQ.push(d, 1)  # TODO: priority heuristic
                else:
                    d = DB[u]
                    d.linksIn.append(doc)
                doc.linksOut.append(d)

            count += 1
            # Periodic checkpoint.  The counter is NOT reset: the modulo
            # test already fires every SAVE_EVERY documents (the old
            # `count = 1` reset skewed the interval by one).
            if count % SAVE_EVERY == 0:
                save_database()
    except KeyboardInterrupt:
        # Announce and persist the queue BEFORE exiting — the original
        # called sys.exit() first, which made both lines unreachable and
        # silently dropped the queue on Ctrl-C.
        print("Keyboard Interrupt, spider terminating.")
        save_queue()  # XXX This should be fixed.
        sys.exit()
    except Exception as e:
        # Top-level boundary: log the failure and fall through (return).
        import traceback
        print('\n---------------------')
        print("Exception occurred in mainloop")
        print('Exception: %s' % e)
        print('- - - - - - - - - - -')
        traceback.print_tb(sys.exc_info()[2])
        print("\n")