import eventlet

# DB, agents, config, and urlAllowed come from elsewhere in the project.

def processCrawlJob(crawlJob):
    # Take the URL off the queue while an agent works on it, then re-queue it
    # afterwards so the site keeps being re-crawled on later passes.
    DB.removeFromCrawlQueue(crawlJob.url)
    resp = callAgent(crawlJob)
    processAgentResponse(resp)
    DB.addToCrawlQueue(crawlJob.url)
    crawlJob.success = True
    return crawlJob
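# --- Hedged sketch, not part of the original listing: CrawlJob, callAgent,
# and processAgentResponse are defined elsewhere in the project. The shapes
# below are assumptions inferred purely from how they are used in this file;
# the 'links' field and the use of requests as the transport are guesses. ---
import requests

class CrawlJob(object):
    def __init__(self, agentName, agentUrl, url):
        self.agentName = agentName  # which crawl agent handles the job
        self.agentUrl = agentUrl    # HTTP endpoint of that agent
        self.url = url              # page the agent should fetch
        self.success = False        # flipped to True when the job completes

def callAgent(crawlJob):
    # Hand the target URL to the agent and return its decoded JSON result.
    resp = requests.post(crawlJob.agentUrl, json={'url': crawlJob.url})
    return resp.json()

def processAgentResponse(cr):
    # Flag error state on the result, then feed any newly discovered links
    # back into the crawl queue.
    flagErrors(cr)
    for link in cr.get('links', []):
        if urlAllowed(link) and not DB.inCrawlQueue(link):
            DB.addToCrawlQueue(link)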
def flagErrors(cr):
    # This check appeared as a bare fragment in the original listing; the
    # helper name is assumed. It marks a crawl result that came back with
    # server- or browser-side errors.
    if len(cr['serverErrors']) > 0 or len(cr['browserErrors']) > 0:
        cr['errorsPresent'] = True

running = True

if __name__ == '__main__':
    # One shared green-thread pool, sized for four concurrent jobs per agent.
    pool = eventlet.GreenPool(size=4 * len(agents))
    DB.ensure_indexes()

    # Seed the queue with the configured start URL on the first run.
    if not DB.inCrawlQueue(config['startUrl']):
        DB.addToCrawlQueue(config['startUrl'])

    while running:
        for crawlDoc in DB.getCrawlQueue():
            if urlAllowed(crawlDoc['url']):
                # Fan the URL out to every agent as its own green thread.
                for agent in agents:
                    job = CrawlJob(agent['name'], agent['url'], crawlDoc['url'])
                    pool.spawn(processCrawlJob, job)
            else:
                print "Removing URL: ", crawlDoc['url']
                DB.removeFromCrawlQueue(crawlDoc['url'])