def geturls(urls, concurrency=4, timeout=5):
    """Fetch every url in ``urls`` and return the collected results.

    One ``Job`` is created per url, all sharing a single ``SimpleHandler``
    whose ``preprocess`` hook stores a ``{'url': ..., 'document': ...}``
    dict (built from ``job.url`` and ``job.data``) on a gevent queue. The
    jobs are run by a ``Crawler`` and the queue is then handed to
    ``queueitems``, whose result is returned.

    The call is meant to be synchronous from the caller's point of view
    while the crawler fetches the urls concurrently.

    Args:
        urls: Iterable of urls to fetch.
        concurrency: Passed through to ``Crawler`` as ``concurrency``.
        timeout: Passed through to ``Crawler`` as ``timeout``.

    Returns:
        Whatever ``queueitems`` produces from the queue of result dicts.
    """
    # Imported locally, as in the original, so gevent is only required
    # when this helper is actually used.
    from gevent import queue

    results = queue.Queue()

    def collect(job):
        # Keep the url next to its data so callers can tell results apart.
        results.put({'url': job.url, 'document': job.data})

    handler = SimpleHandler(preprocess=collect)
    # Each job refers back to the shared handler, and the handler owns
    # the full job list.
    handler.jobs = [Job(url, handler=handler) for url in urls]

    Crawler(handler, concurrency=concurrency, timeout=timeout).start()
    return queueitems(results)
def startjobs(jobs, concurrency=4, timeout=2, handler=None):
    """Run a collection of jobs through a ``Crawler``.

    The crawler is driven by a handler chosen from the ``handler`` kwarg:

    * a ``BaseHandler`` instance is used as-is;
    * any other truthy value is treated as a preprocess callback and
      wrapped in ``SimpleHandler(preprocess=handler)``;
    * otherwise a plain ``BaseHandler()`` is created. The original
      documentation describes its callbacks as no-ops, so in that case
      the jobs should carry their own handlers.

    Args:
        jobs: The jobs to run; assigned to ``handler.jobs``.
        concurrency: Passed through to ``Crawler`` as ``concurrency``.
        timeout: Passed through to ``Crawler`` as ``timeout``.
        handler: Optional ``BaseHandler`` instance or preprocess callback.

    Returns:
        None.
    """
    # Check the type before truthiness: a BaseHandler instance that
    # happens to evaluate false (e.g. a subclass defining __len__ or
    # __bool__) must still be honoured, not silently replaced.
    if not isinstance(handler, BaseHandler):
        if handler:
            handler = SimpleHandler(preprocess=handler)
        else:
            handler = BaseHandler()

    handler.jobs = jobs
    crawler = Crawler(handler, concurrency=concurrency, timeout=timeout)
    crawler.start()