Ejemplo n.º 1
0
def geturls(urls, concurrency=4, timeout=5):
    """Crawl every url in ``urls`` and return the collected results.

    One ``Job`` is created per url, all sharing a single ``SimpleHandler``
    whose ``preprocess`` hook pushes ``{'url': ..., 'document': ...}``
    (taken from ``job.url`` and ``job.data``) onto a gevent queue.  A
    ``Crawler`` built with the given ``concurrency`` and ``timeout`` is
    then started, and the queue is handed to ``queueitems`` to produce the
    return value.

    NOTE(review): this presumably relies on ``crawler.start()`` blocking
    until all jobs are done and on ``queueitems`` draining the queue --
    neither is visible here; confirm against their definitions.
    """
    from gevent import queue as gevent_queue

    results = gevent_queue.Queue()

    def collect(job):
        # Record the fetched document together with the url it came from.
        results.put(dict(url=job.url, document=job.data))

    handler = SimpleHandler(preprocess=collect)
    handler.jobs = [Job(url, handler=handler) for url in urls]

    crawler = Crawler(handler, concurrency=concurrency, timeout=timeout)
    crawler.start()
    return queueitems(results)
Ejemplo n.º 2
0
def startjobs(jobs, concurrency=4, timeout=2, handler=None):
    """Fetch a number of jobs with a crawler.

    ``handler`` selects the handler that is given to the crawler:

    * a ``BaseHandler`` instance is used as-is;
    * any other non-``None`` value is wrapped as
      ``SimpleHandler(preprocess=handler)``;
    * ``None`` (the default) falls back to a plain ``BaseHandler()``.  Per
      the original author's note its callbacks are no-ops, so in that case
      the jobs themselves should have handlers set.

    The chosen handler's ``jobs`` attribute is set to ``jobs`` before a
    ``Crawler`` is created with the given ``concurrency`` and ``timeout``
    and started.  Nothing is returned.
    """
    # Compare against None rather than relying on truthiness: a supplied
    # handler object that happens to evaluate falsy (e.g. one defining
    # __len__ or __bool__) must not be silently swapped for a BaseHandler.
    if handler is None:
        handler = BaseHandler()
    elif not isinstance(handler, BaseHandler):
        handler = SimpleHandler(preprocess=handler)
    handler.jobs = jobs
    crawler = Crawler(handler, concurrency=concurrency, timeout=timeout)
    crawler.start()