def run(user_crawl, cache=None, num_workers=10, max_connections=10):
    """Run the given crawler until all workers and background threads finish.

    Three janus LIFO queues (download, scrape, cache) connect the asyncio
    ``crawler`` workers with two background threads (``threaded_cache`` and
    ``threaded_scrape``) started on the event loop's default executor.

    Args:
        user_crawl: Crawl definition. This function reads ``user_crawl.start``
            to seed the cache queue and may set ``user_crawl.writer.mode``
            to ``'a'`` when a previously saved queue is restored.
        cache: Cache object handed to ``state.load_queue`` /
            ``state.save_queue`` / ``state.clear_queue`` and to the cache
            thread. When ``None``, a ``storage.PersistentDict`` at the hidden
            ``cache.db`` path is created.
        num_workers: Number of ``crawler`` coroutines to schedule.
        max_connections: ``limit`` passed to ``aiohttp.TCPConnector``.
    """
    loop = asyncio.get_event_loop()
    # use a stack for depth first traversal, to spread requests over the website
    dl_queue = janus.LifoQueue(loop=loop)
    scrape_queue = janus.LifoQueue(loop=loop)
    cache_queue = janus.LifoQueue(loop=loop)

    # Test against None explicitly: a caller-supplied cache that happens to be
    # empty is falsy and must not be swapped for the default store.
    if cache is None:
        cache = storage.PersistentDict(common.get_hidden_path('cache.db'))

    if CACHE_QUEUE and state.load_queue(cache, dl_queue.sync_q, scrape_queue.sync_q):
        # successfully loaded the cached queue
        logger.info('Loaded queue - downloads: {} scrapes: {}'.format(
            dl_queue.sync_q.qsize(), scrape_queue.sync_q.qsize()))
        user_crawl.writer.mode = 'a'
    else:
        logger.debug('Default queue')
        cache_queue.sync_q.put(user_crawl.start)

    signal.signal(signal.SIGINT, signal_handler)
    connector = aiohttp.TCPConnector(limit=max_connections)
    proxy_manager = network.ProxyManager(proxy_file='proxies.txt')

    # run background thread to load from and save to cache
    cache_future = loop.run_in_executor(
        None, threaded_cache, cache,
        dl_queue.sync_q, cache_queue.sync_q, scrape_queue.sync_q)
    # run background thread to manage scraping
    scrape_future = loop.run_in_executor(
        None, threaded_scrape, user_crawl,
        dl_queue.sync_q, cache_queue.sync_q, scrape_queue.sync_q)

    # NOTE(review): newer aiohttp releases only support ``async with`` for
    # ClientSession -- confirm against the pinned aiohttp version.
    with aiohttp.ClientSession(loop=loop, connector=connector) as session:
        # Wrap each coroutine in a future up front: asyncio.wait() no longer
        # accepts bare coroutine objects on recent Python versions.
        tasks = [
            asyncio.ensure_future(
                crawler(task_id, session, dl_queue.async_q,
                        cache_queue.async_q, scrape_queue.async_q,
                        proxy_manager),
                loop=loop)
            for task_id in range(num_workers)
        ]
        loop.run_until_complete(asyncio.wait(tasks))
        loop.run_until_complete(cache_future)
        loop.run_until_complete(scrape_future)

        if CACHE_QUEUE:
            logger.info('Caching queue state')
            state.save_queue(cache, dl_queue.sync_q, scrape_queue.sync_q)
        else:
            logger.debug('Clearing queue state')
            state.clear_queue(cache)
    loop.close()
def test_order(self):
    """Items put on the async side come back in last-in, first-out order."""
    lifo = janus.LifoQueue(loop=self.loop)
    async_side = lifo.async_q

    for value in (1, 3, 2):
        async_side.put_nowait(value)

    # Drain exactly the three items that were pushed above.
    popped = []
    for _ in range(3):
        popped.append(async_side.get_nowait())

    self.assertEqual([2, 3, 1], popped)
    # The non-blocking calls must not leave the sync-side mutex held.
    self.assertFalse(lifo._sync_mutex.locked())

    lifo.close()
    self.loop.run_until_complete(lifo.wait_closed())
async def test_order(self):
    """Items put on the async side come back in last-in, first-out order."""
    lifo = janus.LifoQueue()
    async_side = lifo.async_q

    for value in (1, 3, 2):
        async_side.put_nowait(value)

    # Drain exactly the three items that were pushed above.
    popped = []
    for _ in range(3):
        popped.append(async_side.get_nowait())

    assert [2, 3, 1] == popped
    # The non-blocking calls must not leave the sync-side mutex held.
    assert not lifo._sync_mutex.locked()

    lifo.close()
    await lifo.wait_closed()