Example 1
import asyncio
import signal

import aiohttp
import janus

# common, network, state, storage, logger, CACHE_QUEUE, signal_handler,
# threaded_cache, threaded_scrape and crawler are defined elsewhere in the
# surrounding project.


def run(user_crawl, cache=None, num_workers=10, max_connections=10):
    """Run the given crawler."""
    loop = asyncio.get_event_loop()
    # Use stacks (LIFO) for depth-first traversal, which spreads requests
    # across the website instead of concentrating them on one section.
    dl_queue = janus.LifoQueue(loop=loop)
    scrape_queue = janus.LifoQueue(loop=loop)
    cache_queue = janus.LifoQueue(loop=loop)
    cache = cache or storage.PersistentDict(common.get_hidden_path('cache.db'))

    if CACHE_QUEUE and state.load_queue(cache, dl_queue.sync_q,
                                        scrape_queue.sync_q):
        # Successfully loaded the cached queue; append to the existing output.
        logger.info('Loaded queue - downloads: {} scrapes: {}'.format(
            dl_queue.sync_q.qsize(), scrape_queue.sync_q.qsize()))
        user_crawl.writer.mode = 'a'
    else:
        logger.debug('Default queue')
        cache_queue.sync_q.put(user_crawl.start)

    signal.signal(signal.SIGINT, signal_handler)
    connector = aiohttp.TCPConnector(limit=max_connections)
    proxy_manager = network.ProxyManager(proxy_file='proxies.txt')
    # Background thread to load from and save to the cache.
    cache_future = loop.run_in_executor(None, threaded_cache, cache,
                                        dl_queue.sync_q, cache_queue.sync_q,
                                        scrape_queue.sync_q)
    # Background thread to manage scraping.
    scrape_future = loop.run_in_executor(None, threaded_scrape, user_crawl,
                                         dl_queue.sync_q, cache_queue.sync_q,
                                         scrape_queue.sync_q)
    # Note: `with` (not `async with`) on ClientSession and the loop= arguments
    # reflect the older aiohttp/janus APIs this code targets.
    with aiohttp.ClientSession(loop=loop, connector=connector) as session:
        # Spawn num_workers crawler coroutines sharing the one HTTP session.
        tasks = [
            crawler(task_id, session, dl_queue.async_q, cache_queue.async_q,
                    scrape_queue.async_q, proxy_manager)
            for task_id in range(num_workers)
        ]
        loop.run_until_complete(asyncio.wait(tasks))
    # Wait for the background cache and scrape threads to finish.
    loop.run_until_complete(cache_future)
    loop.run_until_complete(scrape_future)
    if CACHE_QUEUE:
        logger.info('Caching queue state')
        state.save_queue(cache, dl_queue.sync_q, scrape_queue.sync_q)
    else:
        logger.debug('Clearing queue state')
        state.clear_queue(cache)
    loop.close()
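
The crawler() worker, threaded_cache() and threaded_scrape() are defined elsewhere in the project. As a rough sketch only, a worker shaped like crawler() might look like the following; the queue protocol, stop condition and error handling here are assumptions, not the project's actual code:

async def crawler(task_id, session, dl_queue, cache_queue, scrape_queue,
                  proxy_manager):
    """Hypothetical worker: pull a URL, fetch it, hand the body off."""
    while True:
        try:
            url = dl_queue.get_nowait()
        except asyncio.QueueEmpty:
            return  # queue drained; let this worker finish
        try:
            # A real worker would presumably consult proxy_manager here to
            # route the request through a proxy.
            async with session.get(url) as response:
                html = await response.text()
            # Queue the page for the scrape thread, which reads the sync side.
            scrape_queue.put_nowait((url, html))
        finally:
            dl_queue.task_done()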
Example 2
    def test_order(self):
        _q = janus.LifoQueue(loop=self.loop)
        q = _q.async_q
        for i in [1, 3, 2]:
            q.put_nowait(i)

        # LIFO: items come back in reverse insertion order.
        items = [q.get_nowait() for _ in range(3)]
        self.assertEqual([2, 3, 1], items)

        self.assertFalse(_q._sync_mutex.locked())
        _q.close()
        self.loop.run_until_complete(_q.wait_closed())
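
The same queue is also exposed through a blocking sync_q. A sync-side counterpart to this test could look like the following sketch, written in the same style but not part of the original suite:

    def test_sync_order(self):
        _q = janus.LifoQueue(loop=self.loop)
        q = _q.sync_q
        for i in [1, 3, 2]:
            q.put(i)

        # The same LIFO contract, seen from the blocking side.
        items = [q.get() for _ in range(3)]
        self.assertEqual([2, 3, 1], items)

        _q.close()
        self.loop.run_until_complete(_q.wait_closed())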
Example 3
    async def test_order(self):
        # Modern janus API: no loop argument; the queue binds to the running loop.
        _q = janus.LifoQueue()
        q = _q.async_q
        for i in [1, 3, 2]:
            q.put_nowait(i)

        items = [q.get_nowait() for _ in range(3)]
        assert [2, 3, 1] == items

        assert not _q._sync_mutex.locked()
        _q.close()
        await _q.wait_closed()
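
All three examples lean on janus's core feature: one queue with a blocking sync side and an awaitable async side. A minimal self-contained sketch of that bridge, written against current janus (where the queue must be created inside a running loop):

import asyncio
import threading

import janus


def producer(sync_q):
    # Runs in a plain thread: blocking puts, no event loop needed here.
    for i in range(3):
        sync_q.put(i)


async def main():
    queue = janus.LifoQueue()  # create inside the running loop
    worker = threading.Thread(target=producer, args=(queue.sync_q,))
    worker.start()
    for _ in range(3):
        item = await queue.async_q.get()  # awaits without blocking the loop
        queue.async_q.task_done()
        print('got', item)
    worker.join()
    queue.close()
    await queue.wait_closed()


asyncio.run(main())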