Example #1
async def do_loop(urls, loop):
    async with aiohttp.ClientSession(loop=loop) as session:

        async def fetch(url, i=0):
            with aiohttp.Timeout(10, loop=session.loop):
                logger.info("fetch(%s): url=%s", i, url)
                async with session.get(url) as response:
                    text = await response.text()
                    urls = [spiderlib.urljoin(url, spiderlib.remove_fragment(new_url)) for new_url in spiderlib.get_links(text)]
                    logger.info("fetched(%s): url=%s", i, url)
                    return urls

        semaphore = asyncio.Semaphore(10)

        async def access(url):
            async with semaphore:
                return await cacher(lambda url: dispatcher.dispatch(fetch, url), url)

        cacher = lib.Cacher(lambda url: url)
        dispatcher = lib.LimitterDispatcher(lambda url: spiderlib.urlparse(url).netloc, limit_count=2)

        q = asyncio.Queue()
        rq = lib.RecQueue(q)
        for url in urls:
            rq.add(access(url))

        await rq.join()
        while not q.empty():
            urls = await q.get()
            for url in urls:
                rq.add(access(url))
            await rq.join()
    print("yay", rq.ec)
Example #2
def limited_fetch():
    cacher = lib.Cacher(lambda request: (request.domain, request.args))
    dispatcher = lib.LimitterDispatcher(lambda request: request.domain, limit_count=3)

    def coro_fn(request):
        return dispatcher.dispatch(lib.mock_fetch, request)
        # return cacher(lambda request: dispatcher.dispatch(lib.mock_fetch, request), request)
    return coro_fn
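Neither lib.Cacher nor lib.LimitterDispatcher is shown here either. Based purely on how they are called (cacher(coro_fn, arg) memoizes results by a key derived from the argument, dispatcher.dispatch(coro_fn, arg) caps concurrency per key), a rough stand-in could be:

import asyncio
from collections import defaultdict

# Hypothetical stand-ins, not the real lib implementations.
class Cacher:
    """Memoize a coroutine function by a key derived from its argument."""

    def __init__(self, keyfn):
        self.keyfn = keyfn
        self.cache = {}

    async def __call__(self, coro_fn, arg):
        key = self.keyfn(arg)
        if key not in self.cache:
            self.cache[key] = await coro_fn(arg)
        return self.cache[key]


class LimitterDispatcher:
    """Allow at most limit_count concurrent calls per key (e.g. per domain)."""

    def __init__(self, keyfn, limit_count=2):
        self.keyfn = keyfn
        self.semaphores = defaultdict(lambda: asyncio.Semaphore(limit_count))

    async def dispatch(self, coro_fn, arg):
        async with self.semaphores[self.keyfn(arg)]:
            return await coro_fn(arg)

A production Cacher would likely store a Future per key so that concurrent calls for the same key are deduplicated even before the first one finishes.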
Example #3
async def do_loop(requests):
    dispatcher = lib.LimitterDispatcher(lambda request: request.domain,
                                        limit_count=2)
    rq = _RecQueue(
        lambda response:
        [dispatcher.dispatch(lib.mock_fetch, r) for r in response.get_links()])
    for request in requests:
        rq.add(dispatcher.dispatch(lib.mock_fetch, request))
    await rq.join()
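All of the mock-based examples drive lib.mock_fetch over request objects exposing .domain and .args, and get back a response exposing .get_links(). The concrete shapes are not shown; a hypothetical stand-in for experimenting with these snippets:

import asyncio
from collections import namedtuple

# Assumed shapes only; the real lib.mock_fetch / request / response are not shown.
Request = namedtuple("Request", "domain args")


class Response:
    def __init__(self, request, links=()):
        self.request = request
        self.links = list(links)

    def get_links(self):
        return self.links


async def mock_fetch(request):
    await asyncio.sleep(0.1)        # pretend network latency
    return Response(request)        # a real mock would return follow-up Requests here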
Example #4
async def do_loop(requests):
    dispatcher = lib.LimitterDispatcher(lambda request: request.domain,
                                        limit_count=2)

    todo = []
    done = []
    st = time.time()
    for r in requests:
        response = await dispatcher.dispatch(lib.mock_fetch, r)
        done.append(r)
        todo.extend(response.get_links())

    while todo:
        r = todo.pop()
        response = await dispatcher.dispatch(lib.mock_fetch, r)
        done.append(r)
        todo.extend(response.get_links())
    logger.info("takes %s, total %s", time.time() - st, len(done))
Example #5
async def do_loop(requests):
    dispatcher = lib.LimitterDispatcher(lambda request: request.domain,
                                        limit_count=2)
    semaphore = asyncio.Semaphore(10)
    q = asyncio.Queue()

    async def fetch_urls(request):
        async with semaphore:
            response = await dispatcher.dispatch(lib.mock_fetch, request)
            return response.get_links()

    async def do_work():
        while True:
            item = await q.get()
            links = await fetch_urls(item)
            q.task_done()
            for link in links:
                await q.put(link)

    workers = []
    workers = [asyncio.ensure_future(do_work()) for _ in range(12)]

    st = time.time()
    for req in requests:
        await q.put(req)

    while not (q.empty() and q._unfinished_tasks <= 0):  # xxx: relies on Queue internals
        logger.info("loop: unfinished=%s, queue=%s", q._unfinished_tasks,
                    len(q._queue))
        await asyncio.sleep(0.2)
    for w in workers:
        w.cancel()
    logger.info("takes %s, total %s", time.time() - st, "x")
Example #6
async def do_loop(requests):
    dispatcher = lib.LimitterDispatcher(lambda request: request.domain, limit_count=2)
    semaphore = asyncio.Semaphore(10)

    todo = asyncio.Queue()
    done = asyncio.Queue()

    async def to_future(request):
        async with semaphore:
            return await dispatcher.dispatch(lib.mock_fetch, request)

    async def done_worker():
        while True:
            request = await done.get()
            fut = asyncio.ensure_future(to_future(request))
            todo.put_nowait(fut)

    async def todo_worker():
        def callback(fut):
            todo.task_done()
            response = fut.result()
            for request in response.get_links():
                done.put_nowait(request)
        while True:
            fut = await todo.get()
            fut.add_done_callback(callback)

    workers = []
    workers.append(asyncio.ensure_future(done_worker()))
    workers.append(asyncio.ensure_future(todo_worker()))

    st = time.time()
    for req in requests:
        done.put_nowait(req)

    while not (todo._unfinished_tasks == 0 and len(done._queue) == 0):
        print("loop")
        logger.info("loop: todo=%s, done=%s", todo._unfinished_tasks, len(done._queue))
        await asyncio.sleep(0.2)

    for w in workers:
        w.cancel()
    logger.info("takes %s, total %s", time.time() - st, done._unfinished_tasks)
Example #7
async def do_loop(loop):
    cacher = lib.Cacher(lambda url: url)
    dispatcher = lib.LimitterDispatcher(
        lambda url: spiderlib.urlparse(url).netloc, limit_count=2)

    async with aiohttp.ClientSession(loop=loop) as session:

        async def fetch(url):
            with aiohttp.Timeout(10, loop=session.loop):
                async with session.get(url) as response:
                    text = await response.text()
                    urls = [
                        spiderlib.urljoin(url,
                                          spiderlib.remove_fragment(new_url))
                        for new_url in spiderlib.get_links(text)
                    ]
                    return url, urls

        async def consume(supervisor, pipe, i):
            url = await supervisor.pipe.read()
            if url is lib.END_OF_STREAM:
                return
            logger.info("fetch(%s): url=%s", i, url)
            response = await supervisor.consume(
                cacher, lambda request: dispatcher.dispatch(fetch, request),
                url)
            pipe.write_nowait(response)

        async def provide(supervisor, pipe, i):
            st = time.time()
            url, urls = await pipe.read()
            logger.info("fetched(%s): url=%s", i, url)
            for u in urls:
                pipe.write_nowait(u)
            logger.info("takes  (%s): %s", i, time.time() - st)

        pipe = lib.Pipe(asyncio.Queue(), asyncio.Queue())
        supervisor = lib.Supervisor(pipe, provide, consume, concurrency=10)
        base_url = 'http://www.tornadoweb.org/en/stable/'
        base_url2 = 'http://python.org/'
        await supervisor.run_loop([base_url, base_url2])
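A possible driver for the coroutine above, following the explicit-loop style of these snippets (assumed; the original entry point is not shown):

if __name__ == "__main__":
    import asyncio
    loop = asyncio.get_event_loop()
    loop.run_until_complete(do_loop(loop))
    loop.close()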
Example #8
async def do_loop(requests):
    dispatcher = lib.LimitterDispatcher(lambda request: request.domain,
                                        limit_count=2)
    cacher = lib.Cacher(lambda request: (request.domain, request.args))
    rq = lib.RecQueue(asyncio.Queue())
    for request in requests:
        rq.add(
            cacher(
                lambda request: dispatcher.dispatch(lib.mock_fetch, request),
                request))

    await rq.join()
    while not rq.empty():
        response = await rq.get()
        for request in response.get_links():
            rq.add(
                cacher(
                    lambda request: dispatcher.dispatch(
                        lib.mock_fetch, request), request))
        await rq.join()
    print(vars(rq.q))
    print("yay", rq.ec)