コード例 #1
0
    def fetch_url():

        suffix = yield q.get()
        current_url = baseURL + suffix

        try:
            if suffix not in fetched:

                user = suffix[1::]
                full_name, email, location = yield get_info(current_url)

                print user, email, location, len(fetched), q.qsize()
                DBAccessor.insert_info(user, full_name, email, location)

                urls = yield get_followers_url(current_url)
                fetched.add(current_url)

                for new_url in urls:
                    if new_url not in fetched and q.qsize() < queueSize:
                        yield q.put(new_url)

        finally:
            q.task_done()
コード例 #2
0
def main():
    q = queues.Queue()
    start = time.time()
    fetched = DBAccessor.get_all_user_path()

    @gen.coroutine
    def fetch_url():

        suffix = yield q.get()
        current_url = baseURL + suffix

        try:
            if suffix not in fetched:

                user = suffix[1::]
                full_name, email, location = yield get_info(current_url)

                print user, email, location, len(fetched), q.qsize()
                DBAccessor.insert_info(user, full_name, email, location)

                urls = yield get_followers_url(current_url)
                fetched.add(current_url)

                for new_url in urls:
                    if new_url not in fetched and q.qsize() < queueSize:
                        yield q.put(new_url)

        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    q.put(rootURL)

    print 'start working...'

    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=2000))
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))
コード例 #3
0
                for new_url in urls:
                    if new_url not in fetched and q.qsize() < queueSize:
                        yield q.put(new_url)

        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        """Run ``fetch_url`` repeatedly, one call at a time."""
        while True:
            # Start the next fetch and wait for it before looping again.
            pending_fetch = fetch_url()
            yield pending_fetch

    q.put(rootURL)

    print 'start working...'

    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=2000))
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))


if __name__ == '__main__':
    # Script entry point: configure logging, run the crawl to completion on
    # the current IOLoop, then close the database.
    import logging
    logging.basicConfig()
    io_loop = ioloop.IOLoop.current()
    try:
        io_loop.run_sync(main)
    finally:
        # run_sync re-raises any exception from main (the Tornado docs
        # describe this), so close the database here rather than after the
        # call; otherwise a failed or timed-out crawl would skip close_db().
        DBAccessor.close_db()