@gen.coroutine
def fetch_url():
    """Pull one URL suffix off the queue, fetch that user's info and
    followers, persist the info, and enqueue newly discovered followers.

    NOTE(review): this module-level function duplicates the fetch_url
    nested inside main() and references names (q, fetched, baseURL,
    queueSize, get_info, get_followers_url) that are only bound there —
    confirm whether this copy is dead code.  The @gen.coroutine decorator
    (present on the nested copy) was missing here and has been added so
    `yield fetch_url()` resolves as a coroutine.
    """
    suffix = yield q.get()
    current_url = baseURL + suffix
    try:
        if suffix not in fetched:
            user = suffix[1:]  # drop the leading '/' of the path
            full_name, email, location = yield get_info(current_url)
            # print() call form; on Python 2 this needs
            # `from __future__ import print_function` at file top.
            print(user, email, location, len(fetched), q.qsize())
            DBAccessor.insert_info(user, full_name, email, location)
            urls = yield get_followers_url(current_url)
            fetched.add(current_url)
            # Bound the frontier: only enqueue unseen URLs while the
            # queue is below queueSize, so memory use stays capped.
            for new_url in urls:
                if new_url not in fetched and q.qsize() < queueSize:
                    yield q.put(new_url)
    finally:
        # Must run even when fetching fails, or q.join() never completes.
        q.task_done()
@gen.coroutine
def main():
    """Crawl user pages starting from rootURL.

    Spawns `concurrency` worker coroutines sharing a work queue of URL
    suffixes, waits (with a timeout) for the queue to drain, then prints
    elapsed time and the number of fetched URLs.

    Intended to be driven by IOLoop.run_sync(main).  The @gen.coroutine
    decorator is required because the body yields (q.join); it appears to
    have been lost when the file's whitespace was mangled and is restored
    here.
    """
    q = queues.Queue()
    start = time.time()
    # Paths already crawled in earlier runs; used with `in` and .add()
    # below, so presumably a set — confirm DBAccessor.get_all_user_path().
    fetched = DBAccessor.get_all_user_path()

    @gen.coroutine
    def fetch_url():
        """Process one queued suffix: fetch info, store it, enqueue followers."""
        suffix = yield q.get()
        current_url = baseURL + suffix
        try:
            if suffix not in fetched:
                user = suffix[1:]  # drop the leading '/' of the path
                full_name, email, location = yield get_info(current_url)
                print(user, email, location, len(fetched), q.qsize())
                DBAccessor.insert_info(user, full_name, email, location)
                urls = yield get_followers_url(current_url)
                fetched.add(current_url)
                # Bound the frontier: only enqueue unseen URLs while the
                # queue is below queueSize, so memory use stays capped.
                for new_url in urls:
                    if new_url not in fetched and q.qsize() < queueSize:
                        yield q.put(new_url)
        finally:
            # Must run even when fetching fails, or q.join() never completes.
            q.task_done()

    @gen.coroutine
    def worker():
        # Each worker loops forever, handling one queue item per iteration.
        # NOTE(review): an exception escaping fetch_url() kills this worker
        # silently — consider logging it.
        while True:
            yield fetch_url()

    q.put(rootURL)
    print('start working...')
    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=2000))
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))
# NOTE(review): this region of the original file repeated a fragment of
# main()'s body at module level (a bare `finally:` with no `try`, and
# `yield` outside any function) — an unparsable duplicate left over from
# a bad paste.  That dead fragment has been dropped; only the script
# entry point below is real code.
if __name__ == '__main__':
    import logging
    logging.basicConfig()
    # Run the crawler coroutine to completion on the current IOLoop,
    # then release the database handle.
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(main)
    DBAccessor.close_db()