def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()
    fetched_links = set()
    sq = queues.Queue()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return
            # print('fetching %s' % current_url)
            fetching.add(current_url)
            links = yield get_links_from_url(current_url)
            for link in links:
                fetched_links.add(link)
            fetched.add(current_url)
        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    for base_url in base_url_list:
        q.put(base_url)

    # Start workers, then wait for the work queue to be empty.
    for cur_page in xrange(1, concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))

    @gen.coroutine
    def sworker():
        while True:
            current_url = yield sq.get()
            try:
                # print('fetching %s' % current_url)
                fetching.add(current_url)
                recruit_info = yield get_res_from_url(current_url)
                print recruit_info
            finally:
                sq.task_done()

    # deal with all fetched links, extracting the job info from each
    for link in fetched_links:
        sq.put(link)
    for link in fetched_links:
        sworker()
    yield sq.join(timeout=timedelta(seconds=600))
    print '--OVER--'
def __init__(self, context, detach=False):
    """
    Initialize the Engine with the executor context
    :param context: the parade context used to boot the engine
    :param detach: flag indicating whether the engine runs in detached mode
    :return: the initialized engine
    """
    self.context = context

    # the thread pool used to turn blocking task execution into an async process
    self.thread_pool = ThreadPoolExecutor(4)

    self.wait_queue = queues.Queue()
    self.exec_queue = queues.Queue()

    def engine_loop():
        _ioloop = ioloop.IOLoop.current()
        _ioloop.add_callback(self.daemon_loop)
        _ioloop.start()

    # if detached mode is enabled,
    # use a separate thread to boot the io-loop
    self.loop_thread = None
    if detach:
        logger.debug("engine running in detach mode")
        self.loop_thread = threading.Thread(target=engine_loop)
        self.loop_thread.start()
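The detach branch above hands the IOLoop to a background thread so the calling thread stays free. Below is a minimal, self-contained sketch of that pattern using only the public Tornado API; start_detached_consumer, the print body, and the daemon thread are illustrative choices, not part of the engine above.

import threading

from tornado import gen, ioloop, queues


def start_detached_consumer():
    """Run a queue-draining coroutine on a private IOLoop in a background thread."""
    loop = ioloop.IOLoop(make_current=False)
    queue = queues.Queue()

    @gen.coroutine
    def consume():
        while True:
            item = yield queue.get()
            try:
                print('processing %r' % (item,))
            finally:
                queue.task_done()

    def run():
        loop.make_current()
        loop.add_callback(consume)
        loop.start()

    threading.Thread(target=run, daemon=True).start()
    # tornado queues are not thread-safe, so producers on other threads should
    # enqueue via loop.add_callback(queue.put_nowait, item) rather than put().
    return loop, queue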
def __init__(self, base_url, concurrency=10):
    self.q = queues.Queue()
    self.q2 = queues.Queue()
    self.start = time.time()
    self.fetching = set()
    self.fetched = set()
    self.base_url = base_url
    self.concurrency = concurrency
    self.i = 0
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()
    newq = queues.Queue()

    @tornado.gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return

            print('fetching %s' % current_url)
            fetching.add(current_url)
            urls = yield get_pictures_from_url(current_url)

            fetched.add(current_url)

            for new_url in urls:
                # Only follow links beneath the base URL
                if new_url.startswith(base_url):
                    yield q.put(new_url)

        finally:
            q.task_done()

    @tornado.gen.coroutine
    def download_pic():
        current_url = yield newq.get()
        try:
            print(current_url)
        finally:
            newq.task_done()

    @tornado.gen.coroutine
    def downloader():
        while True:
            yield download_pic()

    @tornado.gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    q.put(base_url)

    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' % (time.time() - start, len(fetched)))
def mainx():
    start = time.time()
    fetched = 0

    client = MongoClient('mongodb://localhost:27017/')
    db = client['posts']
    cookies = {
        'wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf':
            'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7'
            'C812c5106ea45baeae74102845a2c6d269de6b7547e85a5613b575aa9c8708add',
        'wordpress_0efdf49af511fd88681529ef8c2e5fbf':
            'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7'
            'C0edb104a0e34927a3c18e3fc4f10cc051153b1252f1f74efd7b57d21613e1f92'
    }

    post_queue = queues.Queue()
    page_queue = queues.Queue()

    for i in range(1, 69):
        page_url = "http://python.jobbole.com/all-posts/page/{page}/".format(page=i)
        page_queue.put(page_url)
        print(page_url)

    @gen.coroutine
    def posts_url_worker():
        while True:
            page = yield page_queue.get()
            urls = yield get_posts_url_from_page(page)
            for u in urls:
                post_queue.put(u)
            page_queue.task_done()

    @gen.coroutine
    def post_data_worker():
        nonlocal fetched
        while True:
            url = yield post_queue.get()
            post = yield get_post_data_from_url(url, cookies)
            fetched += 1
            db.posts.insert_one(post)
            post_queue.task_done()

    for _ in range(concurrency):
        posts_url_worker()

    for _ in range(concurrency):
        post_data_worker()

    yield page_queue.join()
    yield post_queue.join()
    # yield q.join(timeout=timedelta(seconds=300))
    print('Crawled %s posts in %d seconds total.' % (fetched, time.time() - start))
def elysium(mocker):
    config = Config(mocker.Mock())
    sentry_wrapper = mocker.Mock()

    mocker.patch.object(_AppsCache, 'make_control_ch',
                        return_value=make_mock_channel_with(0))

    node = mocker.Mock()
    node.start_app = mocker.Mock(side_effect=make_mock_channels_list_with(
        xrange(count_apps(to_run_apps))))
    node.control = mocker.Mock(side_effect=make_mock_channels_list_with(
        xrange(count_apps(to_run_apps))))

    submitter = mocker.Mock()
    submitter.post_committed_state = mocker.Mock(
        return_value=make_mock_channel_with(0))

    return burlak.AppsElysium(
        Context(
            LoggerSetup(make_logger_mock(mocker), False),
            config,
            '0',
            sentry_wrapper,
            mocker.Mock(),
        ),
        CommittedState(),
        node,
        queues.Queue(),
        submitter)
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        stream = yield TCPClient().connect(options.host, options.port)
        print("Connected (#%s)" % current_url)
        yield stream.write(("%d\n" % current_url).encode())
        reply = yield stream.read_until(b"\n")
        print("Response from server (#%s)" % current_url)
        q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    for _ in range(10000):
        q.put(_)

    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
def __init__(self, access_key, brain_api_url, driver, recording_file):
    self.access_key = access_key
    self.brain_api_url = brain_api_url
    self.driver = driver
    self.recording_file = recording_file
    if self.recording_file:
        self.recording_queue = queues.Queue()
def call(self, api_function, *args, **kwargs):
    """Call a boto2 or boto3 api function.

    Simply invoke this with an api method and its args and kwargs.

    The api function call is coordinated synchronously across all calls to
    this `api_call_queue`, and they will run in order. I.e., if you invoke
    this right after another coroutine invoked this, it will block until
    that other coroutine's call completed.

    If the call ends up being rate limited, it will back off and try again
    continuously. By serializing the api calls to the specific method, this
    prevents the stampeding herd effect that you'd normally get with
    infinite retries. There is no limit or timeout on how many times it
    will retry, so in practice this may block an extremely long time if all
    responses are rate limit exceptions.

    Any other failures, like connection timeouts or read timeouts, will
    bubble up immediately and won't be retried here.
    """
    result_queue = queues.Queue(maxsize=1)
    yield self._queue.put((result_queue, api_function, args, kwargs))
    result = yield result_queue.get()
    if isinstance(result, Exception):
        raise result
    raise gen.Return(result)
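The snippet above only shows the caller's half of the pattern its docstring describes: every call travels through a shared request queue together with a one-slot reply queue that acts like a future. Below is a self-contained sketch of the whole round trip; drain, call, and the pow example are illustrative names, and the retry-on-rate-limit behaviour of the real class is omitted.

from tornado import gen, ioloop, queues


@gen.coroutine
def serialized_caller_demo():
    request_queue = queues.Queue()

    @gen.coroutine
    def drain():
        # A single drain coroutine serializes all calls: requests are handled
        # strictly in the order they were enqueued.
        while True:
            result_queue, func, args, kwargs = yield request_queue.get()
            try:
                result = func(*args, **kwargs)
            except Exception as e:  # hand the exception back to the caller
                result = e
            result_queue.put_nowait(result)
            request_queue.task_done()

    drain()

    @gen.coroutine
    def call(func, *args, **kwargs):
        # Each call gets its own one-slot reply queue, used like a future.
        result_queue = queues.Queue(maxsize=1)
        yield request_queue.put((result_queue, func, args, kwargs))
        result = yield result_queue.get()
        if isinstance(result, Exception):
            raise result
        raise gen.Return(result)

    answer = yield call(pow, 2, 10)
    print(answer)  # 1024


if __name__ == '__main__':
    ioloop.IOLoop.current().run_sync(serialized_caller_demo)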
def post(self, *args, **kwargs):
    # we will use the plagiarism system's permission model
    import probCrawler
    self._q = queues.Queue()
    self.filterOJ = set([])
    self.infoDict = {}
    # email = self.get_argument('email')
    self.query = probCrawler.crawler(queryName='')
    # accessKey = self.get_argument('accessKey')
    from codePlag.models import filePermitArchive
    # need to check permission first
    # if not filePermitArchive.filter(email=email, isAuth=True, isBanned=False, auth_key=accessKey).exists():
    #     self._reason = 'The supplied email or access key is invalid / your email has not been verified'
    #     self._status_code = 500
    #     self.write_error(500)
    #     return
    bulkAccount = self.get_argument('namelist')
    # split the text into lines
    accountList = bulkAccount.split()
    if len(accountList) > 50:
        self._reason = 'Bulk query exceeds the maximum of 50 accounts'
        self._status_code = 500
        self.write_error(500)
    # print(accountList)
    print accountList
    for account in accountList:
        self.genTask(name=account)
    yield self._q.join(timeout=timedelta(seconds=100))
    print self.infoDict
    import tornado.escape
    self.write(tornado.escape.json_encode(self.infoDict))
def genTask(self, name=''):
    self._q = queues.Queue()
    name = self.name
    import probCrawler
    self.name = name
    # traverse the rules for OJs that require no authentication
    for oj, website, acRegex, submitRegex in self.query.getNoAuthRules():
        success = False
        otherInfo = 0
        # build the URL
        url = website % name
        # put it into the queue
        yield self._q.put((oj, url))

    # ACDream, Codeforces, vjudge, uestc
    oj = 'uestc'
    url = 'http://acm.uestc.edu.cn/user/userCenterData/%s' % name
    yield self._q.put((oj, url))

    oj = 'acdream'
    url = 'http://acdream.info/user/%s' % name
    yield self._q.put((oj, url))

    oj = 'vjudge'
    url = 'https://cn.vjudge.net/user/solveDetail/%s' % name
    yield self._q.put((oj, url))

    oj = 'codeforces'
    loopFlag = True
    loopTimes = 0
    count = 1000
    startItem = 1 + loopTimes * count
    url = 'http://codeforces.com/api/user.status?handle=%s&from=%s' % (
        name, startItem)
    yield self._q.put((oj, url))

    for _ in range(100):
        # start 100 worker coroutines
        self.worker()
async def main():
    seen_set = set()
    q = queues.Queue()

    async def fetch_url(current_url):
        # producer
        if current_url in seen_set:
            return
        print(f"fetching: {current_url}")
        seen_set.add(current_url)
        next_urls = await get_url_links(current_url)
        for new_url in next_urls:
            if new_url.startswith(base_url):
                # put it into the queue; awaiting here lets this coroutine yield
                # control when the queue is full, so a get() coroutine can run instead
                await q.put(new_url)

    async def worker():
        async for url in q:
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                print("Exception")
            finally:
                # one item has been consumed; decrement the unfinished count
                q.task_done()

    await q.put(base_url)
    workers = gen.multi([worker() for _ in range(concurrency)])
    await q.join()
    for _ in range(concurrency):
        await q.put(None)
    await workers
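Since main here is a native coroutine, something still has to drive the event loop; a typical entry point (assuming base_url, concurrency, and get_url_links are defined at module level) would be:

from tornado import ioloop

if __name__ == '__main__':
    # run_sync drives the coroutine to completion and then stops the loop
    ioloop.IOLoop.current().run_sync(main)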
def __init__(self, stream, server):
    self.name = None
    self.is_alive = True
    self.is_ready = False
    self.stream = stream
    self.message_queue = queues.Queue()
    self.server = server
    self.inital_data = None
    self.inital_data_loaded = locks.Condition()
    self.handlers = {
        (message_type.CHAT, chat_message.BROADCAST): self.broadcast_chat,
        (message_type.LOBBY, lobby_message.READY): self.ready,
        (message_type.LOBBY, lobby_message.START_GAME): self.load_inital_data,
        (message_type.LOBBY, lobby_message.UNREADY): self.unready,
        (message_type.GAME, game_message.READY): self.game_ready,
        (message_type.GAME, game_message.UPDATE):
            self.send_to_game((message_type.GAME, game_message.UPDATE)),
        (message_type.GAME, game_message.CALL):
            self.send_to_game((message_type.GAME, game_message.CALL)),
        (message_type.GAME, game_message.ACTION_APPEND):
            self.send_to_game((message_type.GAME, game_message.ACTION_APPEND)),
        (message_type.GAME, game_message.ACTION_REMOVE):
            self.send_to_game((message_type.GAME, game_message.ACTION_REMOVE)),
    }
def readone(comm):
    """
    Read one message at a time from a comm that reads lists of
    messages.
    """
    try:
        q = _readone_queues[comm]
    except KeyError:
        q = _readone_queues[comm] = queues.Queue()

        @gen.coroutine
        def background_read():
            while True:
                try:
                    messages = yield comm.read()
                except CommClosedError:
                    break
                for msg in messages:
                    q.put_nowait(msg)
            q.put_nowait(None)
            del _readone_queues[comm]

        background_read()

    msg = yield q.get()
    if msg is None:
        raise CommClosedError
    else:
        raise gen.Return(msg)
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return
            print('fetching {}'.format(current_url))
            fetching.add(current_url)
            urls = yield get_link_from_url(current_url)
            fetched.add(current_url)
            for new_url in urls:
                if new_url.startswith(BASE_URL) and new_url not in fetched:
                    yield q.put(new_url)
        finally:
            # mark this item as processed so q.join() can complete
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    q.put(BASE_URL)

    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in {} seconds, fetched {} urls'.format(time.time() - start, len(fetched)))
def test_blocking_get_wait(self):
    q = queues.Queue()  # type: queues.Queue[int]
    q.put(0)
    self.io_loop.call_later(0.01, q.put, 1)
    self.io_loop.call_later(0.02, q.put, 2)
    self.assertEqual(0, (yield q.get(timeout=timedelta(seconds=1))))
    self.assertEqual(1, (yield q.get(timeout=timedelta(seconds=1))))
def __init__(self, urls, concurrency):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()
def test_put_timeout_preempted(self):
    q = queues.Queue(1)  # type: queues.Queue[int]
    q.put_nowait(0)
    put = q.put(1, timeout=timedelta(seconds=0.01))
    q.get()
    yield gen.sleep(0.02)
    yield put  # No TimeoutError.
def test_order(self):
    q = queues.Queue()  # type: queues.Queue[int]
    for i in [1, 3, 2]:
        q.put_nowait(i)
    items = [q.get_nowait() for _ in range(3)]
    self.assertEqual([1, 3, 2], items)
def __init__(self, endpoints=LOCATOR_DEFAULT_ENDPOINTS, io_loop=None):
    self.io_loop = io_loop or IOLoop.current()
    self.endpoints = endpoints
    self._lock = Lock()

    self.counter = itertools.count(1)

    self.pipe = None
    self.target = Defaults.app
    self.verbosity = DEBUG_LEVEL
    self.queue = queues.Queue(10000)

    # the level could be reset from update_verbosity in the future
    if not fallback_logger.handlers:
        sh = logging.StreamHandler()
        sh.setFormatter(
            logging.Formatter(
                fmt="[%(asctime)s.%(msecs)d] %(levelname)s fallback %(message)s",
                datefmt="%z %d/%b/%Y:%H:%M:%S"))
        sh.setLevel(logging.DEBUG)
        fallback_logger.addHandler(sh)

    self._send()

    try:
        uuid = Defaults.uuid
        self._defaultattrs = [("uuid", uuid)]
    except GetOptError:
        self._defaultattrs = []
def __procon(self):
    """
    producer-consumer concurrency (parallelism)
    """
    start = time.time()
    q = queues.Queue(maxsize=self.concurrency)

    @gen.coroutine
    def worker():
        while True:
            d = yield q.get()
            try:
                yield self.consumer(d)
            except Exception as e:
                print("PRO_CON:" + str(e), d)
            q.task_done()

    # consumer workers
    for i in range(self.concurrency):
        worker()

    # producer
    for d in self.producer():
        yield q.put(d)

    yield q.join(timeout=timedelta(seconds=300))
    print("concurrency task done in %d seconds" % (time.time() - start))
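The maxsize=self.concurrency above is what throttles the producer: once the queue is full, q.put(d) returns a Future that stays pending until a worker frees a slot, so the producing coroutine suspends rather than racing ahead. A small self-contained demonstration of that blocking behaviour (names and values are illustrative):

from tornado import gen, ioloop, queues


@gen.coroutine
def bounded_put_demo():
    q = queues.Queue(maxsize=2)
    yield q.put('a')
    yield q.put('b')
    blocked = q.put('c')       # queue is full: this put Future stays pending
    print(blocked.done())      # False
    print((yield q.get()))     # 'a'; consuming one item frees a slot
    yield blocked              # the pending put now completes
    print(q.qsize())           # 2 ('b' and 'c')


if __name__ == '__main__':
    ioloop.IOLoop.current().run_sync(bounded_put_demo)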
def __init__(self, name, countries, session=new_session()):
    """
    Create a search worker.

    :param name: the name of the worker
    :param countries: the countries this search_worker should search, given as a list
    """
    super(SearchWorker, self).__init__()
    self.name = name
    self.queue = queues.Queue()
    self.countries = countries
    self.current_searching_countries = []
    self.skip = 0
    self.cookies = None
    self.session = session
    self.client = httpclient.AsyncHTTPClient()
    self.search_done = False
    self.cookies_update_lock = locks.Lock()
    self.cookies_updating = False
    self.countries_cache = {}
    self.total_index = 0
    workers = []
    # on average each page returns about 10 detail records, so creating
    # 10 detail workers keeps the queue roughly balanced
    for i in range(10):
        workers.append(DetailWorker(self, i, self.session))
    self.workers = workers
def get(self):
    print("plain opened")
    self.job = None
    try:
        job_id = self.request.query_arguments["job"][0]
    except (KeyError, IndexError):
        self.write(b"no job id given\n")
        return

    job_id = job_id.decode(errors='replace')

    try:
        self.job = jobs.get_existing(job_id)
    except ValueError:
        self.write(("no such job: " + repr(job_id) + "\n").encode())
        return
    else:
        self.queue = queues.Queue()
        self.job.watch(self)

    while True:
        update = yield self.queue.get()

        if update is StopIteration:
            return

        if isinstance(update, StdOut):
            self.write(update.data.encode())
            self.flush()
def __init__(
    self,
    sessionstore: "SessionManager",
    sid: uuid.UUID,
    hang_interval: int,
    timout: int,
    tid: uuid.UUID,
    endpoint_names: Set[str],
    nodename: str,
    disable_expire_check: bool = False,
) -> None:
    self._sid = sid
    self._interval = hang_interval
    self._timeout = timout
    self._sessionstore: SessionManager = sessionstore
    self._seen: float = time.time()
    self._callhandle = None
    self.expired: bool = False

    self.tid: uuid.UUID = tid
    self.endpoint_names: Set[str] = endpoint_names
    self.nodename: str = nodename

    self._replies: Dict[uuid.UUID, asyncio.Future] = {}

    # Disable expiry in certain tests
    if not disable_expire_check:
        self.check_expire()

    self._queue: queues.Queue[common.Request] = queues.Queue()
    self.client = ReturnClient(str(sid), self)
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched, fetched_page_set = set(), set(), set()

    @gen.coroutine
    def fetch_url():
        current_url_type, current_url = yield q.get()
        try:
            if current_url in fetching:
                return
            if current_url_type == 0 and get_page(
                    current_url) not in fetched_page_set:
                fetching.add(current_url)
                logging.debug('fetching {0}'.format(current_url))
                urls = yield get_reviewers(current_url_type, current_url)
                fetched.add(current_url)
                fetched_page_set.add(get_page(current_url))

                for (url_type, url) in urls:
                    if (url in fetched) or (url_type == 0
                                            and get_page(url) in fetched_page_set):
                        continue
                    yield q.put((url_type, url))
            elif current_url_type == 1 and current_url not in fetched:
                fetching.add(current_url)
                logging.debug('fetching {0}'.format(current_url))
                yield get_contact(current_url)
                fetched.add(current_url)
        except Exception as e:
            logging.error('error: {0}, {1}'.format(e.message, current_url))
            q.put((current_url_type, current_url))
            fetching.remove(current_url)
        finally:
            q.task_done()
def check_instances(self):
    """
    Put the instances to be monitored into a queue, then consume that queue,
    i.e. run an alert check on each instance.
    """
    q = queues.Queue()
    monitoring, monitored = set(), set()
    instances = yield get_instances()
    for instance in instances:
        q.put(instance)

    @gen.coroutine
    def _judge_and_alert():
        current_instance = yield q.get()
        try:
            if current_instance in monitoring:
                return
            monitoring.add(current_instance)
            yield judge_and_alert(current_instance)
            monitored.add(current_instance)
            logging.info("{0} was checked!".format(current_instance))
        finally:
            q.task_done()

    @gen.coroutine
    def _worker():
        while True:
            yield _judge_and_alert()

    for _ in range(ALERT_CONCURRENCY_NUM):
        _worker()

    # checking all instances must complete within one monitoring cycle
    yield q.join(timeout=timedelta(seconds=ALERT_CHECK_CYCLE))
    assert monitoring == monitored
    logging.info("Current monitoring cycle is done!")
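A cycle like this is normally re-run on a fixed schedule. Below is a minimal sketch of driving it with Tornado's PeriodicCallback; check_instances here is a stand-in for the coroutine above, and ALERT_CHECK_CYCLE is assumed to be a number of seconds.

from tornado import gen, ioloop

ALERT_CHECK_CYCLE = 60  # seconds; stand-in value for the real setting


@gen.coroutine
def check_instances():
    # stand-in for the monitoring coroutine above
    print("running one monitoring cycle")
    yield gen.sleep(1)


if __name__ == '__main__':
    # PeriodicCallback takes its period in milliseconds
    ioloop.PeriodicCallback(check_instances, ALERT_CHECK_CYCLE * 1000).start()
    ioloop.IOLoop.current().start()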
def test_blocking_put_wait(self):
    q = queues.Queue(1)  # type: queues.Queue[int]
    q.put_nowait(0)
    self.io_loop.call_later(0.01, q.get)
    self.io_loop.call_later(0.02, q.get)
    futures = [q.put(0), q.put(1)]
    self.assertFalse(any(f.done() for f in futures))
    yield futures
def test_put_with_getters(self):
    q = queues.Queue()  # type: queues.Queue[int]
    get0 = q.get()
    get1 = q.get()
    yield q.put(0)
    self.assertEqual(0, (yield get0))
    yield q.put(1)
    self.assertEqual(1, (yield get1))
def __init__(self, config, port=None):
    super().__init__(config, port)
    self.port = port if port is not None else self.config.server.port
    self.writing_queue = queues.Queue()
    self.callbacks = [
        queue_writer.QueueWriter(self.writing_queue, self.db).process
    ]
    self.path = home.HomeHandler.PATH
def __init__(self, config, port=8889):
    super().__init__(config, port)
    self.port = port if port is not None else self.config.messaging.port
    self.sender = sms_sender.get(self.config)
    self.queue = queues.Queue()
    self.scheduler = scheduler.MessageScheduler(
        config=self.config,
        db=self.db_factory.create(),
        queue=self.queue)
    self.callbacks = [self.process]