async def crawl(self):
    timeout = aiohttp.ClientTimeout(total=60)
    self.session = aiohttp.ClientSession(timeout=timeout, loop=loop)
    await self.q_url.put(root_url)
    page_first = await self.session.get(root_url)
    await self.q_text.put((BeautifulSoup(await page_first.text(), 'lxml'), root_url))
    async with asyncpool.AsyncPool(loop, num_workers=10, name="workers",
                                   logger=logging.getLogger("Workers"),
                                   worker_co=self.worker_crawl) as pool:
        await pool.push()
        async with asyncpool.AsyncPool(loop, num_workers=10, name="workers",
                                       logger=logging.getLogger("Workers"),
                                       worker_co=self.worker_elastic) as pool2:
            await pool2.push()
            work = asyncio.ensure_future(self.rps_control())
            await self.q_rps.join()
            await self.q_url.join()
            await self.q_text.join()
            for p in pool2._workers:
                p.cancel()
            work.cancel()
            await pool2.join()
            for p in pool._workers:
                p.cancel()
            await pool.join()
    await self.session.close()
    await self.es.close()
async def main(self):
    async with Elasticsearch([{'host': 'localhost', 'port': 9200}]) as es:
        await self.initialize_index(es)
        await self.links.put(self.start_url)
        async with aiohttp.ClientSession() as session:
            async with asyncpool.AsyncPool(
                    self.loop, num_workers=10, name="CrawlerPool",
                    logger=logging.getLogger("CrawlerPool"),
                    worker_co=self.worker) as pool:
                link = await self.links.get()
                await pool.push(link, es, session)
                while True:
                    if not self.links.empty():
                        link = await self.links.get()
                    else:
                        await asyncio.sleep(0.2)
                        if self.links.empty():
                            break
                        link = await self.links.get()
                    await asyncio.sleep(self.sleep_time)
                    await pool.push(link=link, es=es, session=session)
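# Hedged sketch (not from the original project): a worker coroutine matching the
# pool.push(link, es, session) calls above, defined on the same crawler class.
# The index name, parsing, and link handling are illustrative assumptions only.
async def worker(self, link, es, session):
    async with session.get(link) as resp:
        html = await resp.text()
    soup = BeautifulSoup(html, 'lxml')
    # store the page text (hypothetical "pages" index)
    await es.index(index="pages", body={"url": link, "text": soup.get_text()})
    # feed newly discovered absolute links back to the queue the main loop reads
    for a in soup.find_all('a', href=True):
        if a['href'].startswith('http'):
            await self.links.put(a['href'])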
async def run(telegram_bot, bot_checker, bots, stop_event: threading.Event = None) -> Counter:
    result_queue = asyncio.Queue()
    loop = bot_checker.event_loop
    reader_future = asyncio.ensure_future(result_reader(result_queue), loop=loop)

    # TODO: check correct order of bots concerning pings etc.
    async with asyncpool.AsyncPool(
            loop,
            num_workers=settings.BOTCHECKER_CONCURRENT_COUNT,
            name="BotChecker",
            logger=log,
            worker_co=check_bot,
            max_task_time=300,
            log_every_n=settings.BOTCHECKER_CONCURRENT_COUNT,
            expected_total=len(bots),
    ) as pool:
        for to_check in bots:
            # TODO: implement properly
            if stop_event and stop_event.is_set():
                print('JOINING')
                await pool.join()  # join() is a coroutine and must be awaited
            else:
                await pool.push(telegram_bot, bot_checker, to_check, result_queue)

    await result_queue.put(None)
    return await reader_future
async def crawl(self):
    timeout = aiohttp.ClientTimeout(total=60)
    self.session = aiohttp.ClientSession(timeout=timeout, loop=loop)
    async with asyncpool.AsyncPool(loop, num_workers=10, name="workers",
                                   logger=logging.getLogger("Workers"),
                                   worker_co=self.worker_crawl) as pool:
        await pool.push()
        async with asyncpool.AsyncPool(loop, num_workers=10, name="workers",
                                       logger=logging.getLogger("Workers"),
                                       worker_co=self.worker_elastic) as pool2:
            await pool2.push()
            while True:
                self.root_url, self.user_id = await self.q_root_url.get()
                await self.q_url.put(self.root_url)
                # schedule the RPS controller in the background; awaiting
                # ensure_future here would block until it finished
                work = asyncio.ensure_future(self.rps_control())
                await self.q_rps.join()
                await self.q_url.join()
                await self.q_text.join()
                work.cancel()
                stats = CrawlerStats.objects.get(domain=self.root_url)
                stats.time = str(datetime.now())
                stats.save()
async def main(self):
    if not await self.initialize_index(es):
        return
    await self.links.put(self.start_url)
    async with aiohttp.ClientSession() as session:
        async with asyncpool.AsyncPool(
                self.loop, num_workers=10, name="CrawlerPool",
                logger=logging.getLogger("CrawlerPool"),
                worker_co=self.worker) as pool:
            t_begin = time()
            link = await self.links.get()
            await pool.push(link, es, session)
            self.time_statistic.append(time() - t_begin)
            while True:
                if self.stop_signal:
                    break
                time_for_link = time()
                if not self.links.empty():
                    link = await self.links.get()
                else:
                    wait_time = time()
                    while self.links.empty() and self.tmp_id < self.max_count:
                        await asyncio.sleep(0.1)
                        if time() - wait_time > 2:
                            logger.info(f'break at {self.start_url} after '
                                        f'2 seconds waiting')
                            break
                    if self.links.empty():
                        break
                    link = await self.links.get()
                await asyncio.sleep(self.sleep_time)
                await pool.push(link=link, es=es, session=session)
                self.time_statistic.append(time() - time_for_link)
    return {
        'pages': self.tmp_id,
        'avg_time_per_page': sum(self.time_statistic) / self.tmp_id,
        'max_time_per_page': max(self.time_statistic),
        'min_time_per_page': min(self.time_statistic),
    }
async def _make_post_api_call(
        self,
        urls_params: list,
        loop: AbstractEventLoop,
        num_workers: int = 10,
):
    async with asyncpool.AsyncPool(
            loop,
            num_workers=num_workers,
            name="dfsre-reinstate-pool",
            logger=self.log,
            worker_co=self._async_post_api_call) as pool:
        for i, urls_param in enumerate(urls_params):
            await pool.push(i, **urls_param)

    return self.post_results
async def fill_events(_loop, number_per_day, bulk_size):
    async with asyncpool.AsyncPool(_loop, num_workers=inserter_config.WORKERS,
                                   worker_co=write_to_event, max_task_time=300,
                                   log_every_n=10, name="CHPool",
                                   logger=logging.getLogger("CHPool")) as p:
        insert_time = datetime.datetime(2018, 1, 1)
        for i in range(365):
            for _ in range(int(number_per_day / bulk_size)):
                events = generate_random_events(insert_time, bulk_size)
                await p.push(events, None)
            insert_time = insert_time + datetime.timedelta(days=1)
async def run(self):
    asyncio.ensure_future(
        self.feed.start(self.settings.FEED_TOPIC, self.feed_queue))
    asyncio.ensure_future(
        self.stream.start(self.settings.STREAM_TOPIC, self.stream_queue))
    async with asyncpool.AsyncPool(
            loop,
            num_workers=self.settings.AMOUNT_OF_WORKERS,
            name="workers_pool",
            logger=logging.getLogger("AioETL"),
            worker_co=await object_from_settings(settings.WORKER_TYPE,
                                                 self.feed_queue,
                                                 self.stream_queue,
                                                 self.settings).start(),
            max_task_time=300,
            # log_every_n=10
    ) as pool:
        for i in self.feed_queue:
            await pool.push(i, self.stream_queue)
async def get_pokemon_id_list(self, id_list):
    """
    in: list of ids, e.g. [1, 100, 120, 90, ...]
    return: Pokemon instances for these ids
    warning: the returned list is not sorted
    """
    result_queue = asyncio.Queue()
    reader_future = asyncio.ensure_future(self.result_reader_id_list(result_queue),
                                          loop=asyncio.get_running_loop())
    async with asyncpool.AsyncPool(asyncio.get_running_loop(),
                                   num_workers=len(id_list) + 1,
                                   name="GetPokemonListPool",
                                   logger=logging.getLogger("PokemonListIdPool"),
                                   worker_co=self._get_pokemon_id,
                                   max_task_time=config.pool_task_time,
                                   log_every_n=10) as pool:
        for i in id_list:
            await pool.push(i, result_queue)

    await result_queue.put(None)
    return await reader_future
async def get_pokemon_list(self, start_id):
    """
    in: start_id (offset to start from), e.g. 1
    return: sorted list of Pokemon instances with ids from start_id up to
            start_id + pokemons_per_page, e.g. 1 + 6 -> ids [1, 2, 3, 4, 5, 6]
    """
    result_queue = asyncio.Queue()
    reader_future = asyncio.ensure_future(self.result_reader_list(result_queue, start_id),
                                          loop=asyncio.get_running_loop())
    async with asyncpool.AsyncPool(asyncio.get_running_loop(),
                                   num_workers=config.pokemons_per_page,
                                   name="GetPokemonListPool",
                                   logger=logging.getLogger("PokemonListPool"),
                                   worker_co=self._get_pokemon_id,
                                   max_task_time=config.pool_task_time,
                                   log_every_n=10) as pool:
        for i in range(start_id, start_id + config.pokemons_per_page):
            await pool.push(i, result_queue)

    await result_queue.put(None)
    return await reader_future
async def run():
    result_queue = asyncio.Queue()
    reader_future = asyncio.ensure_future(result_reader(result_queue), loop=loop)

    # Start a worker pool of 10 coroutines that invokes `example_coro` for each
    # pushed item and waits for it to complete or for 5 minutes to pass.
    async with asyncpool.AsyncPool(loop, num_workers=10,
                                   name="ExamplePool",
                                   logger=logging.getLogger("ExamplePool"),
                                   worker_co=example_coro,
                                   max_task_time=300,
                                   log_every_n=10) as pool:
        for i in range(50):
            await pool.push(i, result_queue)

    await result_queue.put(None)
    await reader_future
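# The example above assumes two helpers that are not shown. Below is a minimal
# sketch of what they might look like -- only the names and call signatures come
# from the snippet, the bodies are assumptions: each pool.push(i, result_queue)
# call becomes example_coro(i, result_queue), and the reader drains the queue
# until it sees the None sentinel pushed after the pool closes.
async def example_coro(value, result_queue):
    await asyncio.sleep(0.1)            # stand-in for real work
    await result_queue.put(value * 2)   # hand the result to the reader

async def result_reader(queue):
    while True:
        value = await queue.get()
        if value is None:               # sentinel: no more results coming
            break
        print(f"got result: {value}")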
async def get_stats(self, root_path: str, max_image_names: int = None):
    async with asyncpool.AsyncPool(None, 100, 'blob_pool', self._logger,
                                   self._process_blob, raise_on_join=True,
                                   log_every_n=1000) as blob_pool, \
            asyncpool.AsyncPool(None, 100, 'img_pool', self._logger,
                                self._process_image_tag, raise_on_join=True,
                                log_every_n=1000) as image_tag_pool:
        image_num = 0
        async for image_name in self._client.catalog_pager():
            num_tags = 0
            async for tag in self._client.image_tag_pager(image_name):
                num_tags += 1
                await image_tag_pool.push(blob_pool, image_name, tag)
            image_num += 1
            self._logger.info(f"{image_name} pushed num tags: {num_tags}")
            if max_image_names is not None and image_num == max_image_names:
                break

    description = [
        ('Group Name', 'string'),
        ('Parent', 'string'),
        ('Size (size)', 'number'),
    ]

    data_dict = dict(root=[])

    # This needs to be done from parent to child
    g_instances = _BlobGroupInstanceHelper()
    for image_name, blob_groups in self._image_info.items():
        image_unique_size = 0
        img_data = data_dict[image_name] = []
        for blob_group_key, tags in blob_groups.items():
            blob_group_unique_size, parent_blob_group_key = self._get_blob_group_info(
                blob_group_key)
            image_unique_size += blob_group_unique_size
            orig_blob_group_name = blob_group_name = g_instances.new_instance(
                self._get_blob_group_name(blob_group_key))
            while parent_blob_group_key:
                parent_blob_group_name = g_instances.new_instance(
                    self._get_blob_group_name(parent_blob_group_key))
                blob_group_unique_size, parent_blob_group_key = self._get_blob_group_info(
                    parent_blob_group_key)
                img_data.append((parent_blob_group_name, blob_group_name,
                                 blob_group_unique_size))
                blob_group_name = parent_blob_group_name
                # break  # TODO: find better way to show large multi-level trees
            # unfortunately you can't have two nodes point to this
            img_data.append((orig_blob_group_name, "root", blob_group_unique_size))
        data_dict["root"].append((image_name, 'root', image_unique_size))
        img_data.append(("root", None, image_unique_size))
    data_dict["root"].append(("root", None, self._total_blob_size))

    self._logger.info(
        f"Total num blobs: {len(self._blob_to_image_tags)} size: {self._total_blob_size:,}")

    get_treemap(description, data_dict, root_path)
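# Hedged sketch (not from the original project): workers matching the two chained
# pools above. image_tag_pool pushes (blob_pool, image_name, tag), so
# _process_image_tag receives the blob pool and can fan out one push per blob;
# blob_pool then receives (image_name, tag, blob) tuples. The get_manifest call
# and the manifest/blob fields are illustrative assumptions.
async def _process_image_tag(self, blob_pool, image_name, tag):
    manifest = await self._client.get_manifest(image_name, tag)  # hypothetical client call
    for blob in manifest.get("layers", []):
        await blob_pool.push(image_name, tag, blob)

async def _process_blob(self, image_name, tag, blob):
    digest, size = blob["digest"], blob["size"]
    if digest not in self._blob_to_image_tags:   # count each unique blob once
        self._total_blob_size += size
    self._blob_to_image_tags.setdefault(digest, []).append((image_name, tag))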