Example #1
    async def crawl(self):
        timeout = aiohttp.ClientTimeout(total=60)
        self.session = aiohttp.ClientSession(timeout=timeout, loop=loop)
        await self.q_url.put(root_url)
        page_first = await self.session.get(root_url)
        await self.q_text.put((BeautifulSoup(await page_first.text(), 'lxml'), root_url))

        async with asyncpool.AsyncPool(loop, num_workers=10, name="workers", logger=logging.getLogger("Workers"),
                                       worker_co=self.worker_crawl) as pool:
            await pool.push()
            async with asyncpool.AsyncPool(loop, num_workers=10, name="workers", logger=logging.getLogger("Workers"),
                                           worker_co=self.worker_elastic) as pool2:
                await pool2.push()
                work = asyncio.ensure_future(self.rps_control())
                await self.q_rps.join()
                await self.q_url.join()
                await self.q_text.join()
                for p in pool2._workers:
                    p.cancel()
                work.cancel()
                await pool2.join()
            for p in pool._workers:
                p.cancel()
            await pool.join()
        await self.session.close()
        await self.es.close()
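Note: Example #1 calls pool.push() with no arguments, so the worker coroutines are expected to pull their own work from the instance queues. A rough, hypothetical sketch of such a zero-argument worker (only self.q_url, self.q_text and self.session appear in the snippet above; the body here is an assumption):

    async def worker_crawl(self):
        # Hypothetical sketch: consume URLs until crawl() cancels the worker.
        while True:
            url = await self.q_url.get()
            try:
                resp = await self.session.get(url)
                await self.q_text.put((BeautifulSoup(await resp.text(), 'lxml'), url))
            finally:
                self.q_url.task_done()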
Example #2
    async def main(self):
        async with Elasticsearch([{'host': 'localhost', 'port': 9200}]) as es:
            await self.initialize_index(es)
            await self.links.put(self.start_url)

            async with aiohttp.ClientSession() as session:
                async with asyncpool.AsyncPool(
                        self.loop,
                        num_workers=10,
                        name="CrawlerPool",
                        logger=logging.getLogger("CrawlerPool"),
                        worker_co=self.worker) as pool:
                    link = await self.links.get()
                    await pool.push(link, es, session)

                    while True:
                        if not self.links.empty():
                            link = await self.links.get()
                        else:
                            await asyncio.sleep(0.2)
                            if self.links.empty():
                                break

                            link = await self.links.get()

                        await asyncio.sleep(self.sleep_time)
                        await pool.push(link=link, es=es, session=session)
Example #3
async def run(telegram_bot, bot_checker, bots, stop_event: threading.Event = None) -> Counter:
    result_queue = asyncio.Queue()
    loop = bot_checker.event_loop
    reader_future = asyncio.ensure_future(result_reader(result_queue), loop=loop)

    # TODO: check correct order of bots concerning pings etc.

    async with asyncpool.AsyncPool(
            loop,
            num_workers=settings.BOTCHECKER_CONCURRENT_COUNT,
            name="BotChecker",
            logger=log,
            worker_co=check_bot,
            max_task_time=300,
            log_every_n=settings.BOTCHECKER_CONCURRENT_COUNT,
            expected_total=len(bots),
    ) as pool:
        for to_check in bots:
            # TODO: implement properly
            if stop_event and stop_event.is_set():
                print('JOINING')
                await pool.join()
            else:
                await pool.push(telegram_bot, bot_checker, to_check, result_queue)

    await result_queue.put(None)
    return await reader_future
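Note: Example #3 only shows the producer side. Since run() is annotated to return a Counter and awaits reader_future, result_reader presumably aggregates whatever check_bot puts on the queue until it sees the None sentinel. A minimal sketch of that assumed reader (the Counter keys are illustrative; only the queue-plus-sentinel protocol comes from the snippet above):

from collections import Counter
import asyncio

async def result_reader(result_queue: asyncio.Queue) -> Counter:
    # Aggregate results until the None sentinel pushed after the pool has drained.
    counts = Counter()
    while True:
        result = await result_queue.get()
        if result is None:
            break
        counts[result] += 1
    return counts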
Example #4
    async def crawl(self):
        timeout = aiohttp.ClientTimeout(total=60)
        self.session = aiohttp.ClientSession(timeout=timeout, loop=loop)

        async with asyncpool.AsyncPool(loop, num_workers=10, name="workers", logger=logging.getLogger("Workers"),
                                       worker_co=self.worker_crawl) as pool:
            await pool.push()
            async with asyncpool.AsyncPool(loop, num_workers=10, name="workers", logger=logging.getLogger("Workers"),
                                           worker_co=self.worker_elastic) as pool2:
                await pool2.push()
                while True:
                    self.root_url, self.user_id = await self.q_root_url.get()
                    await self.q_url.put(self.root_url)
                    work = asyncio.ensure_future(self.rps_control())  # schedule rps_control so it can be cancelled below
                    await self.q_rps.join()
                    await self.q_url.join()
                    await self.q_text.join()
                    work.cancel()
                    stats = CrawlerStats.objects.get(domain=self.root_url)
                    stats.time = str(datetime.now())
                    stats.save()
Example #5
    async def main(self):
        if not await self.initialize_index(es):
            return

        await self.links.put(self.start_url)

        async with aiohttp.ClientSession() as session:
            async with asyncpool.AsyncPool(
                    self.loop,
                    num_workers=10,
                    name="CrawlerPool",
                    logger=logging.getLogger("CrawlerPool"),
                    worker_co=self.worker) as pool:
                t_begin = time()
                link = await self.links.get()
                await pool.push(link, es, session)
                self.time_statistic.append(time() - t_begin)

                while True:
                    if self.stop_signal:
                        break

                    time_for_link = time()
                    if not self.links.empty():
                        link = await self.links.get()
                    else:
                        wait_time = time()
                        while self.links.empty() and self.tmp_id < self.max_count:
                            await asyncio.sleep(0.1)

                            if time() - wait_time > 2:
                                logger.info(f'break at {self.start_url} after '
                                            f'2 seconds waiting')
                                break

                        if self.links.empty():
                            break

                        link = await self.links.get()

                    await asyncio.sleep(self.sleep_time)
                    await pool.push(link=link, es=es, session=session)
                    self.time_statistic.append(time() - time_for_link)
        return {
            'pages': self.tmp_id,
            'avg_time_per_page': sum(self.time_statistic) / self.tmp_id,
            'max_time_per_page': max(self.time_statistic),
            'min_time_per_page': min(self.time_statistic)
        }
Example #6
    async def _make_post_api_call(
        self,
        urls_params: list,
        loop: AbstractEventLoop,
        num_workers: int = 10,
    ):
        async with asyncpool.AsyncPool(
                loop,
                num_workers=num_workers,
                name="dfsre-reinstate-pool",
                logger=self.log,
                worker_co=self._async_post_api_call) as pool:
            for i, urls_param in enumerate(urls_params):
                await pool.push(i, **urls_param)
        return self.post_results
Example #7
async def fill_events(_loop, number_per_day, bulk_size):
    async with asyncpool.AsyncPool(_loop,
                                   num_workers=inserter_config.WORKERS,
                                   worker_co=write_to_event,
                                   max_task_time=300,
                                   log_every_n=10,
                                   name="CHPool",
                                   logger=logging.getLogger("CHPool")) as p:

        insert_time = datetime.datetime(2018, 1, 1)
        for i in range(365):
            for _ in range(int(number_per_day / bulk_size)):
                events = generate_random_events(insert_time, bulk_size)
                await p.push(events, None)

            insert_time = insert_time + datetime.timedelta(days=1)
Example #8
    async def run(self):
        asyncio.ensure_future(
            self.feed.start(self.settings.FEED_TOPIC, self.feed_queue))
        asyncio.ensure_future(
            self.stream.start(self.settings.STREAM_TOPIC, self.stream_queue))
        async with asyncpool.AsyncPool(
                loop,
                num_workers=self.settings.AMOUNT_OF_WORKERS,
                name="workers_pool",
                logger=logging.getLogger("AioETL"),
                worker_co=await object_from_settings(
                    settings.WORKER_TYPE, self.feed_queue,
                    self.stream_queue, self.settings).start(),
                max_task_time=300,
                # log_every_n=10
        ) as pool:
            for i in self.feed_queue:
                await pool.push(i, self.stream_queue)
Example #9
    async def get_pokemon_id_list(self, id_list):
        """
            in: list of ids, e.g. [1, 100, 120, 90, ...]
            return: pokemon instances with these ids
            warn: the returned list is not sorted
        """
        result_queue = asyncio.Queue()
        reader_future = asyncio.ensure_future(self.result_reader_id_list(result_queue), loop=asyncio.get_running_loop())

        async with asyncpool.AsyncPool(asyncio.get_running_loop(), num_workers=len(id_list)+1, name="GetPokemonListPool",
                                logger=logging.getLogger("PokemonListIdPool"),
                                worker_co=self._get_pokemon_id, max_task_time=config.pool_task_time,
                                log_every_n=10) as pool:
            for i in id_list:
                await pool.push(i, result_queue)

        await result_queue.put(None)
        return await reader_future
Example #10
    async def get_pokemon_list(self, start_id):
        """
            in: start_id (the id the offset starts from), e.g. 1
            return: sorted list of pokemon instances from start_id up to
            start_id + pokemons_per_page, so for start_id=1 and 6 per page
            the pokemons with ids [1, 2, 3, 4, 5, 6]
        """
        result_queue = asyncio.Queue()
        reader_future = asyncio.ensure_future(self.result_reader_list(result_queue, start_id), loop=asyncio.get_running_loop())

        async with asyncpool.AsyncPool(asyncio.get_running_loop(), num_workers=config.pokemons_per_page, name="GetPokemonListPool",
                                logger=logging.getLogger("PokemonListPool"),
                                worker_co=self._get_pokemon_id, max_task_time=config.pool_task_time,
                                log_every_n=10) as pool:
            for i in range(start_id, start_id + config.pokemons_per_page, 1):
                await pool.push(i, result_queue)

        await result_queue.put(None)
        return await reader_future
Example #11
async def run():

    result_queue = asyncio.Queue()

    reader_future = asyncio.ensure_future(result_reader(result_queue),
                                          loop=loop)

    # Start a worker pool of 10 coroutines that invoke `example_coro`, waiting for each task to complete or for 5 minutes (max_task_time) to pass.
    async with asyncpool.AsyncPool(loop,
                                   num_workers=10,
                                   name="ExamplePool",
                                   logger=logging.getLogger("ExamplePool"),
                                   worker_co=example_coro,
                                   max_task_time=300,
                                   log_every_n=10) as pool:
        for i in range(50):
            await pool.push(i, result_queue)

    await result_queue.put(None)
    await reader_future
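Note: Example #11 shows the bare pool usage but not the helpers it references. A minimal, hypothetical sketch of those missing pieces (example_coro and the module-level loop are assumptions based only on the names used above; result_reader would follow the same sentinel pattern sketched after Example #3):

import asyncio
import logging

logging.basicConfig(level=logging.INFO)
loop = asyncio.get_event_loop()

async def example_coro(i, result_queue):
    # Simulate one unit of work and hand the result to the reader.
    await asyncio.sleep(0.1)
    await result_queue.put(i * 2)

if __name__ == "__main__":
    loop.run_until_complete(run())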
Example #12
    async def get_stats(self, root_path: str, max_image_names: int = None):
        async with asyncpool.AsyncPool(None, 100, 'blob_pool', self._logger, self._process_blob,
                                       raise_on_join=True, log_every_n=1000) as blob_pool, \
                asyncpool.AsyncPool(None, 100, 'img_pool', self._logger, self._process_image_tag,
                                    raise_on_join=True, log_every_n=1000) as image_tag_pool:
            image_num = 0
            async for image_name in self._client.catalog_pager():
                num_tags = 0
                async for tag in self._client.image_tag_pager(image_name):
                    num_tags += 1
                    await image_tag_pool.push(blob_pool, image_name, tag)

                image_num += 1
                self._logger.info(f"{image_name} pushed num tags: {num_tags}")
                if max_image_names is not None and image_num == max_image_names:
                    break

        description = [
            ('Group Name', 'string'),
            ('Parent', 'string'),
            ('Size (size)', 'number'),
        ]

        data_dict = dict(root=[])

        # This needs to be done from parent to child
        g_instances = _BlobGroupInstanceHelper()

        for image_name, blob_groups in self._image_info.items():
            image_unique_size = 0

            img_data = data_dict[image_name] = []

            for blob_group_key, tags in blob_groups.items():
                blob_group_unique_size, parent_blob_group_key = self._get_blob_group_info(
                    blob_group_key)
                image_unique_size += blob_group_unique_size
                orig_blob_group_name = blob_group_name = g_instances.new_instance(
                    self._get_blob_group_name(blob_group_key))

                while parent_blob_group_key:
                    parent_blob_group_name = g_instances.new_instance(
                        self._get_blob_group_name(parent_blob_group_key))
                    blob_group_unique_size, parent_blob_group_key = self._get_blob_group_info(
                        parent_blob_group_key)

                    img_data.append((parent_blob_group_name, blob_group_name,
                                     blob_group_unique_size))
                    blob_group_name = parent_blob_group_name
                    # break  # TODO: find better way to show large multi-level trees

                img_data.append(
                    (orig_blob_group_name, "root", blob_group_unique_size
                     ))  # unfortunately you can't have two nodes point to this

            data_dict["root"].append((image_name, 'root', image_unique_size))
            img_data.append(("root", None, image_unique_size))

        data_dict["root"].append(("root", None, self._total_blob_size))

        self._logger.info(
            f"Total num blobs: {len(self._blob_to_image_tags)} size: {self._total_blob_size:,}"
        )

        get_treemap(description, data_dict, root_path)