Example #1
    def output_stats(self):
        # log a header, then print each collected stat attribute by name
        logger.debug('Dumping Aiocrawler stats:')
        for key in self.get_collect_keys():
            print('{classname}: "{key}": {value}'.format(
                classname=self.__class__.__name__,
                key=key,
                value=getattr(self, key)))
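For context, a minimal self-contained sketch of the pattern this method relies on: a collector whose get_collect_keys() names the attributes to dump. The DemoCollector class and its two counters are hypothetical stand-ins, not the aiocrawler API.

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

class DemoCollector:
    # hypothetical stand-in: two counters plus the key list that output_stats iterates
    def __init__(self):
        self.request_count = 10
        self.word_count = 3

    def get_collect_keys(self):
        return ['request_count', 'word_count']

    def output_stats(self):
        logger.debug('Dumping Aiocrawler stats:')
        for key in self.get_collect_keys():
            print('{classname}: "{key}": {value}'.format(
                classname=self.__class__.__name__,
                key=key,
                value=getattr(self, key)))

DemoCollector().output_stats()
# DemoCollector: "request_count": 10
# DemoCollector: "word_count": 3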
Example #2
    async def _main(self):
        await self.__initialize()

        tasks = []
        # user-registered startup coroutines run first, wrapped for error handling
        for target, args, kwargs in self.__startup_tasks:
            tasks.append(asyncio.ensure_future(self.__run_task(target(*args, **kwargs))))

        # fixed pools of workers: one set consumes words, the other consumes requests
        for _ in range(self._settings.CONCURRENT_WORDS):
            tasks.append(asyncio.ensure_future(self.__handle_scheduler_word()))

        for _ in range(self._settings.CONCURRENT_REQUESTS):
            tasks.append(asyncio.ensure_future(self.__handle_scheduler_request()))

        await self.__job_scheduler.spawn(self.__run_task(self.__collector.collect_start(
            self._spider.__class__.__name__, self._settings.DATETIME_FORMAT
        )))

        await asyncio.wait(tasks)

        tasks = []
        for target, args, kwargs in self.__cleanup_tasks:
            tasks.append(asyncio.ensure_future(self.__run_task(target(*args, **kwargs))))

        if tasks:
            await asyncio.wait(tasks)

        # collect finished information
        await self.__run_task(self.__collector.collect_finish(self._settings.DATETIME_FORMAT))
        await self.__run_task(self.__collector.output_stats())

        logger.debug('The Crawler is closed. <Reason {reason}>', reason=self.__collector.finish_reason)
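The skeleton of _main — wrap a fixed pool of worker coroutines with asyncio.ensure_future, then block on asyncio.wait until they all finish — works the same in isolation. A minimal sketch, with worker() standing in for the crawler's handlers:

import asyncio

async def worker(n):
    # stand-in for __handle_scheduler_request(); real workers loop until shutdown
    await asyncio.sleep(0.1)
    print('worker %d finished' % n)

async def main():
    tasks = [asyncio.ensure_future(worker(n)) for n in range(3)]
    await asyncio.wait(tasks)  # returns only once every task is done

asyncio.get_event_loop().run_until_complete(main())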
Example #3
    def __shutdown_signal(self, _, __):
        self.__signal_int_count += 1

        if self.__signal_int_count == 1:
            logger.debug('Received SIGNAL INT, shutting down gracefully. Send again to force')
            self.close_crawler('Received SIGNAL INT')
        else:
            self.close_crawler('Received SIGNAL INT', force=True)
            logger.debug('Received SIGNAL INT again, shutting down the Crawler by force...')
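A handler with this signature only fires once it is registered for SIGINT via the standard signal module. A runnable sketch of the same two-stage pattern outside the class (the graceful/forced split is modeled on the method above):

import signal

signal_int_count = 0

def shutdown_signal(signum, frame):
    # signum and frame are the two arguments the method above ignores as _ and __
    global signal_int_count
    signal_int_count += 1
    if signal_int_count == 1:
        print('shutting down gracefully, send again to force')
    else:
        raise SystemExit('forced shutdown')

signal.signal(signal.SIGINT, shutdown_signal)  # route Ctrl-C to the handler
# to try it: loop on time.sleep() here and press Ctrl-C twice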
Example #4
    def on_epoch_end(self, _, logs=None):
        logs = logs or {}  # Keras may pass None; guard before .get()
        current_loss = logs.get('loss', np.inf)
        if current_loss < self._best:
            self._best = current_loss  # record the new best so we only save on improvement
            if self._only_save_weight:
                for model, model_path in self._models:
                    model.save_weights(str(model_path), overwrite=True)
            else:
                for model, model_path in self._models:
                    model.save(str(model_path), overwrite=True)
            logger.debug('Model saved')
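For the common single-model case, Keras ships an equivalent built-in; the excerpt above hand-rolls the same idea so several models can be checkpointed at once. A minimal sketch using the stock callback:

import numpy as np
from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer='adam', loss='mse')

# save_best_only mirrors the current_loss < self._best test;
# save_weights_only mirrors the _only_save_weight branch
checkpoint = keras.callbacks.ModelCheckpoint(
    'best.h5', monitor='loss', save_best_only=True, save_weights_only=True)
model.fit(np.random.rand(32, 4), np.random.rand(32, 1),
          epochs=2, callbacks=[checkpoint], verbose=0)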
Example #5
    async def __initialize(self):
        """
        Initialize all necessary components.
        """
        logger.debug('Initializing...')

        if not self.__downloader:
            from aiocrawler.downloaders.aio_downloader import AioDownloader
            from aiohttp import ClientSession, CookieJar
            session = ClientSession(cookie_jar=CookieJar(unsafe=True))
            self.__downloader = AioDownloader(self._settings, session)
            self.on_cleanup(session.close)

        if not self._scheduler:
            if self._settings.REDIS_URL and not self._settings.DISABLE_REDIS:
                from aiocrawler.schedulers import RedisScheduler
                self._scheduler = RedisScheduler(settings=self._settings)
                await self._scheduler.create_redis_pool()

                if not self._filters:
                    from aiocrawler.filters import RedisFilter
                    self._filters = RedisFilter(self._settings, redis_pool=self._scheduler.redis_pool)

                self.on_cleanup(self._scheduler.close_redis_pool)

            else:
                from aiocrawler.schedulers import MemoryScheduler
                self._scheduler = MemoryScheduler(self._settings, self._spider)

        if not self._filters:
            if self._settings.REDIS_URL and not self._settings.DISABLE_REDIS:
                from aiocrawler.filters import RedisFilter
                self._filters = RedisFilter(self._settings, redis_pool=self._scheduler.redis_pool)
                await self._filters.create_redis_pool()
                self.on_cleanup(self._filters.close_redis_pool)

            else:
                from aiocrawler.filters import MemoryFilter
                self._filters = MemoryFilter(self._settings)

        from aiocrawler import middlewares

        # built-in middlewares are referenced by name, with a priority in [0, 1000]
        for mw_name, key in self._settings.DEFAULT_MIDDLEWARES.items():
            if 0 <= key <= 1000 and mw_name in middlewares.__all__:
                self.__middlewares.append((getattr(middlewares, mw_name), key))

        # user middlewares are (class, priority) pairs and must subclass BaseMiddleware
        for mw, key in self._settings.MIDDLEWARES:
            if 0 <= key <= 1000 and issubclass(mw, middlewares.BaseMiddleware):
                self.__middlewares.append((mw, key))
        self.__middlewares.sort(key=lambda x: x[1])  # sort ascending by priority
        self.__middlewares = [mw(self._settings, self) for mw, _ in self.__middlewares]

        logger.debug('Initialized')
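The final block implies the configuration contract: settings.MIDDLEWARES is a list of (class, priority) pairs, kept only when the priority is within 0-1000 and the class subclasses BaseMiddleware, then instantiated as mw(settings, crawler) in priority order. A hedged sketch of registering a custom middleware under that contract (how the settings are declared is an assumption based on this excerpt):

from aiocrawler import middlewares

class MyMiddleware(middlewares.BaseMiddleware):
    # hypothetical middleware; the excerpt only shows that instances are
    # constructed as MyMiddleware(settings, crawler)
    pass

# in the project settings: priority must fall in [0, 1000]
MIDDLEWARES = [
    (MyMiddleware, 500),
]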
Example #6
    async def __handle_scheduler_word(self):
        """
        Handle the word from the scheduler.
        """
        while not self.__shutting_down:
            # throttle the poll, then ask the scheduler for the next word
            await asyncio.sleep(self._settings.PROCESS_DALEY)
            word = await self.__run_task(self._scheduler.get_word())
            if word:
                await self.__job_scheduler.spawn(self.__run_task(self.__collector.collect_word()))

                logger.debug(
                    'Making Request from word <word: {word}>'.format(word=word))
                request = self._spider.make_request(word)
                if request:
                    await self.__run_task(self._scheduler.send_request(request))
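Stripped of the crawler plumbing, the worker is a throttled polling loop that runs until a shutdown flag flips. The same skeleton in isolation, with an asyncio.Queue and Event standing in for the scheduler and __shutting_down:

import asyncio

async def handle_words(queue, shutting_down):
    while not shutting_down.is_set():
        await asyncio.sleep(0.1)           # plays the role of PROCESS_DALEY
        try:
            word = queue.get_nowait()      # plays the role of scheduler.get_word()
        except asyncio.QueueEmpty:
            continue
        print('making request from word:', word)

async def main():
    queue, flag = asyncio.Queue(), asyncio.Event()
    for w in ('python', 'asyncio'):
        queue.put_nowait(w)
    task = asyncio.ensure_future(handle_words(queue, flag))
    await asyncio.sleep(0.5)
    flag.set()                             # the equivalent of __shutting_down turning True
    await task

asyncio.get_event_loop().run_until_complete(main())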
Example #7
    def on_epoch_end(self, epoch, logs=None):
        acc_mean = 0
        for ix in range(len(self._generator)):
            [x, y_data, _, _], _ = self._generator[ix]
            y_pred = self._base_model.predict(x)
            shape = y_pred.shape
            # greedy CTC decoding over the full output length of every sample
            decode = backend.ctc_decode(y_pred,
                                        input_length=np.ones(shape[0]) *
                                        shape[1])[0][0]
            out = backend.get_value(decode)
            acc = 0
            for i, y in enumerate(y_data):
                # strip the zero padding from the label before comparing
                y = np.array([idx for idx in y if idx != 0])
                pred = out[i][:len(y)]
                if all(pred == y):
                    acc += 1 / len(y_data)
            acc_mean += acc / len(self._generator)

        # acc_mean is a fraction in [0, 1]; scale to percent for the log message
        logger.debug('acc: %0.4f%%' % (acc_mean * 100))
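The subtle step is the label comparison: ground-truth labels are zero-padded, so the zeros are stripped and the decoder output truncated to the true label length before an exact-match test. The same logic on made-up arrays:

import numpy as np

y = np.array([5, 2, 9, 0, 0])              # zero-padded label, true length 3
pred = np.array([5, 2, 9, 9, 9])           # decoded output, longer than the label

y_true = np.array([idx for idx in y if idx != 0])   # -> [5 2 9]
print(all(pred[:len(y_true)] == y_true))            # True: counts as a correct sample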
Example #8
    async def receive(self, websocket: web.WebSocketResponse, uuid: str):
        async for msg in websocket:
            # noinspection PyBroadException
            try:
                data = loads(msg.data)
                logger.debug('from {uuid}: {data}'.format(uuid=uuid,
                                                          data=data))

                # dispatch on the sender type declared in the message
                if data['classname'] == 'collector':
                    # await self.__job_scheduler.spawn(self.__handle_client_collector(data))
                    pass
                elif data['classname'] == 'client':
                    await self.__job_scheduler.spawn(
                        self.__handler_client(data, uuid))
                    # await self.__job_scheduler.spawn(self.__handle_websocket_client(data))
                elif data['classname'] == 'Monitor':
                    await self.__job_scheduler.spawn(
                        self.__handle_monitor(data))
            except Exception:
                # swallow malformed or unexpected messages rather than closing the socket
                pass
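The loop expects JSON messages carrying a classname field that selects a handler. A minimal aiohttp client sketch that would exercise the 'client' branch; the URL and the extra payload field are assumptions:

import asyncio
from json import dumps
import aiohttp

async def send():
    async with aiohttp.ClientSession() as session:
        # hypothetical endpoint where the server above serves its websocket
        async with session.ws_connect('http://localhost:8080/ws') as ws:
            await ws.send_str(dumps({'classname': 'client', 'data': 'hello'}))

asyncio.get_event_loop().run_until_complete(send())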