def output_stats(self):
    logger.debug('Dumping Aiocrawler stats:')
    for key in self.get_collect_keys():
        print('{classname}: "{key}": {value}'.format(
            classname=self.__class__.__name__, key=key, value=getattr(self, key)))
async def _main(self):
    await self.__initialize()

    tasks = []
    for target, args, kwargs in self.__startup_tasks:
        tasks.append(asyncio.ensure_future(self.__run_task(target(*args, **kwargs))))

    for _ in range(self._settings.CONCURRENT_WORDS):
        tasks.append(asyncio.ensure_future(self.__handle_scheduler_word()))

    for _ in range(self._settings.CONCURRENT_REQUESTS):
        tasks.append(asyncio.ensure_future(self.__handle_scheduler_request()))

    await self.__job_scheduler.spawn(self.__run_task(self.__collector.collect_start(
        self._spider.__class__.__name__, self._settings.DATETIME_FORMAT
    )))
    await asyncio.wait(tasks)

    tasks = []
    for target, args, kwargs in self.__cleanup_tasks:
        tasks.append(asyncio.ensure_future(self.__run_task(target(*args, **kwargs))))
    if len(tasks):
        await asyncio.wait(tasks)

    # collect finished information
    await self.__run_task(self.__collector.collect_finish(self._settings.DATETIME_FORMAT))
    await self.__run_task(self.__collector.output_stats())
    logger.debug('The Crawler is closed. <Reason {reason}>', reason=self.__collector.finish_reason)
def __shutdown_signal(self, _, __):
    self.__signal_int_count += 1
    if self.__signal_int_count == 1:
        logger.debug('Received SIGINT, shutting down gracefully. Send again to force')
        self.close_crawler('Received SIGINT')
    else:
        self.close_crawler('Received SIGINT', force=True)
        logger.debug('Received SIGINT more than once, shutting down the Crawler by force...')
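
# A minimal sketch (an assumption, not part of the original source) of how the crawler
# could register __shutdown_signal and drive _main() from synchronous code. The `run`
# method name and the event-loop handling are illustrative only; it assumes `signal`
# and `asyncio` are imported at module level.
def run(self):
    signal.signal(signal.SIGINT, self.__shutdown_signal)  # route Ctrl-C into the graceful-shutdown counter
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(self._main())
    finally:
        loop.close()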
def on_epoch_end(self, _, logs=None):
    logs = logs or {}
    current_loss = logs.get('loss', np.inf)
    if current_loss < self._best:
        # remember the new best loss so later epochs only save on improvement
        self._best = current_loss
        if self._only_save_weight:
            for model, model_path in self._models:
                model.save_weights(str(model_path), overwrite=True)
        else:
            for model, model_path in self._models:
                model.save(str(model_path), overwrite=True)
        logger.debug('Model saved')
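
# Hedged usage sketch: how a best-loss checkpoint callback like the one above might be
# attached to training. The class name `SaveBestModel` and its constructor arguments
# (a list of (model, path) pairs plus an only_save_weight flag) are assumptions inferred
# from the attributes used in on_epoch_end, not a documented API.
checkpoint = SaveBestModel(models=[(model, 'best_model.h5')], only_save_weight=False)
model.fit(x_train, y_train, epochs=10, callbacks=[checkpoint])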
async def __initialize(self):
    """
    Initialize all necessary components.
    """
    logger.debug('Initializing...')

    if not self.__downloader:
        from aiocrawler.downloaders.aio_downloader import AioDownloader
        from aiohttp import ClientSession, CookieJar

        session = ClientSession(cookie_jar=CookieJar(unsafe=True))
        self.__downloader = AioDownloader(self._settings, session)
        self.on_cleanup(session.close)

    if not self._scheduler:
        if self._settings.REDIS_URL and not self._settings.DISABLE_REDIS:
            from aiocrawler.schedulers import RedisScheduler

            self._scheduler = RedisScheduler(settings=self._settings)
            await self._scheduler.create_redis_pool()

            if not self._filters:
                from aiocrawler.filters import RedisFilter
                self._filters = RedisFilter(self._settings, redis_pool=self._scheduler.redis_pool)

            self.on_cleanup(self._scheduler.close_redis_pool)
        else:
            from aiocrawler.schedulers import MemoryScheduler
            self._scheduler = MemoryScheduler(self._settings, self._spider)

    if not self._filters:
        if self._settings.REDIS_URL and not self._settings.DISABLE_REDIS:
            from aiocrawler.filters import RedisFilter

            self._filters = RedisFilter(self._settings, redis_pool=self._scheduler.redis_pool)
            await self._filters.create_redis_pool()
            self.on_cleanup(self._filters.close_redis_pool)
        else:
            from aiocrawler.filters import MemoryFilter
            self._filters = MemoryFilter(self._settings)

    from aiocrawler import middlewares

    for mw_name, key in self._settings.DEFAULT_MIDDLEWARES.items():
        if 0 <= key <= 1000 and mw_name in middlewares.__all__:
            self.__middlewares.append((getattr(middlewares, mw_name), key))

    for mw, key in self._settings.MIDDLEWARES:
        # the subclass check was reversed in the original; the middleware class must
        # derive from BaseMiddleware, not the other way around
        if 0 <= key <= 1000 and issubclass(mw, middlewares.BaseMiddleware):
            self.__middlewares.append((mw, key))

    self.__middlewares = sorted(self.__middlewares, key=lambda x: x[1])
    self.__middlewares = list(map(lambda x: x[0](self._settings, self), self.__middlewares))
    logger.debug('Initialized')
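
# Illustrative settings fragment (an assumption, not taken from the project) showing the
# shape the middleware-loading loop above expects: MIDDLEWARES is an iterable of
# (middleware_class, priority) pairs, only priorities in [0, 1000] are accepted, and
# lower values sort first. The `Settings` base class and both middleware classes below
# are hypothetical names used only to make the structure concrete.
class MySettings(Settings):
    MIDDLEWARES = [
        (MyProxyMiddleware, 300),   # runs earlier
        (MyRetryMiddleware, 750),   # runs later
    ]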
async def __handle_scheduler_word(self):
    """
    Handle the word from the scheduler.
    """
    while not self.__shutting_down:
        await asyncio.sleep(self._settings.PROCESS_DALEY)
        word = await self.__run_task(self._scheduler.get_word())
        if word:
            await self.__job_scheduler.spawn(self.__run_task(self.__collector.collect_word()))
            logger.debug('Making Request from word <word: {word}>'.format(word=word))
            request = self._spider.make_request(word)
            if request:
                await self.__run_task(self._scheduler.send_request(request))
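
# Hedged illustration of the word -> request hand-off consumed by __handle_scheduler_word():
# the spider below and the Request construction are assumptions about aiocrawler's API,
# shown only to make the path from get_word() to send_request() concrete.
class SearchSpider(Spider):
    def make_request(self, word):
        # turn a scheduled keyword into a request the downloader can fetch
        return Request('https://example.com/search?q=' + word)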
def on_epoch_end(self, epoch, logs=None):
    acc_mean = 0
    for ix in range(len(self._generator)):
        [x, y_data, _, _], _ = self._generator[ix]
        y_pred = self._base_model.predict(x)
        shape = y_pred.shape
        decode = backend.ctc_decode(y_pred, input_length=np.ones(shape[0]) * shape[1])[0][0]
        out = backend.get_value(decode)

        acc = 0
        for i, y in enumerate(y_data):
            y = np.array([idx for idx in y if idx != 0])
            pred = out[i][:len(y)]
            if all(pred == y):
                acc += 1 / len(y_data)

        acc_mean += acc / len(self._generator)

    # acc_mean is a fraction in [0, 1], so scale it before printing as a percentage
    logger.debug('acc: %0.4f%%' % (acc_mean * 100))
async def receive(self, websocket: web.WebSocketResponse, uuid: str):
    async for msg in websocket:
        # noinspection PyBroadException
        try:
            data = loads(msg.data)
            logger.debug('from {uuid}: {data}'.format(uuid=uuid, data=data))
            if data['classname'] == 'collector':
                # await self.__job_scheduler.spawn(self.__handle_client_collector(data))
                pass
            elif data['classname'] == 'client':
                await self.__job_scheduler.spawn(self.__handler_client(data, uuid))
                # await self.__job_scheduler.spawn(self.__handle_websocket_client(data))
            elif data['classname'] == 'Monitor':
                await self.__job_scheduler.spawn(self.__handle_monitor(data))
        except Exception:
            pass
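
# Hypothetical sketch of how receive() might be wired into an aiohttp websocket route.
# The handler below and the uuid extraction are assumptions, not the original routing code;
# it only shows the standard aiohttp prepare/handle cycle around receive().
async def websocket_handler(self, request: web.Request):
    websocket = web.WebSocketResponse()
    await websocket.prepare(request)
    uuid = request.match_info.get('uuid', 'anonymous')
    await self.receive(websocket, uuid)
    return websocket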