def _close(self):
    """Call ``self.close()`` and then always emit the closing message.

    An exception raised by ``self.close()`` is reported through
    ``pretty_error`` instead of being propagated; ``self._close_msg()`` runs
    whether or not closing succeeded.
    """
    try:
        self.close()
    except Exception as exc:
        pretty_error(exc, self.logger)
    finally:
        self._close_msg()
async def request_finger(self, req):
    """Deduplicate ``req`` by fingerprint.

    The fingerprint is ``get_md5`` over the canonicalised URL followed by the
    truthy values of ``data``, ``files``, ``auth``, ``cert``, ``json`` and
    ``cookies`` found in ``req.__dict__``.

    Returns ``req`` when its fingerprint was newly recorded in
    ``self.setting.request_filter``; returns ``None`` when the request is a
    duplicate, when fingerprinting failed, or when the filter is neither a
    ``set`` nor an object exposing ``sadd``.
    """
    url = req.url
    try:
        fields = req.__dict__
        parts = [canonicalize_url(url)]
        for key in ('data', 'files', 'auth', 'cert', 'json', 'cookies'):
            value = fields.get(key)
            if value:
                parts.append(value)
        finger = get_md5(*parts)
    except Exception as exc:
        pretty_error(exc, self.logger)
        return None

    seen = self.setting.request_filter

    # In-memory filter: a plain set of fingerprints.
    if isinstance(seen, set):
        if finger in seen:
            self.logger.warning("filter: {}".format(req))
            return None
        seen.add(finger)
        return req

    # Filter exposing ``sadd`` (presumably a Redis-like client — confirm):
    # a truthy result is treated as "fingerprint newly added".
    if hasattr(seen, 'sadd'):
        added = await seen.sadd(self.setting.request_filter_key, finger)
        if added:
            return req
        self.logger.warning("filter: {}".format(req))
        return None

    self.logger.warning('Invalid request filter type: {}'.format(seen))
    return None
async def _downloader(self):
    """Request scheduling loop.

    Calls ``self.prepare()`` and ``self._init_()``, then, while ``self.stop``
    is falsy, pops requests from ``self.setting.request_queue``, collects
    them into a batch and awaits the batch with ``asyncio.gather``. Every
    non-``None`` response is handed to ``self._process_response``.

    Exceptions raised in the loop are reported through ``pretty_error``. The
    ``finally`` clause closes the aiohttp session (if set), optionally writes
    the status message to Redis, optionally clears the request filter, and
    finally calls ``self._close()``.
    """
    try:
        self.prepare()
        await self._init_()
        req_list = []
        # NOTE(review): ``self.stop`` is presumably derived from
        # ``self._stop`` (decremented below) — confirm against the class.
        while not self.stop:
            req = await self.setting.request_queue.pop()
            if req:
                # async_request(req) is not awaited here; it is awaited as
                # part of the batch below.
                req_list.append(self.async_request(req))
            # Flush the batch once it is full or the queue reports empty.
            if len(req_list) >= self.setting.request_batch_size or await self.setting.request_queue.empty():
                # Run the batched requests concurrently.
                resp_list = await asyncio.gather(*req_list)
                # Process the responses.
                await asyncio.gather(*[
                    self._process_response(resp) for resp in resp_list
                    if resp is not None
                ])
                req_list.clear()
            if await self.setting.request_queue.empty():
                self._stop -= 1
    except Exception as e:
        pretty_error(e, self.logger)
    finally:
        if self.setting.aiohttp_clientsession:
            await self.setting.aiohttp_clientsession.close()
        if self.setting.redis_msg:
            await self._write_msg_to_redis()
        if self.setting.clear_filter:
            await self._clear_filter()
        self._close()
async def _write_msg_to_redis(self):
    """Store the spider's status message in Redis as JSON.

    ``json.dumps(self.msg)`` is written under the key ``Spider:<name>`` via
    ``self.setting.redis_client.set``. Failures are never propagated: an
    exception whose class is named ``ConnectionError`` is logged as a
    warning, and any other exception is reported through ``pretty_error``.
    """
    try:
        await self.setting.redis_client.set(
            'Spider:{}'.format(self.name), json.dumps(self.msg))
    except Exception as e:
        # Compared by class name, so any exception type called
        # ``ConnectionError`` (builtin or library-defined) takes this branch.
        if e.__class__.__name__ == 'ConnectionError':
            # Log through the instance logger, as every other method here
            # does, instead of relying on a module-level ``logger`` name.
            self.logger.warning('Redis 连接失败')
        else:
            pretty_error(e, self.logger)
async def _process_middleware(self, resq, middlewares):
    """Pass ``resq`` through each middleware in order.

    Each middleware is called with the current value; a coroutine result is
    awaited. When ``middlewares`` is empty or ``None`` the input is returned
    unchanged. Returns ``None`` as soon as a middleware yields a falsy value
    or raises (the exception is reported through ``pretty_error``);
    otherwise returns the value produced by the last middleware.
    """
    if not middlewares:
        return resq

    current = resq
    try:
        for middleware in middlewares:
            outcome = middleware(current)
            current = await outcome if isinstance(outcome, Coroutine) else outcome
            if not current:
                return None
    except Exception as exc:
        pretty_error(exc, self.logger)
        return None
    return current
def start(self):
    """Run ``self._downloader()`` to completion on ``self.loop``.

    On ``KeyboardInterrupt`` a warning is logged, ``self._close()`` is called
    and the process is terminated with status 0. Any other exception is
    reported through ``pretty_error`` and followed by ``self._close()``.
    """
    try:
        download_job = self._downloader()
        self.loop.run_until_complete(download_job)
    except KeyboardInterrupt:
        self.logger.warning('KeyboardInterrupt')
        self._close()
        # sys.exit raises SystemExit; it is caught right away so the process
        # is ended immediately through os._exit instead.
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
    except Exception as exc:
        pretty_error(exc, self.logger)
        self._close()
async def _process_response(self, resp):
    """Run the response middlewares and the request callback for ``resp``.

    ``resp`` is first passed through ``self.setting.response_middlewares``.
    If the middlewares return ``None`` the response is counted in
    ``self._msg['response_dropped']`` and nothing else happens. Otherwise
    each value produced by ``self._process_callback`` is forwarded to
    ``self._process_return`` together with the callback's ``__name__``.

    Exceptions raised while handling the results are reported through
    ``pretty_error``; ``self._stop`` is decremented once the callback stage
    has finished, whether it succeeded or not.
    """
    # Invoke the response middlewares.
    resp = await self._process_middleware(
        resp, self.setting.response_middlewares)
    if resp is None:
        self._msg['response_dropped'] += 1
        return

    try:
        for r in await self._process_callback(resp):
            # _process_callback dispatches a list response to the callback of
            # its first element, so the callback name must be read from the
            # same place; ``resp.request`` does not exist on a list.
            request = resp[0].request if isinstance(resp, list) else resp.request
            await self._process_return(request.callback.__name__, r)
    except Exception as e:
        pretty_error(e, self.logger)
    finally:
        self._stop -= 1
async def _init_(self):
    """Validate the pipeline/middleware settings and seed the first requests.

    * A single callable in ``self.setting.item_pipelines``,
      ``request_middlewares`` or ``response_middlewares`` is wrapped in a
      list; middleware lists are then handed to ``self._check_middlewares``.
    * ``self.start_requests()`` is called and every truthy value it yields is
      forwarded to ``self._process_return``. A single non-iterable result is
      treated as a one-element list; a falsy result ends the method early.
    * The start/end time of ``start_requests`` is stored in
      ``self._msg['callback_runtime_map']``.

    Raises:
        AssertionError: if the item pipelines are not an iterable of
            callables.
    """
    # Validate and normalise the item pipelines.
    if callable(self.setting.item_pipelines):
        self.setting.item_pipelines = [self.setting.item_pipelines]
    assert isinstance(self.setting.item_pipelines, Iterable), \
        'ITEM_PIPELINE type error: except function or function list, get {}.'.format(self.setting.item_pipelines)
    for pipe in self.setting.item_pipelines:
        assert callable(pipe), 'ITEM_PIPELINE({}) not callable'.format(
            pipe)

    # Request and response middlewares share the same normalisation:
    # a lone callable becomes a one-element list before being checked.
    for attr in ('request_middlewares', 'response_middlewares'):
        middlewares = getattr(self.setting, attr)
        if middlewares is None:
            continue
        if callable(middlewares):
            middlewares = [middlewares]
            setattr(self.setting, attr, middlewares)
        self._check_middlewares(middlewares)

    # Seed the request queue from start_requests, recording its start time
    # with a placeholder end time of 0.
    cb_name = self.start_requests.__name__
    self._msg['callback_runtime_map'][cb_name] = (time.time(), 0)
    try:
        request_list = self.start_requests()
    except Exception as e:
        # Report through the spider's logger, like every other call site.
        pretty_error(e, self.logger)
    else:
        if not request_list:
            return
        if not isinstance(request_list, Iterable):
            request_list = [request_list]
        for r in request_list:
            if not r:
                continue
            await self._process_return(cb_name, r)

    # Replace the placeholder with the real end time and readable duration.
    start_time = self._msg['callback_runtime_map'].get(cb_name)[0]
    end_time = time.time()
    self._msg['callback_runtime_map'][cb_name] = (
        start_time, end_time, human_time(end_time - start_time))
async def _process_callback(self, resp):
    """Invoke the request callback for ``resp`` and normalise its result.

    A list of responses is passed as a whole to the callback of its first
    element; a single response is passed to its own callback together with
    ``cb_args`` / ``cb_kwargs`` from the request. A coroutine result is
    awaited.

    Returns ``[]`` when the callback raises (the error is reported through
    ``pretty_error``) or produces a falsy value, the generator itself when
    the callback produced one, and otherwise a one-element list wrapping the
    result.
    """
    try:
        if isinstance(resp, list):
            outcome = resp[0].request.callback(resp)
        else:
            outcome = resp.request.callback(
                resp, *resp.request.cb_args, **resp.request.cb_kwargs)
    except Exception as exc:
        pretty_error(exc, self.logger)
        return []

    # Awaiting happens outside the try block, so errors raised while the
    # coroutine runs propagate to the caller.
    if isinstance(outcome, Coroutine):
        outcome = await outcome
    if not outcome:
        return []
    return outcome if isgenerator(outcome) else [outcome]
def start(self, name=None):
    """Look up spiders by ``name`` and run them on ``self.loop``.

    When ``self._get_spider`` returns nothing, an error is logged (a
    different message depending on whether ``name`` was given) and the
    method returns without running anything. ``KeyboardInterrupt`` during
    the run terminates the process with status 0; any other exception is
    reported through ``pretty_error``.
    """
    spiders = self._get_spider(name=name)
    if not spiders:
        if name is None:
            self.logger.error('Spiders map is null')
        else:
            self.logger.error('Cannot find spider: {}'.format(name))
        return

    try:
        run_job = self._run_spiders(spiders)
        self.loop.run_until_complete(run_job)
    except KeyboardInterrupt:
        self.logger.warning('KeyboardInterrupt')
        # sys.exit raises SystemExit; it is caught right away so the process
        # is ended immediately through os._exit instead.
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
    except Exception as exc:
        pretty_error(exc, self.logger)
async def _process_item(self, cb_name, item):
    """Run ``item`` through the item pipelines and update the item counters.

    Each pipeline in ``self.setting.item_pipelines`` is called with the
    output of the previous one; coroutine results are awaited. When a
    pipeline returns ``None`` the drop is counted once in
    ``self._msg['item_dropped']`` and the remaining pipelines are skipped.

    If no pipeline raised, ``items``, ``item_speed`` and the per-callback
    count in ``yield_item_map[cb_name]`` are updated (this includes items
    that were dropped), and the status message is written to Redis when
    ``self.setting.redis_msg`` is truthy. A pipeline exception is reported
    through ``pretty_error`` and leaves those counters untouched.
    """
    try:
        for pipe in self.setting.item_pipelines:
            item = pipe(item)
            if isinstance(item, Coroutine):
                item = await item
            if item is None:
                self._msg['item_dropped'] += 1
                # Stop here: later pipelines must not be fed ``None`` and the
                # drop must not be counted more than once.
                break
    except Exception as e:
        pretty_error(e, self.logger)
    else:
        # Update the item statistics.
        self._msg['items'] += 1
        # ``runtime`` may be 0; fall back to 1 to avoid dividing by zero.
        self._msg['item_speed'] = self._msg['items'] / (
            self._msg['runtime'] or 1)
        per_callback = self._msg['yield_item_map']
        if cb_name not in per_callback:
            per_callback[cb_name] = 0
        per_callback[cb_name] += 1
        if self.setting.redis_msg:
            await self._write_msg_to_redis()
async def run_spider(self, background_task: BackgroundTasks, name):
    """Schedule the named spiders as a background task and return their info.

    When ``self._get_spider`` returns nothing, an error is logged (a
    different message depending on whether ``name`` is ``None``) and ``None``
    is returned. Otherwise ``self._run_spiders`` is registered on
    ``background_task`` and the result of ``self.spider_info(name)`` is
    returned. Exceptions raised while registering the task are reported
    through ``pretty_error``; ``KeyboardInterrupt`` terminates the process
    with status 0.
    """
    spiders = self._get_spider(name=name)
    if not spiders:
        if name is None:
            self.logger.error('Spiders map is null')
        else:
            self.logger.error('Cannot find spider: {}'.format(name))
        return

    try:
        background_task.add_task(self._run_spiders, spiders)
    except KeyboardInterrupt:
        self.logger.warning('KeyboardInterrupt')
        # sys.exit raises SystemExit; it is caught right away so the process
        # is ended immediately through os._exit instead.
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
    except Exception as exc:
        pretty_error(exc, self.logger)

    info = await self.spider_info(name)
    return info
def error_callback(self, req, error):
    """Report a failed request; always returns ``None``.

    The error is reported through ``pretty_error``. When the error's class is
    named ``TimeoutError`` an additional ``RequestTimeout`` warning naming
    the request is logged.
    """
    pretty_error(error, self.logger)
    error_kind = error.__class__.__name__
    if error_kind == 'TimeoutError':
        self.logger.warning('RequestTimeout: {}'.format(req))
    return None