Example #1
0
 def _close(self):
     try:
         self.close()
     except Exception as e:
         pretty_error(e, self.logger)
     finally:
         self._close_msg()
Example #2
0
 async def request_finger(self, req):
     """
     Fingerprint ``req`` and return it only if it has not been seen before.

     The fingerprint is ``get_md5`` applied to the canonicalized URL followed
     by every truthy ``data``/``files``/``auth``/``cert``/``json``/``cookies``
     entry found in ``req.__dict__``. It is checked against
     ``self.setting.request_filter``, which may be a ``set`` or any object
     exposing an awaitable ``sadd(key, value)``.

     Returns ``req`` when the fingerprint is new. Returns ``None`` when the
     request is a duplicate, when fingerprinting raised (the error is passed
     to ``pretty_error``), or when the filter type is not supported.
     """
     url = req.url
     try:
         parts = [canonicalize_url(url)]
         fields = req.__dict__
         parts += [
             fields[key]
             for key in ('data', 'files', 'auth', 'cert', 'json', 'cookies')
             if fields.get(key)
         ]
         finger = get_md5(*parts)
     except Exception as exc:
         pretty_error(exc, self.logger)
         return None

     seen = self.setting.request_filter
     if isinstance(seen, set):
         # In-memory filter: membership test, then remember the fingerprint.
         is_new = finger not in seen
         if is_new:
             seen.add(finger)
     elif hasattr(seen, 'sadd'):
         # Set-like client: a truthy ``sadd`` result is treated as "new".
         is_new = await seen.sadd(self.setting.request_filter_key, finger)
     else:
         self.logger.warning('Invalid request filter type: {}'.format(seen))
         return None

     if is_new:
         return req
     self.logger.warning("filter: {}".format(req))
     return None
Example #3
0
    async def _downloader(self):
        """
        Request scheduling loop.

        Calls ``prepare()`` and ``_init_()``, then, while ``self.stop`` is
        falsy, pops requests from ``self.setting.request_queue`` and collects
        them into a batch. The batch is awaited with ``asyncio.gather`` once
        it reaches ``request_batch_size`` or the queue reports empty, and
        every non-``None`` response is handed to ``_process_response``.

        Exceptions are reported through ``pretty_error``. On the way out the
        aiohttp session (if any) is closed, the status message is written to
        Redis and the filter is cleared when the matching settings are
        truthy, and ``_close()`` is called.
        """
        try:
            self.prepare()
            await self._init_()
            pending = []
            while not self.stop:
                request = await self.setting.request_queue.pop()
                if request:
                    pending.append(self.async_request(request))

                batch_full = len(pending) >= self.setting.request_batch_size
                if batch_full or await self.setting.request_queue.empty():
                    # Issue the batched requests concurrently.
                    responses = await asyncio.gather(*pending)
                    # Process every response that was actually produced.
                    handlers = [
                        self._process_response(response)
                        for response in responses
                        if response is not None
                    ]
                    await asyncio.gather(*handlers)
                    pending.clear()

                # An empty queue counts down the ``_stop`` counter.
                if await self.setting.request_queue.empty():
                    self._stop -= 1
        except Exception as exc:
            pretty_error(exc, self.logger)
        finally:
            if self.setting.aiohttp_clientsession:
                await self.setting.aiohttp_clientsession.close()
            if self.setting.redis_msg:
                await self._write_msg_to_redis()
            if self.setting.clear_filter:
                await self._clear_filter()
            self._close()
Example #4
0
 async def _write_msg_to_redis(self):
     try:
         await self.setting.redis_client.set('Spider:{}'.format(self.name),
                                             json.dumps(self.msg))
     except Exception as e:
         if e.__class__.__name__ == 'ConnectionError':
             logger.warning('Redis 连接失败')
         else:
             pretty_error(e, self.logger)
Example #5
0
 async def _process_middleware(self, resq, middlewares):
     if not middlewares: return resq
     try:
         for mid in middlewares:
             resq = mid(resq)
             if isinstance(resq, Coroutine): resq = await resq
             if not resq: return
     except Exception as e:
         pretty_error(e, self.logger)
     else:
         return resq
Example #6
0
 def start(self):
     """
     Run ``_downloader()`` to completion on ``self.loop``.

     On ``KeyboardInterrupt`` a warning is logged, ``_close()`` is called and
     the process is terminated with exit status 0. Any other exception is
     reported through ``pretty_error`` and followed by ``_close()``.
     """
     try:
         self.loop.run_until_complete(self._downloader())
     except KeyboardInterrupt:
         self.logger.warning('KeyboardInterrupt')
         self._close()
         # Try a regular exit first; the resulting SystemExit is turned
         # into an immediate process exit.
         try:
             sys.exit(0)
         except SystemExit:
             os._exit(0)
     except Exception as exc:
         pretty_error(exc, self.logger)
         self._close()
Example #7
0
 async def _process_response(self, resp):
     # 调用响应中间件
     resp = await self._process_middleware(
         resp, self.setting.response_middlewares)
     if resp is None:
         self._msg['response_dropped'] += 1
         return
     else:
         try:
             for r in await self._process_callback(resp):
                 await self._process_return(resp.request.callback.__name__,
                                            r)
         except Exception as e:
             pretty_error(e, self.logger)
         finally:
             self._stop -= 1
Example #8
0
    async def _init_(self):

        # assert params
        if callable(self.setting.item_pipelines):
            self.setting.item_pipelines = [self.setting.item_pipelines]

        assert isinstance(self.setting.item_pipelines, Iterable), \
            'ITEM_PIPELINE type error: except function or function list, get {}.'.format(self.setting.item_pipelines)

        for pipe in self.setting.item_pipelines:
            assert callable(pipe), 'ITEM_PIPELINE({}) not callable'.format(
                pipe)

        if self.setting.request_middlewares is not None:
            if callable(self.setting.request_middlewares):
                self.setting.request_middlewares = [
                    self.setting.request_middlewares
                ]
            self._check_middlewares(self.setting.request_middlewares)

        if self.setting.response_middlewares is not None:
            if callable(self.setting.response_middlewares):
                self.setting.response_middlewares = [
                    self.setting.response_middlewares
                ]
            self._check_middlewares(self.setting.response_middlewares)

        # init request queue
        self._msg['callback_runtime_map'][self.start_requests.__name__] = (
            time.time(), 0)
        try:
            request_list = self.start_requests()
        except Exception as e:
            pretty_error(e, self)
        else:
            if not request_list: return
            if not isinstance(request_list, Iterable):
                request_list = [request_list]
            for r in request_list:
                if not r: continue
                await self._process_return(self.start_requests.__name__, r)
            start_time = self._msg['callback_runtime_map'].get(
                self.start_requests.__name__)[0]
            end_time = time.time()
            self._msg['callback_runtime_map'][self.start_requests.__name__] = (
                start_time, end_time, human_time(end_time - start_time))
Example #9
0
 async def _process_callback(self, resp):
     """
     处理回调函数
     """
     try:
         if isinstance(resp, list):
             result = resp[0].request.callback(resp)
         else:
             result = resp.request.callback(resp, *resp.request.cb_args,
                                            **resp.request.cb_kwargs)
     except Exception as e:
         pretty_error(e, self.logger)
         return []
     else:
         if isinstance(result, Coroutine): result = await result
         if not result: return []
         if not isgenerator(result): result = [result]
         return result
Example #10
0
    def start(self, name=None):
        """
        Look up spiders via ``_get_spider(name=name)`` and run them on ``self.loop``.

        Logs an error and returns when no spider is found: 'Spiders map is
        null' when ``name`` is ``None``, 'Cannot find spider: <name>'
        otherwise. A ``KeyboardInterrupt`` during the run is logged and the
        process is terminated with exit status 0; any other exception is
        reported through ``pretty_error``.
        """
        spiders = self._get_spider(name=name)

        if not spiders:
            if name is None:
                self.logger.error('Spiders map is null')
            else:
                self.logger.error('Cannot find spider: {}'.format(name))
            return

        try:
            self.loop.run_until_complete(self._run_spiders(spiders))
        except KeyboardInterrupt:
            self.logger.warning('KeyboardInterrupt')
            # Try a regular exit first; the resulting SystemExit is turned
            # into an immediate process exit.
            try:
                sys.exit(0)
            except SystemExit:
                os._exit(0)
        except Exception as exc:
            pretty_error(exc, self.logger)
Example #11
0
 async def _process_item(self, cb_name, item):
     """
     处理数据管道
     """
     try:
         for pipe in self.setting.item_pipelines:
             item = pipe(item)
             if isinstance(item, Coroutine): item = await item
             if item is None:
                 self._msg['item_dropped'] += 1
     except Exception as e:
         pretty_error(e, self.logger)
     else:
         # 更新 Item 信息
         self._msg['items'] += 1
         self._msg['item_speed'] = self._msg['items'] / (
             self._msg['runtime'] or 1)
         if cb_name not in self._msg['yield_item_map'].keys():
             self._msg['yield_item_map'][cb_name] = 0
         self._msg['yield_item_map'][cb_name] += 1
         if self.setting.redis_msg: await self._write_msg_to_redis()
Example #12
0
    async def run_spider(self, background_task: BackgroundTasks, name):
        """
        Schedule the spiders matching ``name`` as a background task.

        Spiders are looked up with ``_get_spider(name=name)``. When none are
        found an error is logged ('Spiders map is null' if ``name`` is
        ``None``, 'Cannot find spider: <name>' otherwise) and ``None`` is
        returned. Otherwise ``_run_spiders`` is registered on
        ``background_task`` and the result of ``spider_info(name)`` is
        returned.
        """
        spiders = self._get_spider(name=name)

        if not spiders:
            if name is None:
                self.logger.error('Spiders map is null')
            else:
                self.logger.error('Cannot find spider: {}'.format(name))
            return

        try:
            background_task.add_task(self._run_spiders, spiders)
        except KeyboardInterrupt:
            self.logger.warning('KeyboardInterrupt')
            # Try a regular exit first; the resulting SystemExit is turned
            # into an immediate process exit.
            try:
                sys.exit(0)
            except SystemExit:
                os._exit(0)
        except Exception as exc:
            pretty_error(exc, self.logger)

        info = await self.spider_info(name)
        return info
Example #13
0
 def error_callback(self, req, error):
     """
     Report a failed request.

     ``error`` is always passed to ``pretty_error``. When its class is named
     ``TimeoutError`` an additional 'RequestTimeout' warning naming ``req``
     is logged. Always returns ``None``.
     """
     pretty_error(error, self.logger)
     error_kind = error.__class__.__name__
     if error_kind == 'TimeoutError':
         self.logger.warning('RequestTimeout: {}'.format(req))
     return None