async def put_item(self, item: _Request):
    """
    put_item: push a request onto the queue unless the request filter rejects it

    A `_Request` is enqueued with `self._queue.dumps_nowait` when it is
    flagged `not_filter`, or when `self.filter_request(item)` returns a
    falsy value. Anything that is not a `_Request` is only logged.
    """
    if not isinstance(item, _Request):
        logger.info('put item: {}'.format(item))
        return

    # Short-circuit: filter_request() is never consulted for requests that
    # opt out of filtering via `not_filter`.
    if item.not_filter or not self.filter_request(item):
        self._queue.dumps_nowait(item)
async def from_request(self, req):
    """
    from_request: perform the request described by `req` and wrap the reply

    Increments `self.request_count`, then awaits `self.request(...)` with
    `req.url`, `req.method` and the keyword arguments stored in `req.info`.

    Returns:
        The value of `_Response.from_response(req, resp, self)` on success,
        or None when a timeout or connection error occurred (the error is
        logged, not raised).
    """
    self.request_count += 1
    try:
        resp = await self.request(url=req.url, method=req.method, **req.info)
        return _Response.from_response(req, resp, self)
    # asyncio.TimeoutError only became an alias of the builtin TimeoutError
    # in Python 3.11; list both so asyncio timeouts are logged here on older
    # interpreters too instead of escaping to the caller.
    except (asyncio.TimeoutError, TimeoutError, ConnectionError) as e:
        logger.info('Request: {} and Error: {}'.format(req, e))
        return None
async def dispather(self, task_id):
    """
    dispather: get request from queue and put into downloader

    Holds `self.semaphore` for its whole lifetime and, while
    `self.is_running` is true, repeatedly:

    1. waits up to `self.queue_timeout` seconds for `self.get_item()`;
    2. sends a `_Request` item through `self.downloader`;
    3. hands a resulting `_Response` to `self.parser`.

    Args:
        task_id: identifier of this worker, used only in log output.
    """
    # `with await semaphore` was removed in Python 3.9 (TypeError at
    # runtime); `async with` is the supported spelling and releases the
    # semaphore on every exit path.
    async with self.semaphore:
        while self.is_running:
            try:
                item = await asyncio.wait_for(
                    self.get_item(), self.queue_timeout)
                logger.info('task_id: {} and request: {}'.format(
                    task_id, item))
                if isinstance(item, _Request):
                    resp = await self.downloader(item)
                    if isinstance(resp, _Response):
                        # Awaited directly: wrapping in ensure_future() only
                        # to await it immediately adds nothing.
                        await self.parser(resp)
            except asyncio.TimeoutError:
                # Timed out without an item: go back and re-check
                # is_running.
                continue
async def downloader(self, item: _Request):
    """
    downloader: requests -> request_middleware_func -> downloader
    -> response_middleware_func -> return

    Registers the request with `self.add_request`, runs the 'request'
    middleware when any is registered, performs the request through
    `self.from_request` if the item is still a `_Request`, then runs the
    'response' middleware when any is registered.

    Returns:
        Whatever the last stage produced. A falsy value coming out of the
        request middleware skips the remaining stages and is returned
        unchanged.
    """
    logger.info('run downloader')
    logger.info('request: {}'.format(item))
    self.add_request(item)

    outcome = item
    if self._middleware_funcs.get('request'):
        outcome = await self._run_middleware(outcome, 'request')

    if outcome:
        # The request middleware may already have replaced the request with
        # something else; only a real request is sent out.
        if isinstance(outcome, _Request):
            outcome = await self.from_request(outcome)
        if self._middleware_funcs.get('response'):
            outcome = await self._run_middleware(outcome, 'response')

    logger.info('response: {}'.format(outcome))
    logger.info('end downloader')
    return outcome
async def parser(self, response: _Response):
    """
    parser: parse response and get request or process model data

    Every rule in `self._rules` is applied with `rule.search(response)`;
    truthy results are forwarded to `self.async_put_item`. Afterwards the
    `callback` attribute of `response.current_request` (None when absent)
    is resolved:

    * a class is instantiated, `load(response)` is called on the instance
      and the result of `process(response)` is forwarded;
    * a plain function is called with the response and its result is
      forwarded;
    * anything else is forwarded to `self.async_put_item` as it is.
    """
    logger.info('run parser')
    logger.info('response: {}'.format(response))

    for rule in self._rules:
        matched = rule.search(response)
        if matched:
            await self.async_put_item(matched)

    handler = getattr(response.current_request, 'callback', None)
    if isclass(handler):
        instance = handler()
        instance.load(response)
        produced = instance.process(response)
    elif isfunction(handler):
        produced = handler(response)
    else:
        produced = handler
    await self.async_put_item(produced)

    logger.info('end parser')
def run(self):
    """
    process: main program

    Runs `self.init()` on `self.loop`, starts `self.async_limit`
    dispatcher tasks and blocks until all of them have finished. On
    Ctrl-C every task still pending on the loop is cancelled and drained.
    In every case `self.close()` and `self.loop.close()` are called and
    the request count and elapsed time are logged.
    """
    logger.info('START SPIDER')
    start_time = datetime.now()
    try:
        logger.info('run init_spider')
        self.loop.run_until_complete(self.init())
        logger.info('end init_spider')

        # asyncio.wait() no longer accepts bare coroutines (deprecated in
        # 3.8, an error since 3.11), so schedule them as tasks on our own
        # loop first.
        workers = [
            self.loop.create_task(self.dispather(taskid))
            for taskid in range(self.async_limit)
        ]
        logger.info('run main_spider')
        if workers:  # asyncio.wait() raises ValueError on an empty set
            self.loop.run_until_complete(asyncio.wait(workers))
        logger.info('end main_spider')
    except KeyboardInterrupt:
        logger.info('keyboard cancel all tasks')
        # asyncio.Task.all_tasks() was removed in Python 3.9; fall back to
        # it only on interpreters that predate asyncio.all_tasks() (3.7).
        all_tasks = getattr(asyncio, 'all_tasks', None) or asyncio.Task.all_tasks
        pending = [task for task in all_tasks(self.loop) if not task.done()]
        for task in pending:
            task.cancel()
        if pending:
            # Let the cancellations propagate and then return. A bare
            # run_forever() here would never stop, because nothing is left
            # to call loop.stop().
            self.loop.run_until_complete(
                asyncio.gather(*pending, return_exceptions=True))
    finally:
        self.close()
        self.loop.close()
        logger.info('Request Count: {}'.format(self.request_count))
        logger.info('Time Usage: {}'.format(datetime.now() - start_time))
        logger.info('CLOSE SPIDER')