Example #1
0
 async def put_item(self, item: _Request):
     """Push a request onto the internal queue, subject to filtering.

     A ``_Request`` whose ``not_filter`` flag is truthy is queued without
     consulting ``self.filter_request``.  Any other ``_Request`` is queued
     only when ``self.filter_request(item)`` returns a falsy value.
     Objects that are not ``_Request`` instances are never queued; they
     are only written to the log.
     """
     if not isinstance(item, _Request):
         logger.info('put item: {}'.format(item))
         return

     # ``or`` short-circuits, so filter_request is skipped for items that
     # opted out of filtering.
     if item.not_filter or not self.filter_request(item):
         self._queue.dumps_nowait(item)
Example #2
0
 async def from_request(self, req):
     """Execute ``req`` through ``self.request`` and wrap the outcome.

     ``self.request_count`` is incremented before the call is awaited.
     The call receives ``req.url``, ``req.method`` and every entry of
     ``req.info`` as keyword arguments.

     Returns:
         The value of ``_Response.from_response(req, raw, self)`` on
         success, or ``None`` when a ``TimeoutError`` or
         ``ConnectionError`` is raised (the failure is logged).
     """
     try:
         self.request_count += 1
         raw = await self.request(
             url=req.url,
             method=req.method,
             **req.info
         )
         # Built inside the try so failures here hit the same handler.
         wrapped = _Response.from_response(req, raw, self)
         return wrapped
     except (TimeoutError, ConnectionError) as exc:
         logger.info('Request: {} and Error: {}'.format(req, exc))
         return None
Example #3
0
 async def dispather(self, task_id):
     """
     dispatcher: get requests from the queue and hand them to the downloader

     Runs as one worker.  ``self.semaphore`` is held for the whole lifetime
     of the worker, and the loop keeps going while ``self.is_running`` is
     truthy.  Each iteration waits at most ``self.queue_timeout`` seconds
     for ``self.get_item()``; a ``_Request`` is passed to
     ``self.downloader`` and, if that yields a ``_Response``, the response
     is passed to ``self.parser``.

     Args:
         task_id: identifier of this worker, used only in log messages.
     """
     # ``with await lock`` was deprecated in Python 3.7 and removed in 3.9;
     # ``async with`` is the supported way to hold an asyncio semaphore.
     async with self.semaphore:
         while self.is_running:
             try:
                 item = await asyncio.wait_for(self.get_item(),
                                               self.queue_timeout)
                 logger.info('task_id: {} and request: {}'.format(
                     task_id, item))
                 if isinstance(item, _Request):
                     resp = await self.downloader(item)
                     if isinstance(resp, _Response):
                         await asyncio.ensure_future(self.parser(resp))
             except asyncio.TimeoutError:
                 # A timeout just means another pass: go back and
                 # re-check is_running.
                 pass
Example #4
0
    async def downloader(self, item: _Request):
        """
        downloader: request -> request middleware -> fetch -> response middleware -> return

        The incoming item is registered with ``self.add_request`` first.
        If any 'request' middleware is registered, it may replace the item.
        A falsy result skips the rest of the pipeline.  Otherwise a
        ``_Request`` is fetched through ``self.from_request`` and the
        result is passed through the 'response' middleware, if any.

        Returns:
            Whatever the last executed stage produced.
        """
        logger.info('run downloader')
        logger.info('request: {}'.format(item))

        self.add_request(item)

        outcome = item
        if self._middleware_funcs.get('request'):
            outcome = await self._run_middleware(outcome, 'request')

        if outcome:
            # Middleware may have replaced the request with something
            # else; only real requests are fetched.
            if isinstance(outcome, _Request):
                outcome = await self.from_request(outcome)
            if self._middleware_funcs.get('response'):
                outcome = await self._run_middleware(outcome, 'response')

        logger.info('response: {}'.format(outcome))
        logger.info('end downloader')

        return outcome
Example #5
0
    async def parser(self, response: _Response):
        """
        parser: apply the rules and the request callback to a response

        Every rule in ``self._rules`` is run with ``rule.search(response)``
        and each truthy result is forwarded to ``self.async_put_item``.
        The ``callback`` attribute of ``response.current_request`` is then
        resolved:

        * a class is instantiated, ``load(response)`` is called on the
          instance, and the value of ``process(response)`` is forwarded;
        * a plain function is called with the response and its return
          value is forwarded;
        * anything else (including a missing callback, i.e. ``None``) is
          forwarded unchanged.
        """
        logger.info('run parser')
        logger.info('response: {}'.format(response))

        # Iterating an empty collection is a no-op, so no length guard.
        for rule in self._rules:
            found = rule.search(response)
            if found:
                await self.async_put_item(found)

        handler = getattr(response.current_request, 'callback', None)
        if isclass(handler):
            model = handler()
            model.load(response)
            produced = model.process(response)
        elif isfunction(handler):
            produced = handler(response)
        else:
            produced = handler
        await self.async_put_item(produced)

        logger.info('end parser')
Example #6
0
    def run(self):
        """
        process: main program

        Drives the whole crawl on ``self.loop``:

        1. runs ``self.init()`` to completion;
        2. starts ``self.async_limit`` ``dispather`` workers and blocks
           until all of them finish;
        3. on ``KeyboardInterrupt`` cancels every task on the loop and
           waits for the cancellations to be processed;
        4. always calls ``self.close()``, closes the loop, and logs the
           request count and elapsed time.
        """
        logger.info('START SPIDER')
        start_time = datetime.now()
        try:
            logger.info('run init_spider')
            self.loop.run_until_complete(self.init())
            logger.info('end init_spider')

            # asyncio.wait() stopped accepting bare coroutines in Python
            # 3.11, so wrap each worker in a Task bound to our loop first.
            workers = [
                asyncio.ensure_future(self.dispather(taskid), loop=self.loop)
                for taskid in range(self.async_limit)
            ]
            tasks = asyncio.wait(workers)
            logger.info('run main_spider')
            self.loop.run_until_complete(tasks)
            logger.info('end main_spider')

        except KeyboardInterrupt:
            logger.info('keyboard cancel all tasks')
            # asyncio.Task.all_tasks() was removed in Python 3.9; prefer
            # the module-level function and fall back on old interpreters.
            all_tasks = (getattr(asyncio, 'all_tasks', None)
                         or asyncio.Task.all_tasks)
            pending = list(all_tasks(self.loop))
            for task in pending:
                task.cancel()
            # Nothing ever stopped the loop for run_forever(), so it would
            # hang.  Instead run only until every cancelled task has
            # finished; return_exceptions keeps CancelledError from
            # propagating out of the handler.
            if pending:
                self.loop.run_until_complete(
                    asyncio.gather(*pending, return_exceptions=True))
        finally:
            self.close()
            self.loop.close()
            logger.info('Request Count: {}'.format(self.request_count))
            logger.info('Time Usage: {}'.format(datetime.now() - start_time))
            logger.info('CLOSE SPIDER')