def process_request(self, request: Request):
    """Backfill any request fields the caller left unset with middleware defaults.

    Mutates *request* in place; explicitly provided values are never overridden.
    Returns None so the request continues through the middleware chain.
    """
    fallbacks = (
        ('timeout', self.settings.DEFAULT_TIMEOUT),
        ('headers', self.settings.DEFAULT_HEADERS),
        ('meta', {}),  # fresh dict per request so meta is never shared
    )
    for field, default in fallbacks:
        if getattr(request, field) is None:
            setattr(request, field, default)
def make_request(self, word):
    """Build the seed request for *word* against the credit-info index page.

    The raw word travels in ``meta['keyword']`` for later callbacks, while the
    URL-encoded form goes into the query parameters.
    """
    encoded = quote_plus(word)
    return Request(
        'https://www.creditchina.gov.cn/xinyongxinxi/index.html',
        params={'index': 0, 'keyword': encoded},
        callback=self.parse,
        meta={'keyword': word},
    )
def get_detail(self, response: Response):
    """Yield one detail-API request per company found in the search response.

    Each yielded request carries a partially filled CreditItem in
    ``meta['item']`` for the next callback to complete.
    """
    payload = response.json
    if not payload:
        return
    for company in payload.get('data', {}).get('results', []):
        encry_str = company['encryStr']
        item = CreditItem()
        item['encry_str'] = encry_str
        detail_url = (
            f'https://www.creditchina.gov.cn/api/credit_info_detail?encryStr={encry_str}'
        )
        yield Request(
            detail_url,
            callback=self.get_detail2,
            format_type='json',
            meta={'item': item},
        )
def process_response(self, request: Request, response: Response):
    """Re-schedule requests whose status is neither 200-301 nor explicitly allowed.

    Returns the original request (signalling a retry) while the retry budget
    lasts; returns None to let the response continue downstream unchanged.
    """
    # Acceptable status: pass the response through untouched.
    if 200 <= response.status <= 301 or response.status in self.settings.ALLOWED_CODES:
        return None
    self.logger.debug(
        'The response status <{code}> is not in ALLOWED_CODES',
        code=response.status)
    retry_count = request.meta.get('retry_count', 0) + 1
    # BUG FIX: the original incremented retry_count and re-returned the request
    # unconditionally, so a permanently bad status retried forever. Cap retries
    # at settings.MAX_RETRY_COUNT (default 3) and give up afterwards.
    max_retries = getattr(self.settings, 'MAX_RETRY_COUNT', 3)
    if retry_count > max_retries:
        self.logger.debug(
            'Giving up on <{url}> after {count} failed attempts',
            url=request.url, count=retry_count)
        return None
    request.meta['retry_count'] = retry_count
    return request
async def __handle_downloader_exception(self, request: Request, exception: Exception):
    """Offer a download exception to the middlewares; fall back to the error queue.

    The first middleware returning a truthy value claims the exception and its
    value is returned. When none does, the request is appended to the
    scheduler's error queue and the spider-declared err_callback (if any) gets
    a chance to produce a result.
    """
    result = None
    for mw in self.__middlewares:
        result = await self.__run_task(mw.process_exception(request, exception))
        if result:
            break
    if result is not None:
        return result
    # Nobody handled it: park the failed request on the error queue.
    await self.__job_scheduler.spawn(
        self.__run_task(self._scheduler.append_error_request(request)))
    # Invoke the error callback only when the spider class actually defines it.
    if isfunction(request.err_callback) and hasattr(
            self._spider.__class__, request.err_callback.__name__):
        result = request.err_callback(request, exception)
    return result
def parse(self, response):
    """Fan out paginated search-API requests for the keyword carried in meta."""
    search_url = 'https://www.creditchina.gov.cn/api/credit_info_search'
    # Loop-invariant: the keyword never changes between pages.
    encoded_keyword = quote_plus(response.meta.get('keyword', ''))
    # Only the first 5 result pages are requested, 10 entries per page.
    for page_no in range(1, 6):
        yield Request(
            search_url,
            params={
                'keyword': encoded_keyword,
                'templateId': '',
                'page': page_no,
                'pageSize': '10',
            },
            callback=self.get_detail,
            format_type='json',
            cookies=response.cookies,
        )
async def __handle_downloader_response(self, request: Request, response: Response):
    """Run a downloaded response through the middlewares, then the request callback.

    A middleware may substitute its own Response (which replaces the current
    one) or return any other truthy value, which short-circuits the chain and
    is returned as-is without invoking the spider callback.
    """
    response = self.__downloader.__parse_html__(request, response)
    outcome = None
    for mw in self.__middlewares:
        outcome = await self.__run_task(mw.process_response(request, response))
        if not outcome:
            continue
        if isinstance(outcome, Response):
            # The middleware swapped in a replacement response.
            response = outcome
        break
    # Dispatch to the callback unless a middleware produced a non-Response value.
    if outcome is None or isinstance(outcome, Response):
        logger.success(
            'Crawled ({status}) <{method} {url}>',
            status=response.status,
            method=request.method,
            url=request.url,
        )
        # Carry the request's meta over so callbacks can read it off the response.
        response.meta = request.meta
        if hasattr(self._spider.__class__, request.callback.__name__):
            outcome = request.callback(response)
    return outcome
def process_request(self, request: Request):
    """Stamp the outgoing request with a freshly drawn random User-Agent header."""
    user_agent = self.__ua.random
    request.headers['User-Agent'] = user_agent