Ejemplo n.º 1
0
    def process_request(self, request: Request):
        """Fill in unset request fields from the configured defaults.

        ``timeout`` and ``headers`` fall back to ``DEFAULT_TIMEOUT`` and
        ``DEFAULT_HEADERS`` read from ``self.settings``; ``meta`` falls back
        to a new empty dict. Fields that are not ``None`` are left untouched.
        The request is modified in place and nothing is returned.
        """
        import copy

        if request.timeout is None:
            request.timeout = self.settings.DEFAULT_TIMEOUT
        if request.headers is None:
            # Hand out a shallow copy: assigning the settings object itself
            # would let any later in-place header edit on one request leak
            # into the shared defaults and thus into every other request.
            request.headers = copy.copy(self.settings.DEFAULT_HEADERS)

        if request.meta is None:
            request.meta = {}
Ejemplo n.º 2
0
 def make_request(self, word):
     """Build the search-page request for a single keyword.

     The keyword is passed through ``quote_plus`` for the query parameters
     and stored unmodified under ``meta['keyword']``; the response is routed
     to ``self.parse``.
     """
     query = {'index': 0, 'keyword': quote_plus(word)}
     search_page = 'https://www.creditchina.gov.cn/xinyongxinxi/index.html'
     request = Request(
         search_page,
         params=query,
         callback=self.parse,
         meta={'keyword': word},
     )
     return request
Ejemplo n.º 3
0
 def get_detail(self, response: Response):
     """Yield one detail request per entry of a search-result response.

     Nothing is yielded when ``response.json`` is falsy. Otherwise every
     entry under ``data -> results`` produces a request to the detail API,
     carrying a fresh ``CreditItem`` (with ``encry_str`` filled in) in
     ``meta['item']`` and routed to ``self.get_detail2``.
     """
     if not response.json:
         return

     detail_api = 'https://www.creditchina.gov.cn/api/credit_info_detail?encryStr={}'
     entries = response.json.get('data', {}).get('results', [])
     for entry in entries:
         token = entry['encryStr']
         detail_url = detail_api.format(token)

         item = CreditItem()
         item['encry_str'] = token

         yield Request(
             detail_url,
             callback=self.get_detail2,
             format_type='json',
             meta={'item': item},
         )
Ejemplo n.º 4
0
    def process_response(self, request: Request, response: Response):
        """Send a request back for retry when its response status is rejected.

        A status is accepted when it lies in the inclusive range 200..301 or
        is listed in ``self.settings.ALLOWED_CODES``; in that case nothing is
        returned. Otherwise the rejection is logged at debug level,
        ``request.meta['retry_count']`` is incremented (starting from 0) and
        the request itself is returned.
        """
        status = response.status
        if 200 <= status <= 301 or status in self.settings.ALLOWED_CODES:
            return None

        self.logger.debug(
            'The response status <{code}> is not in ALLOWED_CODES',
            code=status)

        request.meta['retry_count'] = request.meta.get('retry_count', 0) + 1
        return request
Ejemplo n.º 5
0
    async def __handle_downloader_exception(self, request: Request, exception: Exception):
        """Dispatch a download exception to the middlewares, then to the error callback.

        Each middleware's ``process_exception`` is run in order; the first
        truthy result ends the chain and is returned. If the last result is
        a non-``None`` falsy value it is returned unchanged. When the result
        is ``None`` (including when there are no middlewares), the request is
        handed to the scheduler's ``append_error_request`` through the job
        scheduler, and ``request.err_callback`` is invoked with
        ``(request, exception)`` provided it is a plain function whose name
        exists on the spider's class; its return value is then the result.
        """
        outcome = None
        for mw in self.__middlewares:
            outcome = await self.__run_task(mw.process_exception(request, exception))
            if outcome:
                return outcome

        if outcome is not None:
            return outcome

        # No middleware dealt with the failure: record the request as errored.
        error_task = self.__run_task(self._scheduler.append_error_request(request))
        await self.__job_scheduler.spawn(error_task)

        err_callback = request.err_callback
        if isfunction(err_callback) and hasattr(self._spider.__class__, err_callback.__name__):
            return err_callback(request, exception)

        return outcome
Ejemplo n.º 6
0
 def parse(self, response):
     """Yield search-API requests for result pages 1 through 5.

     The keyword comes from ``response.meta['keyword']`` (empty string when
     absent) and is passed through ``quote_plus``. Each request forwards
     ``response.cookies``, asks for JSON formatting and is routed to
     ``self.get_detail``.
     """
     search_api = 'https://www.creditchina.gov.cn/api/credit_info_search'
     last_page = 5

     page = 1
     while page <= last_page:
         query = {
             'keyword': quote_plus(response.meta.get('keyword', '')),
             'templateId': '',
             'page': page,
             'pageSize': '10'
         }
         yield Request(
             search_api,
             params=query,
             callback=self.get_detail,
             format_type='json',
             cookies=response.cookies,
         )
         page += 1
Ejemplo n.º 7
0
    async def __handle_downloader_response(self, request: Request, response: Response):
        """Run a downloaded response through the middlewares and the request callback.

        The response is first passed to the downloader's ``__parse_html__``.
        Each middleware's ``process_response`` then runs in order; the first
        truthy result stops the chain, and if that result is a ``Response``
        it replaces the current response. When the final result is ``None``
        or a ``Response``, the crawl is logged, ``request.meta`` is attached
        to the response, and ``request.callback`` is called with the response
        if a same-named attribute exists on the spider's class, its return
        value becoming the result. Any other middleware result is returned
        as is.
        """
        response = self.__downloader.__parse_html__(request, response)

        outcome = None
        for mw in self.__middlewares:
            outcome = await self.__run_task(mw.process_response(request, response))
            if not outcome:
                continue
            if isinstance(outcome, Response):
                response = outcome
            break

        if not (isinstance(outcome, Response) or outcome is None):
            # A middleware produced something other than a response;
            # hand it back without invoking the callback.
            return outcome

        logger.success(
            'Crawled ({status}) <{method} {url}>',
            status=response.status,
            method=request.method,
            url=request.url,
        )

        response.meta = request.meta
        if not hasattr(self._spider.__class__, request.callback.__name__):
            return outcome
        return request.callback(response)
Ejemplo n.º 8
0
 def process_request(self, request: Request):
     """Set the request's ``User-Agent`` header to ``self.__ua.random``."""
     user_agent = self.__ua.random
     request.headers['User-Agent'] = user_agent