Beispiel #1
0
 def replace(self, *args, **kwargs):
     """Delegate to ``Response.replace``, defaulting ``encoding`` to our own.

     If the caller does not pass an ``encoding`` keyword, ``self.encoding``
     is supplied; an explicit ``encoding`` is left untouched. Every other
     positional and keyword argument is forwarded unchanged.
     """
     # Read the attribute up front so it is evaluated on every call,
     # whether or not the caller supplied an encoding.
     current_encoding = self.encoding
     if 'encoding' not in kwargs:
         kwargs['encoding'] = current_encoding
     return Response.replace(self, *args, **kwargs)
    def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
        """Turn a Fetch API reply into a response bound to the original request.

        The incoming ``response`` is returned untouched when the request meta
        has no ``META_KEY`` entry, the entry is flagged ``"skip"``, or it holds
        no ``"original_request"``.

        Three failure conditions are detected, in this order: an
        ``X-Crawlera-Error`` response header, a body that is not valid JSON,
        and a JSON payload carrying ``"crawlera_error"`` or ``"error_code"``.
        Each one bumps the ``crawlera_fetch/response_error`` stats; then either
        ``CrawleraFetchException`` is raised (when ``self.raise_on_error`` is
        truthy) or a warning is logged and ``response`` is returned as is.

        On success the decoded payload is stored under
        ``crawlera_meta["upstream_response"]`` and a replacement response is
        built from the payload's ``"headers"``, ``"url"`` and ``"body"``.
        """
        crawlera_meta = request.meta.get(META_KEY, {})

        is_skipped = crawlera_meta.get("skip")
        if is_skipped or not crawlera_meta.get("original_request"):
            return response

        original_request = request_from_dict(crawlera_meta["original_request"], spider=spider)

        self.stats.inc_value("crawlera_fetch/response_count")
        self._calculate_latency(request)

        self.stats.inc_value("crawlera_fetch/api_status_count/{}".format(response.status))

        # 1) Error reported through the response headers.
        error_header = response.headers.get("X-Crawlera-Error")
        if error_header:
            header_text = error_header.decode("utf8")
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value("crawlera_fetch/response_error/{}".format(header_text))
            failure = "Error downloading <{} {}> (status: {}, X-Crawlera-Error header: {})".format(
                original_request.method,
                original_request.url,
                response.status,
                header_text,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(failure)
            logger.warning(failure)
            return response

        # 2) Body that cannot be parsed as JSON.
        try:
            payload = json.loads(response.text)
        except json.JSONDecodeError as exc:
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value("crawlera_fetch/response_error/JSONDecodeError")
            failure = "Error decoding <{} {}> (status: {}, message: {}, lineno: {}, colno: {})".format(
                original_request.method,
                original_request.url,
                response.status,
                exc.msg,
                exc.lineno,
                exc.colno,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(failure) from exc
            logger.warning(failure)
            return response

        # 3) Error reported inside the JSON payload.
        api_error = payload.get("crawlera_error") or payload.get("error_code")
        original_status = payload.get("original_status")
        if api_error:
            request_id = payload.get("id") or payload.get("uncork_id")
            detail = payload.get("body") or payload.get("message")
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value("crawlera_fetch/response_error/{}".format(api_error))
            template = (
                "Error downloading <{} {}> (Original status: {}, "
                "Fetch API error message: {}, Request ID: {})"
            )
            failure = template.format(
                original_request.method,
                original_request.url,
                original_status or "unknown",
                detail,
                request_id or "unknown",
            )
            if self.raise_on_error:
                raise CrawleraFetchException(failure)
            logger.warning(failure)
            return response

        self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status))

        # Keep the raw API reply reachable from the request meta.
        crawlera_meta["upstream_response"] = {
            "status": response.status,
            "headers": response.headers,
            "body": payload,
        }

        # Use the base64-decoded body when it decodes strictly; otherwise
        # fall back to the value exactly as it came in the payload.
        raw_body = payload["body"]
        try:
            decoded_body = base64.b64decode(raw_body, validate=True)
        except (binascii.Error, ValueError):
            decoded_body = raw_body

        upstream_headers = payload["headers"]
        upstream_url = payload["url"]
        response_class = responsetypes.from_args(
            headers=upstream_headers,
            url=upstream_url,
            body=decoded_body,
        )
        return response.replace(
            cls=response_class,
            request=original_request,
            headers=upstream_headers,
            url=upstream_url,
            body=decoded_body,
            status=original_status or 200,
        )
Beispiel #3
0
 def replace(self, *args, **kwargs):
     """Forward to ``Response.replace`` with ``encoding`` filled in.

     A caller-provided ``encoding`` keyword wins; when it is absent the
     value of ``self.encoding`` is used. All other arguments pass through
     unchanged.
     """
     # ``self.encoding`` is the fallback argument, so it is read on every
     # call; an existing key keeps both its value and its position.
     kwargs['encoding'] = kwargs.get('encoding', self.encoding)
     return Response.replace(self, *args, **kwargs)
Beispiel #4
0
    def process_response(self, request: Request, response: Response,
                         spider: Spider) -> Response:
        """Turn a Fetch API reply into a response bound to the original request.

        The incoming ``response`` is returned untouched when the request meta
        has no ``META_KEY`` entry, the entry is flagged ``"skip"``, or it holds
        no ``"original_request"``.

        Three failure conditions are checked, in order: an
        ``X-Crawlera-Error`` response header, a body that is not valid JSON,
        and a JSON payload with a truthy ``"crawlera_error"``. Each bumps the
        ``crawlera_fetch/response_error`` stats.

        Returns:
            ``response`` itself when processing is skipped or an error is
            detected with ``self.raise_on_error`` falsy (the error is logged);
            otherwise the result of ``response.replace`` built from the
            payload's ``"headers"``, ``"url"``, ``"body"`` and
            ``"original_status"``.

        Raises:
            CrawleraFetchException: when an error is detected and
                ``self.raise_on_error`` is truthy.
        """
        try:
            crawlera_meta = request.meta[META_KEY]
        except KeyError:
            crawlera_meta = {}

        if crawlera_meta.get(
                "skip") or not crawlera_meta.get("original_request"):
            return response

        original_request = request_from_dict(crawlera_meta["original_request"])

        self.stats.inc_value("crawlera_fetch/response_count")
        self._calculate_latency(request)

        self.stats.inc_value("crawlera_fetch/api_status_count/{}".format(
            response.status))

        # 1) Error reported through the response headers.
        if response.headers.get("X-Crawlera-Error"):
            message = response.headers["X-Crawlera-Error"].decode("utf8")
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value(
                "crawlera_fetch/response_error/{}".format(message))
            log_msg = "Error downloading <{} {}> (status: {}, X-Crawlera-Error header: {})"
            log_msg = log_msg.format(
                original_request.method,
                original_request.url,
                response.status,
                message,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(log_msg)
            else:
                logger.error(log_msg)
                return response

        # 2) Body that cannot be parsed as JSON.
        try:
            json_response = json.loads(response.text)
        except json.JSONDecodeError as exc:
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value(
                "crawlera_fetch/response_error/JSONDecodeError")
            log_msg = "Error decoding <{} {}> (status: {}, message: {}, lineno: {}, colno: {})"
            log_msg = log_msg.format(
                original_request.method,
                original_request.url,
                response.status,
                exc.msg,
                exc.lineno,
                exc.colno,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(log_msg) from exc
            else:
                logger.error(log_msg)
                return response

        # 3) Error reported inside the JSON payload.
        if json_response.get("crawlera_error"):
            error = json_response["crawlera_error"]
            # Read the optional fields leniently: a hard lookup here would
            # raise KeyError when the error payload omits "body" or
            # "original_status", hiding the real API error and bypassing the
            # raise_on_error handling below.
            message = json_response.get("body")
            original_status = json_response.get("original_status", "unknown")
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value(
                "crawlera_fetch/response_error/{}".format(error))
            log_msg = (
                "Error downloading <{} {}> (Original status: {}, Fetch API error message: {})"
            )
            log_msg = log_msg.format(
                original_request.method,
                original_request.url,
                original_status,
                message,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(log_msg)
            else:
                logger.error(log_msg)
                return response

        self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(
            json_response["original_status"]))

        # Keep the raw API reply reachable from the request meta.
        crawlera_meta["upstream_response"] = {
            "status": response.status,
            "headers": response.headers,
            "body": json_response,
        }
        respcls = responsetypes.from_args(
            headers=json_response["headers"],
            url=json_response["url"],
            body=json_response["body"],
        )
        return response.replace(
            cls=respcls,
            request=original_request,
            headers=json_response["headers"],
            url=json_response["url"],
            body=json_response["body"],
            status=json_response["original_status"],
        )
Beispiel #5
0
 def replace(self, *args, **kwargs):
     """Delegate to ``Response.replace``, defaulting the ``encoding`` keyword.

     When the caller gives no ``encoding``, the object's ``_encoding``
     attribute is used, or ``None`` if that attribute does not exist.
     All other arguments are forwarded unchanged.
     """
     # Look the attribute up unconditionally, using ``None`` when it is
     # not set on this object.
     fallback_encoding = getattr(self, "_encoding", None)
     if "encoding" not in kwargs:
         kwargs["encoding"] = fallback_encoding
     return Response.replace(self, *args, **kwargs)
Beispiel #6
0
 def replace(self, *args, **kwargs):
     """Call ``Response.replace`` on this object, defaulting ``encoding``.

     ``self.encoding`` is used only when the caller passes no ``encoding``
     keyword; everything else is forwarded unchanged.
     """
     own_encoding = self.encoding
     try:
         kwargs['encoding']
     except KeyError:
         kwargs['encoding'] = own_encoding
     # The method is invoked through the ``Response`` name, so this object
     # has to be handed over explicitly as the instance argument.
     return Response.replace(self, *args, **kwargs)