Example #1
    def open_spider(self, spider):
        self.enabled = self.is_enabled(spider)
        self.spider = spider

        for k, type_ in self._settings:
            setattr(self, k, self._get_setting_value(spider, k, type_))

        self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS',
                                                  {}).items()
        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)

        if not self.enabled and not self.force_enable_on_http_codes:
            return

        if not self.apikey:
            logging.warning("Crawlera can't be used without a APIKEY",
                            extra={'spider': spider})
            return

        self._proxyauth = self.get_proxyauth(spider)

        logging.info(
            "Using crawlera at %s (apikey: %s)" % (self.url, self.apikey[:7]),
            extra={'spider': spider},
        )

        if not self.preserve_delay:
            # Setting spider download delay to 0 to get maximum crawl rate
            spider.download_delay = 0
            logging.info(
                "CrawleraMiddleware: disabling download delays on Scrapy side to optimize delays introduced by Crawlera. "
                "To avoid this behaviour you can use the CRAWLERA_PRESERVE_DELAY setting but keep in mind that this may slow down the crawl significantly",
                extra={'spider': spider},
            )
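
Example #1 seeds self.exp_backoff with a fresh generator, and the process_response examples below draw delays from it with next(). The helper itself never appears in these excerpts; here is a minimal sketch of what it might look like, assuming capped exponential growth with full jitter (the parameter names and the jitter strategy are assumptions, not the library's confirmed code):

import random

def exp_backoff(step, max_delay):
    """Yield jittered delays that grow exponentially, capped at max_delay (sketch)."""
    attempt = 0
    while True:
        # full jitter: pick uniformly between 0 and the capped exponential delay
        yield random.uniform(0.0, min(max_delay, step * 2 ** attempt))
        # bound the exponent so step * 2 ** attempt cannot overflow a float
        attempt = min(attempt + 1, 64)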
Example #2
    def process_response(self, request, response, spider):
        if not self._is_enabled_for_request(request):
            return response
        key = self._get_slot_key(request)
        self._restore_original_delay(request)

        if self._is_no_available_proxies(response):
            self._set_custom_delay(request, next(self.exp_backoff))
        else:
            self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)

        if self._is_banned(response):
            self._bans[key] += 1
            if self._bans[key] > self.maxbans:
                self.crawler.engine.close_spider(spider, 'banned')
            else:
                after = response.headers.get('retry-after')
                if after:
                    self._set_custom_delay(request, float(after))
            self.crawler.stats.inc_value('crawlera/response/banned')
        else:
            self._bans[key] = 0
        # If placed behind `RedirectMiddleware`, it would not count 3xx responses
        self.crawler.stats.inc_value('crawlera/response')
        self.crawler.stats.inc_value('crawlera/response/status/%s' %
                                     response.status)
        crawlera_error = response.headers.get('X-Crawlera-Error')
        if crawlera_error:
            self.crawler.stats.inc_value('crawlera/response/error')
            self.crawler.stats.inc_value('crawlera/response/error/%s' %
                                         crawlera_error.decode('utf8'))
        return response
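
process_response leans on _set_custom_delay and _restore_original_delay, neither of which is part of the excerpt. A sketch of how they could manage the downloader slot's delay, assuming self._saved_delays is a per-slot mapping defaulting to None (the helper names and details are assumptions):

    def _get_slot(self, request):
        """Return the slot key and the downloader slot handling this request (sketch)."""
        key = self._get_slot_key(request)
        return key, self.crawler.engine.downloader.slots.get(key)

    def _set_custom_delay(self, request, delay):
        """Override the slot delay, remembering the original value once."""
        key, slot = self._get_slot(request)
        if not slot:
            return
        if self._saved_delays[key] is None:
            self._saved_delays[key] = slot.delay
        slot.delay = delay

    def _restore_original_delay(self, request):
        """Put back the delay saved by _set_custom_delay, if any."""
        key, slot = self._get_slot(request)
        if not slot:
            return
        if self._saved_delays[key] is not None:
            slot.delay, self._saved_delays[key] = self._saved_delays[key], None

Note that Example #3 passes an extra reason keyword to _set_custom_delay for stats bookkeeping; the sketch above matches the simpler signature used here.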
Example #3
    def process_response(self, request, response, spider):
        if not self._is_enabled_for_request(request):
            return self._handle_not_enabled_response(request, response)

        if not self._is_crawlera_response(response):
            return response

        key = self._get_slot_key(request)
        self._restore_original_delay(request)

        if self._is_no_available_proxies(response) or self._is_auth_error(response):
            if self._is_no_available_proxies(response):
                reason = 'noslaves'
            else:
                reason = 'autherror'
            self._set_custom_delay(request, next(self.exp_backoff), reason=reason)
        else:
            self.crawler.stats.inc_value('crawlera/delay/reset_backoff')
            self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)

        if self._is_auth_error(response):
            # When Crawlera has issues it might not be able to authenticate
            # users, so we must retry
            retries = request.meta.get('crawlera_auth_retry_times', 0)
            if retries < self.max_auth_retry_times:
                return self._retry_auth(response, request, spider)
            else:
                self.crawler.stats.inc_value('crawlera/retries/auth/max_reached')
                logging.warning(
                    "Max retries for authentication issues reached, please check auth"
                    " information settings",
                    extra={'spider': self.spider},
                )

        if self._is_banned(response):
            self._bans[key] += 1
            if self._bans[key] > self.maxbans:
                self.crawler.engine.close_spider(spider, 'banned')
            else:
                after = response.headers.get('retry-after')
                if after:
                    self._set_custom_delay(request, float(after), reason='banned')
            self.crawler.stats.inc_value('crawlera/response/banned')
        else:
            self._bans[key] = 0
        # If placed behind `RedirectMiddleware`, it would not count 3xx responses
        self.crawler.stats.inc_value('crawlera/response')
        self.crawler.stats.inc_value('crawlera/response/status/%s' % response.status)
        crawlera_error = response.headers.get('X-Crawlera-Error')
        if crawlera_error:
            self.crawler.stats.inc_value('crawlera/response/error')
            self.crawler.stats.inc_value(
                'crawlera/response/error/%s' % crawlera_error.decode('utf8'))
        return response
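
The ban and availability checks (_is_banned, _is_no_available_proxies, _is_auth_error) are also left out of the excerpts. They plausibly compare the response status against a configured ban code and inspect the X-Crawlera-Error header; the exact status codes and header values below are illustrative assumptions:

    def _is_no_available_proxies(self, response):
        # an exhausted proxy pool is assumed to be reported as 'noslaves'
        return (response.status == self.ban_code and
                response.headers.get('X-Crawlera-Error') == b'noslaves')

    def _is_banned(self, response):
        return (response.status == self.ban_code and
                response.headers.get('X-Crawlera-Error') == b'banned')

    def _is_auth_error(self, response):
        # 407 Proxy Authentication Required; the header value is assumed
        return (response.status == 407 and
                response.headers.get('X-Crawlera-Error') == b'bad_proxy_auth')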
Example #4
    def open_spider(self, spider):
        self.enabled = self.is_enabled(spider)
        if not self.enabled:
            return

        for k, type_ in self._settings:
            setattr(self, k, self._get_setting_value(spider, k, type_))

        self._proxyauth = self.get_proxyauth(spider)
        logging.info("Using crawlera at %s (apikey: %s)" % (
            self.url,
            self.apikey[:7])
        )

        if not self.preserve_delay:
            # Setting spider download delay to 0 to get maximum crawl rate
            spider.download_delay = 0
            logging.info(
                "CrawleraMiddleware: disabling download delays on Scrapy side to optimize delays introduced by Crawlera. "
                "To avoid this behaviour you can use the CRAWLERA_PRESERVE_DELAY setting but keep in mind that this may slow down the crawl significantly")

        self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
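
For context, the middleware is activated through project settings. An illustrative settings.py fragment, assuming the middleware path and priority from the scrapy-crawlera documentation (the header value is only an example):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 610,
}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<your apikey>'
# keep the spider's download_delay instead of zeroing it (see open_spider above)
CRAWLERA_PRESERVE_DELAY = True
CRAWLERA_DEFAULT_HEADERS = {'X-Crawlera-Profile': 'desktop'}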