def process_response(self, request, response, spider):
    if not self._is_enabled_for_request(request):
        return response

    # The proxy returned a status code that marks it as banned: discard the
    # proxy and reschedule the request.
    if response.status in self.ban_code:
        self.invaild_proxy(request.meta['proxy'])
        logger.debug("Proxy[%s] ban because return httpstatuscode:[%s].",
                     request.meta['proxy'], str(response.status))
        new_request = request.copy()
        new_request.dont_filter = True
        return new_request

    # Optionally ban proxies whose response body matches a configured pattern.
    if self.ban_re:
        try:
            # Note: response.body is bytes, so ban_re should be a bytes
            # pattern (or the body decoded before matching).
            pattern = re.compile(self.ban_re)
        except TypeError:
            logger.error('Wrong "ban_re", please check settings')
            return response
        match = pattern.search(response.body)
        if match:
            self.invaild_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] ban because pattern match:[%s].",
                         request.meta['proxy'], str(match))
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

    # Response looks fine: count one more successful use of this proxy.
    p = request.meta['proxy']
    self.counter_proxy[p] = self.counter_proxy.setdefault(p, 1) + 1
    return response
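# A minimal sketch (not from the source) of how a ban-detecting proxy middleware
# like the one above is typically enabled in a Scrapy project's settings.py.
# Only DOWNLOADER_MIDDLEWARES is a standard Scrapy setting; the class path and
# the PROXY_BAN_* names are hypothetical stand-ins for whatever the middleware
# reads into self.ban_code and self.ban_re.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyBanMiddleware': 543,  # hypothetical class path
}
PROXY_BAN_CODE = [403, 429, 503]          # statuses treated as "proxy banned"
PROXY_BAN_RE = b'Access Denied|Captcha'   # bytes pattern, since response.body is bytes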
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1

    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']

    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust

        # Fetch a fresh proxy and attach it to the retried request.
        proxy_ip = "http://" + getProxy()
        retryreq.meta['proxy'] = proxy_ip

        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)

        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
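# getProxy() above is defined elsewhere in the original project. A hypothetical
# stand-in (not from the source) that returns an "ip:port" string drawn from a
# local pool could look like this.
import random

PROXY_POOL = ['10.0.0.1:8080', '10.0.0.2:8080']  # hypothetical addresses

def getProxy():
    return random.choice(PROXY_POOL)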
def baseQuery(request, lang):
    queryResults = []
    request['action'] = 'query'
    lastContinue = {'continue': ''}
    while True:
        # Clone the original request and merge in the values returned in the
        # 'continue' section of the previous result.
        req = request.copy()
        req.update(lastContinue)
        # Call the MediaWiki API and parse the JSON response. Callers should
        # include 'format': 'json' in the request so the body parses as JSON.
        result = requests.get('http://' + lang + '.wikipedia.org/w/api.php',
                              params=req).json()
        if 'error' in result:
            raise RuntimeError(result['error'])
        if 'warnings' in result:
            print(result['warnings'])
        if 'query' in result:
            queryResults.append(result['query'])
        if 'continue' not in result:
            break
        lastContinue = result['continue']
    return queryResults
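# A hypothetical usage sketch (not from the source): baseQuery() takes a dict of
# MediaWiki API parameters, adds action=query itself, and follows 'continue'
# paging, so a full-text search collected across all pages could look like this.
if __name__ == '__main__':
    params = {'list': 'search', 'srsearch': 'request.copy', 'format': 'json'}
    for page in baseQuery(params, 'en'):
        for hit in page['search']:
            print(hit['title'])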
def process_exception(self, request, exception, spider):
    if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
            and request.meta.get('proxy', False):
        self.invaild_proxy(request.meta['proxy'])
        logger.debug("Proxy[%s] connect exception[%s].",
                     request.meta['proxy'], exception)
        new_request = request.copy()
        new_request.dont_filter = True
        return new_request
def add_proxy(self, request):
    new_request = request.copy()
    new_request.meta["proxy"] = self.proxy_server
    new_request.headers["Proxy-Authorization"] = self.proxy_auth
    new_request.dont_filter = False
    logger.debug("Use proxy to request %s" % new_request.url)
    new_request.priority = new_request.priority + RETRY_PRIORITY_ADJUST
    time.sleep(HTTPPROXY_DELAY)
    return new_request
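# A hedged sketch (not from the source) of how self.proxy_server and
# self.proxy_auth used above are commonly prepared for Scrapy's
# Proxy-Authorization header, using w3lib's basic_auth_header helper.
# The endpoint and credentials are hypothetical.
from w3lib.http import basic_auth_header

proxy_server = 'http://proxy.example.com:8000'
proxy_auth = basic_auth_header('user', 'password')  # -> b'Basic dXNlcjpwYXNzd29yZA=='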
def process_exception(self, request, exception):
    from twisted.internet.error import TimeoutError

    if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
            and request.meta.get('proxy', False):
        self.invalid_proxy(request.meta['proxy'])
        logger.debug("Proxy[%s] connect exception[%s].",
                     request.meta['proxy'], exception)
        new_request = request.copy()
        new_request.dont_filter = True
        return new_request
    elif isinstance(exception, TimeoutError):
        print('TimeoutError; the request would be returned here')
        # return request
    elif isinstance(exception, TypeError):
        print('TypeError; the request would be returned here')
def __call__(self, request):
    requests_request = request.copy()
    destination_urls = self._create_forwarded_urls(requests_request.url)

    # Use gzip even if the original requestor didn't support it
    requests_request.headers['Accept-Encoding'] = 'gzip,identity'
    # Host header is automatically added for each request by grequests
    del requests_request.headers['Host']

    requests = (grequests.request(requests_request.method,
                                  destination_url,
                                  data=requests_request.body,
                                  headers=requests_request.headers,
                                  allow_redirects=False,
                                  verify=True)
                for destination_url in destination_urls)

    exhandler = partial(helpers.log_failures, environ=request.environ)
    requests_responses = grequests.map(requests, stream=True,
                                       exception_handler=exhandler)
    self._log_responses(request, requests_responses)
    requests_responses = self._filter_responses(requests_responses)

    response = None
    if None in requests_responses:
        response = MetadataResponse(request, requests_responses)
        response.response.status = 504
    elif ('Proxy-Aggregator-Body' in request.headers
          and request.headers['Proxy-Aggregator-Body'].lower() == 'response-metadata'):
        response = MetadataResponse(request, requests_responses)
    elif len(requests_responses) == 1:
        response = SingleResponse(request, requests_responses[0])
    elif any(r.status_code >= 400 for r in requests_responses):
        response = ErrorResponse(request, requests_responses, self._priority_errors)
    else:
        response = MultipleResponse(request, requests_responses)

    return response.response
def download_image(self, request):
    """Attempt to download an image from the grabber's URL.

    By default, this will attempt to use the built in
    ``get_image_from_url(url)`` method, however a different callable can be
    chosen by setting the grabber's ``download_callable`` attribute.

    The returned "result" dict contains the downloaded image (if download
    was successful), information about any errors, the requested URL, etc.,
    which should be enough for result handlers to do their thing.

    The callable used as the downloader must have the following signature::

        def some_downloader(url, grabber):

    Args:
        dict: The "request" dictionary.

    Returns:
        dict: A "result" dictionary.
    """
    url = request['url']
    download_callable = self.get_download_callable()

    result = request.copy()
    result['error'] = None

    im = None
    try:
        im = download_callable(url, self)
    except Exception as e:
        if not self.ignore_download_exception(e):
            raise e
        result['error'] = e

    result['image'] = im
    return result
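# A hypothetical downloader callable (not from the source) matching the
# signature documented above. It uses the requests and Pillow libraries and
# ignores the grabber argument; whether a PIL Image is what the original
# grabber expects as the "image" result is an assumption.
import io

import requests
from PIL import Image


def some_downloader(url, grabber):
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    # Parse the response body into a PIL image object.
    return Image.open(io.BytesIO(resp.content))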