def __init__(self, settings, stats, crawler):
    # NOTE(review): this module-level __init__ is byte-for-byte the same as
    # ProxyMeshMiddleware.__init__ defined below — it looks like a stray
    # duplicate left behind by an edit.  Confirm nothing imports it, then drop.
    super(ProxyMeshMiddleware, self).__init__(settings, stats, crawler)
    # Credential / rotation helper for the ProxyMesh account.
    self.proxies = ProxyAuth(settings, crawler)
    self.resp_headers = {}
    self.prev_prox = None
    self.authorized = set()
    self.getting_new = None  # marker guarding concurrent proxy refreshes
    self.fiveo9ct = 0        # running count of HTTP 509 (bandwidth) responses
    # Select an initial proxy right away so the first request has one.
    self.proxies.next_proxy()
class ProxyMeshMiddleware(ProxyPoolMiddlewareBase):
    """Downloader middleware that routes requests through the ProxyMesh service.

    ProxyMesh flags proxy-level failures with the ``X-ProxyMesh-Error``
    response header and lets clients steer IP selection through the
    ``X-ProxyMesh-IP`` / ``X-ProxyMesh-Not-IP`` / ``X-ProxyMesh-Prefer-IP``
    request headers.  The ``handle_<status>`` methods are dispatched by the
    base class for the matching HTTP status codes.
    """

    proxy_name = "proxy-mesh"

    def __init__(self, settings, stats, crawler):
        super(ProxyMeshMiddleware, self).__init__(settings, stats, crawler)
        # Credential / rotation helper for the ProxyMesh account.
        self.proxies = ProxyAuth(settings, crawler)
        self.resp_headers = {}
        self.prev_prox = None
        self.authorized = set()
        self.getting_new = None  # marker guarding concurrent proxy refreshes
        self.fiveo9ct = 0        # running count of HTTP 509 (bandwidth) responses
        # Select an initial proxy right away so the first request has one.
        self.proxies.next_proxy()

    def process_response(self, request, response, spider):
        """Count successful proxy-domain retries, then defer to the base class."""
        if "retry" in request.meta:
            self.stats.inc_value('%s/retry_proxy_domain_success' % self.proxy_name)
        return super(ProxyMeshMiddleware, self)._process_response(request, response, spider)

    def record_good_ip(self, domain, ip):
        """Record a working proxy IP for *domain* and bump the success stat."""
        self.stats.inc_value('%s/success' % self.proxy_name)
        super(ProxyMeshMiddleware, self).record_good_ip(domain, ip)

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build the middleware and wire spider signals."""
        obj = cls(crawler.settings, crawler.stats, crawler)
        crawler.signals.connect(obj.spider_opened, signals.spider_opened)
        crawler.signals.connect(obj.spider_closed, signals.spider_closed)
        # The ProxyAuth helper has its own open/close hooks to connect.
        crawler.signals.connect(obj.proxies.spider_opened, signals.spider_opened)
        crawler.signals.connect(obj.proxies.spider_closed, signals.spider_closed)
        crawler.proxies = obj
        return add_downloadmw_to_crawler(obj, crawler)

    def _sanitize_headers(self, request):
        # ProxyMesh control headers are meaningful end-to-end: strip nothing.
        return request

    def get_next_proxy(self, prox_set=None, authorized=None):
        """Advance to the next ProxyMesh proxy and reset per-domain IP books.

        The old good/bad IP maps are stashed in ``_old_stats`` before being
        cleared, since IP reputations do not carry across proxy gateways.
        """
        self.proxies.next_proxy()
        self._old_stats = (self.bad_ips, self.good_ips)
        for domain in self.domains:
            self.good_ips[domain] = {}
            self.bad_ips[domain] = {}

    def spider_closed(self, spider):
        """Dump the good/bad IP bookkeeping into crawler stats on shutdown."""
        super(ProxyMeshMiddleware, self).spider_closed(spider)
        self.stats.set_value('%s/good_ips/%s' % (self.proxy_name, spider.name), self.good_ips)
        self.stats.set_value('%s/bad_ips/%s' % (self.proxy_name, spider.name), self.bad_ips)

    def get_proxy(self, url, prev=False, encode=True):
        """Return ``(proxy_address, auth)`` for the scheme of *url*.

        *encode* selects the base64 basic-auth credentials over the raw
        ``user:pass`` pair.  The scheme prefix is added when the configured
        proxy entry does not already carry one.
        """
        protocol = urlparse(url).scheme or "http"
        cprox = self.proxies.current_proxy
        auth = self.proxies.basic_auth if encode else self.proxies.user_pass
        address = cprox.get(protocol)
        # BUGFIX: the original tested cprox.get(protocol.startswith(protocol)),
        # i.e. cprox.get(True), which is always falsy — the intent is clearly
        # "does the configured address already start with its scheme?".
        if address and not address.startswith(protocol):
            return (protocol + "://" + address, auth)
        return (address, auth)

    def filter_ips(self, request, domain):
        """Ask ProxyMesh to avoid IPs previously marked bad for *domain*."""
        avoid_ips = self.avoid_ips(domain)
        if avoid_ips:
            request.headers["X-ProxyMesh-Not-IP"] = ",".join(avoid_ips)

    def set_proxy(self, request, domain):
        """Assign the current proxy to *request*, then apply the IP filters."""
        super(ProxyMeshMiddleware, self).set_proxy(request, domain)
        self.filter_ips(request, domain)

    def is_proxy_error(self, response):
        """Proxy mesh provides a simple header for us to read if the proxy
        is the source of the error code"""
        return 'X-ProxyMesh-Error' in response.headers

    #
    # SCRAPY API Handlers start here
    #
    def get_ip_used(self, response, request):
        # Exit IP that ProxyMesh actually used for this response.
        return response.headers.get('X-Proxymesh-Ip')

    def handle_500(self, request, response, spider):
        return self.set_retry(request, response)

    def handle_502(self, request, response, spider):
        """502: ProxyMesh could not satisfy our IP constraints — relax them."""
        if not self.is_proxy_error(response):
            raise self.NextHandler
        self.log("proxy-error", response,
                 "No available proxies at current time given constraints provided %s" % (response.headers),
                 level="ERROR")
        domain = self.domain_parser(response.url)
        self.stats.inc_value('%s/fail' % self.proxy_name)
        if "X-ProxyMesh-IP-Not-Found" in response.headers:
            # Downgrade the hard IP pin to a soft preference and drop filters.
            request.headers["X-ProxyMesh-Prefer-IP"] = self.get_good_ip(domain)
            if "X-ProxyMesh-IP" in request.headers:
                del request.headers["X-ProxyMesh-IP"]
            request.headers.pop("X-ProxyMesh-Not-IP", None)
            return self.set_retry(request, response, True)
        return response

    def handle_503(self, request, response, spider):
        """503: service unavailable from the proxy — rotate or back off."""
        if not self.is_proxy_error(response):
            raise self.NextHandler
        self.stats.inc_value('%s/fail' % self.proxy_name)
        self.log("proxy-error", response,
                 "status [%d]: Failed to authorize proxy service with message [%s]" % (response.status, response.body),
                 level="ERROR")
        domain = self.domain_parser(response.url)
        if len(self.bad_ips[domain]) == 10:
            # prevent repeated calls
            # NOTE(review): self.getting_new starts as None — this comparison
            # assumes Python 2 ordering semantics; it would raise on Python 3.
            if request.meta["prox_set"] > self.getting_new:
                self.get_next_proxy()
            else:
                sleep(10)
            self.set_proxy(request, domain)
            return self.set_retry(request, response)
        return response

    def handle_509(self, request, response, spider):
        """509: bandwidth exceeded — retry twice, then shut the middleware off."""
        if not self.is_proxy_error(response):
            raise self.NextHandler
        self.stats.inc_value('%s/fail' % self.proxy_name)
        self.fiveo9ct += 1
        if self.fiveo9ct > 2:
            self.log("proxy-error", response,
                     "status [%d]: Proxy Usage over Bandwidth Limits with message [%s]" % (response.status, response.body),
                     level="ERROR")
            raise NotConfigured
        return self.set_retry(request, response, True)

    def handle_402(self, request, response, spider):
        """402: payment/authorization failure — re-sync the proxy, else rotate."""
        if not self.is_proxy_error(response):
            raise self.NextHandler
        # request = response.request
        prox_set = self.get_proxy(request.url)
        domain = self.domain_parser(response.url)
        self.log("proxy-error", response,
                 "status [%d]: Failed to authorize proxy service with message [%s]" % (response.status, response.body),
                 level="ERROR")
        if prox_set:
            proxy_address, _auth = prox_set
            if request.meta["proxy"] != proxy_address:
                # Stale proxy on the request: point it at the current one.
                self.set_proxy(request, domain)
                return self.set_retry(request, response, True)
        # in the case where we switched to a new proxy, but some request got through before the engine paused
        # self.log( "refresh-proxies" , "Calling ProxyMesh API to reload proxy list" , level = "WARNING" )
        if self.proxies.proxies:
            self.get_next_proxy()
            self.set_proxy(request, domain)
            return self.set_retry(request, response, True)
        else:
            raise NotConfigured

    def handle_407(self, request, response, spider):
        # Proxy-auth-required is treated the same as a 402 authorization failure.
        return self.handle_402(request, response, spider)

    def handle_404(self, request, response, spider):
        # A 404 is a legitimate site answer, never a proxy fault.
        return response

    def set_ip(self, request, domain):
        """Pin the request to a known-good exit IP for *domain*, if any."""
        if domain in self.good_ips:
            goodip = self.get_good_ip(domain)
            if goodip:
                request.headers['X-ProxyMesh-IP'] = goodip

    def handle_400(self, request, response, spider):
        """Ban detection: decide whether the exit IP is (possibly) banned.

        NOTE(review): the log messages talk about 403 although this handler is
        registered for 400 — confirm which status was actually intended.
        """
        ip = self.get_ip_used(response, request)
        is_banned, by_counting = self.check_ban(ip, request, response, spider)
        domain = self.domain_parser(response.url)
        self.log("proxy-mesh-403", response,
                 "handle 403 proxy-mesh [%s] [%s] [%s] [%s]" % (is_banned, response, request, domain),
                 level="DEBUG")
        if is_banned or by_counting:
            self.record_bad_ip(domain, ip)
            self.stats.inc_value('%s/proxy_domain_banned' % self.proxy_name)
            avoid_ips = self.avoid_ips(domain)
            if is_banned:
                self.log("proxy-banned", response,
                         "status [%d]: IP -%s- banned by host with message [%s]" % (response.status, ip, response.body),
                         level="WARNING")
                if len(avoid_ips) > 8:
                    # Too many exits burned on this domain: rotate the gateway.
                    self.get_next_proxy()
                if "retry" in request.meta:
                    self.stats.inc_value('%s/retry_proxy_domain_fail' % self.proxy_name)
                else:
                    self.stats.inc_value('%s/retry_proxy_domain' % self.proxy_name)
                # BUGFIX: pop() had no default, raising KeyError when the
                # header was absent (handle_502 already uses pop(..., None)).
                request.headers.pop("X-ProxyMesh-Not-IP", None)
                return self.set_retry(request, response, False)
            else:
                self.log("proxy-banned", response,
                         "status [%d]: IP -%s- possibly banned by host with message [%s]" % (response.status, ip, response.body),
                         level="WARNING")
                self.filter_ips(request, domain)
                if "retry" in request.meta:
                    self.stats.inc_value('%s/retry_proxy_domain_fail' % self.proxy_name)
                else:
                    self.stats.inc_value('%s/retry_proxy_domain' % self.proxy_name)
                return self.set_retry(request, response, False)
        else:
            del request.meta["proxy-handler"]
            return response

    def handle_405(self, request, response, spider):
        """405: try the same URL again over HTTPS if a proxy supports it."""
        # BUGFIX: pattern was "http\:" (invalid escape) and unanchored, which
        # could also rewrite "http:" occurring later in the URL; anchor the
        # substitution to the scheme only.
        https_url = re.sub(r"^http:", "https:", response.url, count=1)
        if self.get_proxy(https_url):
            # NOTE(review): Scrapy Request.url is normally read-only
            # (request.replace(url=...)) — confirm this assignment works with
            # the Scrapy version in use.
            request.url = https_url
            return self.set_retry(request, response)
        del request.meta["proxy-handler"]
        return response

    def handle_error(self, request, response, spider):
        # Fallback: give up on proxy handling and pass the response through.
        del request.meta["proxy-handler"]
        return response