Example
import re
from time import sleep
try:
	from urllib.parse import urlparse  # Python 3
except ImportError:
	from urlparse import urlparse  # Python 2

from scrapy import signals
from scrapy.exceptions import NotConfigured

# ProxyPoolMiddlewareBase, ProxyAuth and add_downloadmw_to_crawler are
# project-specific helpers (not part of Scrapy); import them from wherever
# they live in your project.


class ProxyMeshMiddleware(ProxyPoolMiddlewareBase):

	proxy_name = "proxy-mesh"

	def __init__(self, settings, stats, crawler):
		super(ProxyMeshMiddleware, self).__init__(settings, stats, crawler)
		self.proxies = ProxyAuth(settings, crawler)
		self.resp_headers = {}
		self.prev_prox = None
		self.authorized = set()
		self.getting_new = None
		self.status_509_count = 0  # consecutive 509 (bandwidth limit) responses
		self.proxies.next_proxy()

	def process_response(self, request, response, spider):
		if "retry" in request.meta:
			self.stats.inc_value('%s/retry_proxy_domain_success' % self.proxy_name)
		return super(ProxyMeshMiddleware, self)._process_response(request, response, spider)

	def record_good_ip(self, domain, ip):
		self.stats.inc_value('%s/success' % self.proxy_name)
		super(ProxyMeshMiddleware, self).record_good_ip(domain, ip)

	@classmethod
	def from_crawler(cls, crawler):
		obj = cls(crawler.settings, crawler.stats, crawler)
		# Wire both the middleware and its ProxyAuth helper into the spider
		# lifecycle signals.
		crawler.signals.connect(obj.spider_opened, signals.spider_opened)
		crawler.signals.connect(obj.spider_closed, signals.spider_closed)
		crawler.signals.connect(obj.proxies.spider_opened, signals.spider_opened)
		crawler.signals.connect(obj.proxies.spider_closed, signals.spider_closed)
		crawler.proxies = obj
		return add_downloadmw_to_crawler(obj, crawler)

	def _sanitize_headers(self, request):
		# No-op: ProxyMesh requests need no extra header sanitization here.
		return request

	def get_next_proxy(self, prox_set=None, authorized=None):
		self.proxies.next_proxy()
		# Keep the old per-domain stats around, then reset them for the new proxy.
		self._old_stats = (self.bad_ips, self.good_ips)
		for domain in self.domains:
			self.good_ips[domain] = {}
			self.bad_ips[domain] = {}

	def spider_closed(self, spider):
		super(ProxyMeshMiddleware, self).spider_closed(spider)
		self.stats.set_value('%s/good_ips/%s' % (self.proxy_name, spider.name), self.good_ips)
		self.stats.set_value('%s/bad_ips/%s' % (self.proxy_name, spider.name), self.bad_ips)

	def get_proxy(self, url, prev=False, encode=True):
		protocol = urlparse(url).scheme or "http"
		cprox = self.proxies.current_proxy
		if encode:
			auth = self.proxies.basic_auth
		else:
			auth = self.proxies.user_pass
		# Prepend the scheme if the proxy address does not already include it.
		if not cprox.get(protocol).startswith(protocol):
			return (protocol + "://" + cprox.get(protocol), auth)
		else:
			return (cprox.get(protocol), auth)

	def filter_ips(self, request, domain):
		# Tell ProxyMesh which IPs to avoid for this domain.
		avoid_ips = self.avoid_ips(domain)
		if len(avoid_ips):
			request.headers["X-ProxyMesh-Not-IP"] = ",".join(avoid_ips)

	def set_proxy(self, request, domain):
		super(ProxyMeshMiddleware, self).set_proxy(request, domain)
		self.filter_ips(request, domain)

	def is_proxy_error(self, response):
		"""
		ProxyMesh sets a simple header that tells us whether the proxy itself
		is the source of the error status code.
		"""
		return 'X-ProxyMesh-Error' in response.headers
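
	# For reference: ProxyMesh signals proxy-originated failures with the
	# X-ProxyMesh-Error header checked above. Related headers used elsewhere
	# in this class are X-ProxyMesh-IP, X-ProxyMesh-Not-IP,
	# X-ProxyMesh-Prefer-IP and X-ProxyMesh-IP-Not-Found.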
	
		
	#
	# Scrapy API handlers start here
	#

	def get_ip_used(self, response, request):
		# Scrapy header lookups are case-insensitive.
		return response.headers.get('X-ProxyMesh-IP')

	def handle_500(self, request, response, spider):
		return self.set_retry(request, response)

	def handle_502(self, request, response, spider):
		if not self.is_proxy_error(response):
			raise self.NextHandler

		self.log("proxy-error", response, "No proxies currently available given the constraints provided %s" % (response.headers), level="ERROR")

		domain = self.domain_parser(response.url)

		self.stats.inc_value('%s/fail' % self.proxy_name)

		if "X-ProxyMesh-IP-Not-Found" in response.headers:
			# The pinned IP is gone; prefer a known-good IP and drop the stale
			# IP-pinning headers before retrying.
			request.headers["X-ProxyMesh-Prefer-IP"] = self.get_good_ip(domain)
			if "X-ProxyMesh-IP" in request.headers:
				del request.headers["X-ProxyMesh-IP"]
			request.headers.pop("X-ProxyMesh-Not-IP", None)
			return self.set_retry(request, response, True)
		return response
	
	def handle_503(self, request, response, spider):
		if not self.is_proxy_error(response):
			raise self.NextHandler
		self.stats.inc_value('%s/fail' % self.proxy_name)

		self.log("proxy-error", response, "status [%d]: Proxy service unavailable with message [%s]" % (response.status, response.body), level="ERROR")

		domain = self.domain_parser(response.url)

		if len(self.bad_ips[domain]) >= 10:
			# Prevent repeated rotation calls while a new proxy is being fetched.
			if request.meta["prox_set"] > self.getting_new:
				self.get_next_proxy()
			else:
				sleep(10)
			self.set_proxy(request, domain)
			return self.set_retry(request, response)
		return response


	def handle_509(self, request, response, spider):
		if not self.is_proxy_error(response):
			raise self.NextHandler

		self.stats.inc_value('%s/fail' % self.proxy_name)
		self.status_509_count += 1
		if self.status_509_count > 2:
			self.log("proxy-error", response, "status [%d]: Proxy usage over bandwidth limits with message [%s]" % (response.status, response.body), level="ERROR")
			raise NotConfigured
		return self.set_retry(request, response, True)

	def handle_402(self, request, response, spider):
		if not self.is_proxy_error(response):
			raise self.NextHandler

		prox_set = self.get_proxy(request.url)

		domain = self.domain_parser(response.url)

		self.log("proxy-error", response, "status [%d]: Failed to authorize proxy service with message [%s]" % (response.status, response.body), level="ERROR")

		if prox_set:
			proxy_address, user_pass = prox_set
			if request.meta["proxy"] != proxy_address:
				self.set_proxy(request, domain)
				return self.set_retry(request, response, True)

		# Covers the case where we switched to a new proxy but some requests
		# got through before the engine paused.
		self.log("refresh-proxies", response, "Calling ProxyMesh API to reload proxy list", level="WARNING")
		if self.proxies.proxies:
			self.get_next_proxy()
			self.set_proxy(request, domain)
			return self.set_retry(request, response, True)
		else:
			raise NotConfigured
	
	def handle_407(self, request, response, spider):
		# Proxy authentication failures are handled the same way as 402.
		return self.handle_402(request, response, spider)

	def handle_404(self, request, response, spider):
		return response

	def set_ip(self, request, domain):
		# Pin the request to a known-good IP for this domain when one exists.
		if domain in self.good_ips:
			goodip = self.get_good_ip(domain)
			if goodip:
				request.headers['X-ProxyMesh-IP'] = goodip

	def handle_403(self, request, response, spider):
		ip = self.get_ip_used(response, request)
		is_banned, by_counting = self.check_ban(ip, request, response, spider)
		domain = self.domain_parser(response.url)
		self.log("proxy-mesh-403", response, "handle 403 proxy-mesh [%s] [%s] [%s] [%s]" % (is_banned, response, request, domain), level="DEBUG")
		if is_banned or by_counting:
			self.record_bad_ip(domain, ip)
			self.stats.inc_value('%s/proxy_domain_banned' % self.proxy_name)
			avoid_ips = self.avoid_ips(domain)
			if is_banned:
				self.log("proxy-banned", response, "status [%d]: IP -%s- banned by host with message [%s]" % (response.status, ip, response.body), level="WARNING")
				if len(avoid_ips) > 8:
					# Too many banned IPs on this proxy; rotate to the next one.
					self.get_next_proxy()
					if "retry" in request.meta:
						self.stats.inc_value('%s/retry_proxy_domain_fail' % self.proxy_name)
					else:
						self.stats.inc_value('%s/retry_proxy_domain' % self.proxy_name)
					request.headers.pop("X-ProxyMesh-Not-IP", None)
					return self.set_retry(request, response, False)
			else:
				self.log("proxy-banned", response, "status [%d]: IP -%s- possibly banned by host with message [%s]" % (response.status, ip, response.body), level="WARNING")

			self.filter_ips(request, domain)

			if "retry" in request.meta:
				self.stats.inc_value('%s/retry_proxy_domain_fail' % self.proxy_name)
			else:
				self.stats.inc_value('%s/retry_proxy_domain' % self.proxy_name)
			return self.set_retry(request, response, False)
		else:
			del request.meta["proxy-handler"]
			return response

	def handle_405(self, request, response, spider):
		# Retry the request over HTTPS when the plain-HTTP version is rejected.
		https_url = re.sub(r"^http:", "https:", response.url)

		if self.get_proxy(https_url):
			# Request.url is read-only in Scrapy, so build a replacement request.
			request = request.replace(url=https_url)
			return self.set_retry(request, response)

		del request.meta["proxy-handler"]
		return response

	def handle_error(self, request, response, spider):
		del request.meta["proxy-handler"]
		return response
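
The example depends on ProxyPoolMiddlewareBase, ProxyAuth and add_downloadmw_to_crawler, which come from the surrounding project and are not shown here. Below is a minimal sketch of the dispatch pattern the base class presumably implements (all names and details are assumptions for illustration, not the project's actual code): responses are routed to a handle_<status> method, and a handler raises NextHandler to decline and fall through to the generic handler.

class ProxyPoolMiddlewareBaseSketch(object):
	"""Hypothetical skeleton; illustrates the handler dispatch only."""

	class NextHandler(Exception):
		"""Raised by a handle_<status> method to decline a response."""

	def _process_response(self, request, response, spider):
		handler = getattr(self, "handle_%d" % response.status, None)
		if handler is not None:
			# Mark the response as claimed; handlers delete this key when
			# they decide to pass the response through unchanged.
			request.meta["proxy-handler"] = True
			try:
				return handler(request, response, spider)
			except self.NextHandler:
				pass  # handler declined; fall through
		return self.handle_error(request, response, spider)

	def handle_error(self, request, response, spider):
		request.meta.pop("proxy-handler", None)
		return response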