Example #1
	def normalize(self, scraper):
		""" normalize this req with using the provided scraper's config """
		if self.is_normalized:
			return self

		self.scraper = scraper  # keep a reference to the owning scraper

		# copy scraper-wide options that are not already set on this request
		self.options = common.combine_dicts(scraper.config, self.options)

		req = self  # local alias used for the option lookups below

		self.url = common.normalize_url(self.url)
		# self.url = str(self.url)

		accept_error_codes = req.get('accept_error_codes')
		if accept_error_codes is None:
			accept_error_codes = []
			req.set('accept_error_codes', accept_error_codes)

		# default headers
		user_agent = req.get('user_agent', agent.firefox)  # default user agent is Firefox
		
		if user_agent == 'random':
			user_agent = agent.random_agent()

		headers = {
			"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"User-Agent": user_agent,
			"Accept-Language": "en-us,en;q=0.5",
			"Accept-Encoding": "gzip, deflate",         
			# "Connection": "close" #turn off keep-alive
			"Connection": "keep-alive"
		}

		if req.post:
			# POST requests get a form-encoded body by default
			headers["Content-Type"] = "application/x-www-form-urlencoded"
			
		# apply user-supplied headers
		if req.get('headers'):
			if req.get('merge_headers') is not False:
				# merge user-defined headers into the defaults (user values win)
				headers.update(req.get('headers'))
			else:
				# use only the user-defined headers, discarding the defaults
				headers = req.get('headers')


		req.set('headers', headers)
			
		proxy = req.get('proxy') or scraper.proxy_manager.get_proxy(req.url)
		if proxy and req.get('proxy_url_filter'):
			# use the proxy only when the url matches the filter pattern
			if not re.compile(req.get('proxy_url_filter')).findall(req.url):
				# no match: send this request without a proxy
				proxy = ''
				logger.debug('proxy not used for url: %s', req.url)

		req.set('proxy', proxy)
		
		
		# normalize the post data into a url-encoded string (Python 2 urllib)
		if req.post and isinstance(req.post, common.MyDict):
			req.post = req.post.dict()
		if req.post and isinstance(req.post, dict):
			# sorted() keeps the encoded body deterministic
			req.post = urllib.urlencode(sorted(req.post.items()))

		self.is_normalized = True

		return self
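
The header handling is the part worth a second look: a set of default headers is built first, and the caller's own headers are either merged into those defaults or replace them outright, depending on merge_headers. The following is a minimal, self-contained sketch of just that decision, using plain dicts; the user-supplied values are made-up examples, not anything from the library.

# Standalone sketch of the merge_headers behaviour used in normalize().
# default_headers mirrors the defaults built above; user_headers and
# merge_headers stand in for hypothetical caller-supplied options.
default_headers = {
	"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	"User-Agent": "Mozilla/5.0 (example)",
	"Accept-Language": "en-us,en;q=0.5",
	"Accept-Encoding": "gzip, deflate",
	"Connection": "keep-alive",
}

user_headers = {"User-Agent": "my-custom-agent", "X-Token": "abc"}
merge_headers = True  # anything other than False means "merge"

if merge_headers is not False:
	headers = dict(default_headers)
	headers.update(user_headers)  # user values win on conflicting keys
else:
	headers = user_headers        # defaults are discarded entirely

print(headers["User-Agent"])      # -> my-custom-agent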
Example #2
	def normalize(self, scraper):
		""" normalize this req with using the provided scraper's config """
		if self.is_normalized:
			return self

		# copy scraper-wide options that are not already set on this request
		self.options = common.combine_dicts(scraper.config, self.options)

		req = self  # local alias used for the option lookups below

		self.url = common.normalize_url(self.url)
		# self.url = str(self.url)

		accept_error_codes = req.get('accept_error_codes')
		if accept_error_codes is None:
			accept_error_codes = []
			req.set('accept_error_codes', accept_error_codes)

		# default headers
		user_agent = req.get('user_agent', agent.firefox)  # default user agent is Firefox
		
		if user_agent == 'random':
			user_agent = agent.random_agent()

		headers = {
			"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"User-Agent": user_agent,
			"Accept-Language": "en-us,en;q=0.5",
			"Accept-Encoding": "gzip, deflate",			
			# "Connection": "close" #turn off keep-alive
			"Connection": "keep-alive"
		}

		if req.post:
			# POST requests get a form-encoded body by default
			headers["Content-Type"] = "application/x-www-form-urlencoded"
			
		# apply user-supplied headers
		if req.get('headers'):
			if req.get('merge_headers') is not False:
				# merge user-defined headers into the defaults (user values win)
				headers.update(req.get('headers'))
			else:
				# use only the user-defined headers, discarding the defaults
				headers = req.get('headers')


		req.set('headers', headers)
			
		proxy = req.get('proxy') or scraper.proxy_manager.get_proxy(req.url)
		if proxy and req.get('proxy_url_filter'):
			# use the proxy only when the url matches the filter pattern
			if not re.compile(req.get('proxy_url_filter')).findall(req.url):
				# no match: send this request without a proxy
				proxy = ''
				logger.debug('proxy not used for url: %s', req.url)

		req.set('proxy', proxy)
		
		
		# normalize the post data into a url-encoded string (Python 2 urllib)
		if req.post and isinstance(req.post, common.MyDict):
			req.post = req.post.dict()
		if req.post and isinstance(req.post, dict):
			# sorted() keeps the encoded body deterministic
			req.post = urllib.urlencode(sorted(req.post.items()))

		self.is_normalized = True
		
		return self
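
The proxy_url_filter check is easy to misread: a proxy obtained from the proxy manager is kept only when the filter regex matches the request URL; otherwise the proxy is cleared and the request is sent directly. A small standalone sketch of that logic, with a hypothetical pattern, proxy address, and URLs:

import re

# Hypothetical filter: only route /products/ pages through the proxy.
proxy_url_filter = r'/products/'
proxy = 'http://127.0.0.1:8080'

for url in ('http://example.com/products/1', 'http://example.com/about'):
	if proxy and proxy_url_filter and not re.compile(proxy_url_filter).findall(url):
		effective_proxy = ''  # filter did not match: no proxy for this url
	else:
		effective_proxy = proxy
	print('%s -> %s' % (url, effective_proxy or 'direct'))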