class Engine(object):
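	"""Crawling engine.

	Wires together the Downloader and Parser worker pools, the two SafeQueue
	buffers that feed them, the URL-filtering strategy handlers, and the mysql
	status reporter. Two checker threads move tasks from the queues into the
	worker pools; finish_download and finish_parse are the callbacks that push
	each page on to the next stage of the pipeline.
	"""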
	def __init__( self):
		self._istart		= False
		self._status		= Status()

		"""--- load config file----"""
		self._config 		= Configuration()
	
		"""--- core object ----"""
		self._downloader	= None
		self._parser		= None

		"""--- memory models --- """
		self._download_pool	= SafeQueue() #Store the html objects to be downloaded by the downloader
		self._parse_pool	= SafeQueue() #Store the html objects to be parsed by the parser
		
		"""--- checker threads --- """
		"""The target is the function passed in to 
		run in the thread. Those two threads keep checking 
		and assigning jobs to the two thread pools"""
		self._downloader_pool_checker = Thread( target=self.download_pool_checker)
		self._parse_pool_checker = Thread( target=self.parse_pool_checker)
		
		"""---  threads --- """
		self._status_update = Thread( target=self.status_update) #every second, this thread posts runtime info to the remote mysql

		""" ---strategies--- """
		self._earlyvisithandler	=	EarlyVisitHandler()
		self._robothandler  	=	RobotHandler()
		self._cgihandler		=	CGIHandler()
		self._nestlevelhandler 	=	NestLevelHandler()
		self._schemehandler    	=	SchemeHandler()
		self._filetypehandler	=	FileTypeHandler()
		self._bookmarkhandler	=	BookMarkHandler()
		self._omitindex			=	OmitIndex()
		self._urlextender		=	URLExtender()			
	
		""" ---init the path for saving data, if the folder don't exist, create it ---"""
		self._path			= self._config._down_path+"/"+ strftime('%Y-%m-%d', localtime())+"/"+ strftime('%H-%M-%S', localtime())+"/"
		if not os.path.exists(self._path):
			os.makedirs(self._path)

		self._config._down_path = self._path
		
		self._keywords_links= []

		""" ---Mysql Manager--- """
		self.sqlex      = DatabseManager(self._config)

		#self.f= open("data.txt", 'w')

	def load_seeds(self):
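		"""Fetch seed URLs from a Google search over the configured keywords,
		filter them through the strategy handlers, and fill the download pool
		with up to _result_num accepted seeds."""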
		contacter = SearchGoogle(self._config._keywords, self._config._result_num)
		self._keywords_links = contacter.getURLs()
		#append the seeds returned by the google search to the download pool
		#self._keywords_links.insert(0, "https://twitter.com/")
		#self._keywords_links.insert(0, "https://twitter.com/signup?context=login")
		
		i = 0	#counts accepted seeds only; filtered links don't advance it
		for url in self._keywords_links:
			if i < self._config._result_num:
				html_task = Html(url)

				if(self._schemehandler.SchemeChecker(html_task)==False):
					#ignore links with an unsupported scheme
					self._status._scheme+=1
					continue
				if(self._bookmarkhandler.BookMarkChecker(html_task)==True):
					#ignore bookmark links
					self._status._bookmark+=1
					continue
				if(self._cgihandler.FindCGI(html_task)==True):
					#ignore links that contain a cgi query
					self._status._cgi+=1
					continue
				if(self._nestlevelhandler.checknestlevel(html_task,self._config._parser_nlv)==True):
					#ignore links nested deeper than the configured level
					self._status._nestlv +=1
					continue
				if(self._filetypehandler.FileTypeChecker(html_task)==False):
					#ignore links with an unwanted file type
					self._status._file_type +=1
					continue
				'''disabled here; finish_parse performs the early-visit check
				if(self._earlyvisithandler.check_visited(html_task) == True):
					self._status._early_visit +=1
					continue
				'''
				self._omitindex.Omit(html_task)
				"""disabled here; finish_parse performs the robots.txt check
				if(self._robothandler.is_allowed(html_task) == False):
					self._status._robot +=1
					continue
				"""
				self._earlyvisithandler.add_entry(html_task._md5, html_task)
				self._download_pool.append(html_task)
				'''If the following two lines of code are enabled, the program
				won't keep running, which shows the revisit check works;
				the dict would, however, need a lock to be thread-safe.'''
				#self._visited_dic[html_task._md5] = html_task._url 
				#print(len(self._visited_dic))
			else:
				break
			i+=1

	def show_welcome(self):
		print("download folder:"+self._path)
		print "key words:"+self._config._keywords
		print "donload thread num: {0}".format(self._config._down_num)
		print "parse thread num: {0}".format(self._config._parser_num)
		print "Load " +str(self._config._result_num)+" results from google search:"
		
		i = 0
		for url in self._keywords_links:
			if i < self._config._result_num:
				print ("[{0}]".format(i)+url)
			i+=1
		print "\n------------------------------------------------------------------------\n"

		#raw_input("press any key to start crawling, press second key to stop")
	
	def wait_for_start(self):
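		"""Block until the remote mysql flag says the crawl should start."""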
		print "ready for start....."
		print "go to http://dengxu.me/crawling/ to input some key words & see the result "

		while( self.sqlex.read_if_start(self._config)!= True):
			sleep(1)
		print "\n------------------------------------------------------------------------\n"
		print "starting crawling engine...."


	def start(self):
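		"""Wait for the remote start signal, load the seeds, then launch the
		downloader/parser pools, the two checker threads, and the status
		updater."""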
		try:
			self.wait_for_start()

			self._istart = True
			
			"""load seed """
			self.load_seeds()	#load seeds from google search 

			
			"""show welcome info"""
			self.show_welcome()
			self._status._sys_start	= time()

			"""start threads"""
			self._downloader = Downloader( self._config._down_num, self._status)
			self._downloader.start()
			self._parser     = Parser(self._config._parser_num, self._status )
			self._parser.start()
			self._downloader_pool_checker.start()
			self._parse_pool_checker.start()
			self._status_update.start()


			"""notify mysql, i am started"""
			self.sqlex.write_if_start()
			
		except Exception:
			Log().debug("start failed")
			raise

		
		
	def stop(self):
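		"""Clear both pools, stop the worker pools, and join the helper threads."""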
		self._istart = False
		""""clear download and parse popl"""
		self._download_pool.clear()
		self._parse_pool.clear()

		"""stop downloader and parser threads"""
		self._downloader.stop()
		self._parser.stop()
		""""Those two checker threads will end when the thread who calls them ends"""
		self._downloader_pool_checker.join()
		self._parse_pool_checker.join()
		self._status_update.join()
		print ("Engine is stopping")

	def pause(self):
		pass

	def finish_download(self, html_task):
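		"""Downloader callback: invoked when a page has been fetched. Builds a
		log line, computes the save path (the actual file write is currently
		commented out), and queues the same Html object for parsing."""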
		sentence = "Downloaded:[No.{0}] time:{1:0.1f} page:depth_parent {2}_{3} http-code: {4} data-size: {5} bytes url: {6}"\
			.format(self._status._download_times, time()-self._status._sys_start, html_task._depth,\
			html_task._parent, html_task._return_code, html_task._data_size, html_task._url)

		#if self._status._download_times <= 500 :
		#	self.f.write(sentence+"\n")
			


		"""caculate the path for saving files"""
		full_path = self._path+"[No.{0}]_".format(self._status._download_times)+".html"

		"""save html data to files"""
		#f= open(full_path, 'w')
		#f.write(html_task._data)
		#f.close()


		"""After downloading, pass the data(still using the html objects) to the parse pool"""
		self._parse_pool.append(html_task)




	def finish_parse(self, html_task):
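		"""Parser callback: for each Html task produced by the parser, drop it
		if it was visited before or is blocked by robots.txt; otherwise record
		it and queue it for download."""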
		'''
		print("parsed:[No.{0}] time:{1:0.1f} page:depth_parent {2}_{3} http-status: {4} data-size: {5}byes url:{6}"\
			.format(self._status._download_times,time()-self._status._sys_start,html_task._depth,\
		html_task._parent,html_task._return_code, html_task._data_size, html_task._url))
		'''
		"""After parsing, pass the urls to be downloaded to the download pool"""
		if(self._earlyvisithandler.check_visited(html_task) == True):
			#print("Ingore the link visited before, this link is within page {0} , so don't put it in queue".format(html_task._parent), html_task._url)
			self._status._early_visit +=1
			return
		if(self._robothandler.is_allowed(html_task) == False):
			#print("Blocked by the Robot.txt, this link is within page {0} , so don't download".format(html_task._parent), html_task._url)
			self._status._robot +=1
			return
		
		self._earlyvisithandler.add_entry(html_task._md5, html_task)
		self._download_pool.append(html_task)
		




	def download_pool_checker(self):
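		"""Runs in its own thread while the engine is started: pops Html tasks
		off the download pool and hands them to the downloader thread pool,
		passing finish_download as the completion callback."""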
		while (self._istart == True):
			new_download_task = self._download_pool.pop_left()
			if (new_download_task == None):
				#no task remaining in the download pool, back off briefly
				sleep(0.1)
			else:
				#finish_download is passed as a callback so the engine gets the
				#result back and can feed it into the parse pool
				self._downloader.queue_download_task(new_download_task , self.finish_download)


	def parse_pool_checker(self):
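		"""Runs in its own thread while the engine is started: pops Html tasks
		off the parse pool and hands them to the parser thread pool, passing
		finish_parse as the completion callback."""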
		while (self._istart == True):
			new_parse_task = self._parse_pool.pop_left()
			if (new_parse_task == None):
				#no task remaining in the parse pool, back off briefly
				sleep(0.1)
			else:
				self._parser.queue_parse_task(new_parse_task, self.finish_parse)





	#~~~see result at http://dengxu.me/crawling/
	def status_update(self):
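		"""Once per second, print a status line and push runtime statistics and
		recently downloaded URLs to the remote mysql
		(results shown at http://dengxu.me/crawling/)."""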

		while (self._istart == True):

			self._status._download_queue = self._downloader.len()
			self._status._parse_queue = self._parser.len()
			
			
			sentence = "[time: {0:0.1f}],queue:{8}, down: {1}, total: {2:0.1f}MB | queue:{9}, parsed: {3},scheme:{10}, cig: {4}, bookmark: {11} type {12} visited: {5}, robot: {6},nestlv: {7} | error: 404: {13} , timeout: {14}"\
			.format( time()-self._status._sys_start,\
		 	self._status._download_times, float(self._status._download_size)/1024/1024, self._status._parse_times\
		 	,self._status._cgi, self._status._early_visit, self._status._robot, self._status._nestlv\
		 	,self._downloader.len(), self._parser.len(),self._status._scheme_type, self._status._bookmark, self._status._file_type\
		 	,self._status._404,self._status._socket_timeout)
			
			print sentence

			#if( self._status._download_times > 500):
			#	self.f.write( sentence+"\n")
			

			"""update status tp mysql"""
			self.sqlex.write_status(self._status)
			
			"""update recent download url"""
			self.sqlex.write_recent_download(self._status)
			
			sleep(1)
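
if __name__ == "__main__":
	#Minimal usage sketch: assumes the module is run directly, that the time
	#imports above provide sleep(), and that KeyboardInterrupt (Ctrl-C) is an
	#acceptable way to request shutdown.
	engine = Engine()
	try:
		engine.start()	#blocks in wait_for_start() until the remote mysql flag is set
		while engine._istart:
			sleep(1)	#main thread idles while the worker threads crawl
	except KeyboardInterrupt:
		engine.stop()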