Code Example #1
File: fetcher.py  Project: mattgu74/ic05
class Fetcher(threading.Thread):
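	"""
	Crawler worker thread: pulls {'url': ..., 'depth': ...} items from queue_in,
	downloads and parses each page, pushes nodes/edges to the Gephi APIs and
	links to the database, and puts newly discovered links on queue_out.
	"""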
	def __init__(self, robot, queue_in, queue_out, max_depth, *, db_host, db_name, db_async=False, db_location='local'):
		threading.Thread.__init__(self, name="Fetcher-%s"%id(self))
		self.robot = robot
		self.queue_in = queue_in
		self.queue_out = queue_out
		self.max_depth = max_depth
		
		self.urlhandler = UrlHandler(robot=self.robot, max_tries=5)

		self.gephiAPI = []
		for host,port in GEPHI_HOSTS:
			self.gephiAPI.append(GephiAPI(host, port))

		if db_location=='local':
			self.dbAPI = MongodbAPI(db_host, db_name)
		else:
			self.dbAPI = OpaAPI(db_host)

		self.e_stop = threading.Event()

		self._is_working = threading.Event()

		# statistics
		self.nb_opened = 0
		self.nb_saved = 0
		self.stats = {
			'process': [0,0],
			'urlhandler': [0,0],
			'extractor': [0,0],
			'gephi': [0,0],
			'db': [0,0]
		}

		# current url
		self.url = ""

		# should the db API send commands without waiting for the answer
		self.db_async = db_async

	def stop(self):
		self.e_stop.set()
		self.dbAPI.stop()

	def is_working(self):
		return self._is_working.is_set()

	def wait_free(self, timeout=None):
		self._is_working.wait(timeout)
	
	def run(self):
		while not self.e_stop.is_set():
			try:
				params = self.queue_in.get(True, 0.5)
			except Exception:
				# queue_in is empty: keep polling until e_stop is set
				pass
			else:
				time.sleep(2)	# throttle between two fetches
				url = params['url']
				depth = params['depth']
				self.exc_and_get_stats(
					name="process",
					target=self._process,
					args=(url,depth)
				)

	def _process(self,url,depth):
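		# full pipeline for one URL: visit check, download, extraction, then dispatch of the results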
		self._is_working.set()
		if self.url_need_a_visit(url):
			html = self.exc_and_get_stats(
					name="urlhandler",
					target=self.get_html,
					args=(url,)
				)
			if html:
				self.nb_opened += 1
				extractor = self.exc_and_get_stats(
					name='extractor',
					target=self.extract,
					args=(html, url)
				)
				if extractor:
					links = extractor.links
					keywords = extractor.keywords
					self.process_result(depth+1, url, links, keywords)
		else:
			print("pas besoin de visiter", url)
		self._is_working.clear()

	def process_result(self, depth, url, links, keywords):
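		# fan the extraction results out to Gephi, to the database and to queue_out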
		# gephi
		self.exc_and_get_stats(
			name='gephi',
			target=self.process_result_gephi,
			args=(url, links, keywords)
		)
		# db
		self.exc_and_get_stats(
			name='db',
			target=self.process_result_db,
			args=(url, links, keywords)
		)
		# queue
		if isinstance(self.dbAPI, OpaAPI):
			if self.queue_out.qsize() < 10:
				to_visit = self.dbAPI.get_urls_to_visit(10)
				for link in to_visit:
					result = {'url':link, 'depth':depth}
					self.queue_out.put(result)
		elif depth < self.max_depth:
			for link in links:
				result = {'url':link, 'depth':depth}
				self.queue_out.put(result)

	def process_result_gephi(self, url, links, keywords):
		for api in self.gephiAPI:
			color = {'r':0.0, 'g':1.0, 'b':1.0}
			api.add_node(url, **color)
			api.add_nodes(map(lambda l: (l,color), links))
			api.add_edges(map(lambda l: (url, l, {}), links))

	def process_result_db(self, url, links, keywords):
		lprint("SAVE", url)
		self.nb_saved += 1
		r = self.dbAPI.add_links(url=url, links=links, safe=(not self.db_async))
		#lprint(r)

	def get_html(self, url):
		"""
		Récupérer le contenu d'une page
		"""
		self.url = url		# pour savoir quelle url est entrain d'évaluer le fetcher
		try:
			stream = self.urlhandler.open(url, None)
		except ExceptionUrlForbid as ex:
			lprint("ERROR", ex)
		except ExceptionMaxTries as ex:
			lprint("ERROR", ex)
		except Exception as ex:
			lprint(url, ex, "\n"+get_traceback())
		else:
			lprint("OPENED", url)
			html = ""
			try:
				html = stream.read().decode(errors='replace')
			except Exception as ex:
				lprint(get_traceback(), "\n", url, ex)
			finally:
				stream.close()	# close the stream even when read()/decode() fails
			return html

	def extract(self, html, url):
		"""
		Extraires les choses importantes d'une page (liens, mots clefs, ...)
		"""
		try:
			extractor = Extractor(url, html)
		except Exception as ex:
			lprint("ERROR", self.__class__.__name__, "extract :", ex, url, "\n"+get_traceback())
		else:
			return extractor
		
	def url_need_a_visit(self, url):
		#return True
		p = urllib.parse.urlparse(url)
		if p.scheme in ('http','https'):
			return self.dbAPI.url_need_a_visit(url)
		else:
			return False			

	def exc_and_get_stats(self, *, name, target, args=(), kwargs={}):
		"""
		Run target(*args, **kwargs), add the elapsed time and the call count
		to self.stats[name], and return the target's result.
		"""
		start = time.time()
		r = target(*args, **kwargs)
		elapsed = time.time() - start
		self.stats[name][0] += elapsed
		self.stats[name][1] += 1
		return r
		
	def __line(self, name, val, tab):
		s_tab = "\t"*tab
		return "{:.<40}{}\n".format(s_tab+name,val)
		
	def __repr__(self):
		s = "{:*^60}\n".format(self.name)
		s += self.__line("Working", self.is_working(), 0)
		s += self.__line("CurrentUrl", self.url, 0)
		s += self.__line("TotOpened", self.nb_opened, 0)
		s += self.__line("TotSaved", self.nb_saved, 0)
		s += "Stats:\n"
		for k, (t,n) in self.stats.items():
			if n:
				val = "{:.3f}s ({}iters)".format(t/n, n)
				s += self.__line(k, val, 1)
		return s
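
Below is a minimal driver sketch for the Fetcher class above. It is not part of the original project: the robot object, the database host and name, and the seed URL are placeholders, and only the constructor signature, the {'url', 'depth'} queue item format and the stop()/__repr__ methods are taken from the code shown here.

import queue

# placeholders: the real project supplies its own robots.txt handler and db settings
robot = None               # assumed: whatever UrlHandler(robot=...) expects
q_in, q_out = queue.Queue(), queue.Queue()

fetcher = Fetcher(
	robot, q_in, q_out, 3,             # max_depth = 3
	db_host='localhost',               # assumed local MongoDB (db_location defaults to 'local')
	db_name='crawler',                 # placeholder database name
	db_async=False,
)
fetcher.start()

# seed the crawl; run() expects dicts with 'url' and 'depth' keys
q_in.put({'url': 'http://example.com/', 'depth': 0})

# ... a dispatcher would normally move discovered links from q_out back to q_in ...

fetcher.stop()         # sets e_stop and stops the db API
fetcher.join()
print(fetcher)         # __repr__ prints the per-stage timing statistics

Because is_working() and the stats dictionary are exposed, a supervising thread can monitor progress during the crawl simply by printing the fetcher periodically.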