Ejemplo n.º 1
0
	def query(self, q, op='and'):
		""" Query the index and return the matching url ids in ranked order.

		The final score of each url is the weighted sum of its normalized
		content rank score and its normalized page rank score. The weights are
		read from self.config.rankers ('content' and 'page'); a ranker whose
		weight is not positive is skipped entirely.
		Note (from the original author): the content rank scores are computed
		in real time, while the page rank scores are preloaded in the database
		and only fetched here.

		q  -- the raw query string.
		op -- accepted for interface compatibility; this method does not use it.

		Returns a list of url ids ordered by score (highest first when
		self.config.sort is non-zero, lowest first otherwise), or an empty
		list when the query matches nothing. """
		db   = self.config.indexdb
		sort = int(self.config.sort)
		cr_fac = float(self.config.rankers['content'])
		pr_fac = float(self.config.rankers['page'])

		query = SimpleQuery(db)
		words = query.parse_query(q)
		urls  = query.query(q)

		# Nothing matched (also covers a None result): nothing to rank.
		if not urls:
			return []

		# Only rankers with a positive weight contribute to the final score.
		valid_fac   = {}
		valid_score = {}
		if cr_fac > 0:
			cr = ContentRanker(db, sort)
			valid_fac['content'] = cr_fac
			valid_score['content'] = normalize(cr.score(urls, words))

		if pr_fac > 0:
			pr = PageRanker(db, sort)
			valid_fac['page'] = pr_fac
			valid_score['page'] = normalize(pr.score(urls, words))

		scores = {}
		for urlid in urls:
			total = 0.0
			try:
				for key, factor in valid_fac.items():
					total += factor * valid_score[key][urlid]
			except LookupError:
				# A ranker returned no score for this url. Report it and keep
				# whatever partial sum was accumulated, so the url still shows
				# up in the results. Only lookup failures are handled here;
				# any other error is a real bug and must not be hidden.
				ferrmsg("Error: urlid(%s) is not find in the results of each rank." % urlid, \
						'SECore')
			scores[urlid] = total

		# sort is an int flag; make the boolean meaning explicit for sorted().
		ranked = sorted(scores.items(), key=lambda v: v[1], reverse=bool(sort))
		return [urlid for urlid, _ in ranked]
Ejemplo n.º 2
0
	def index(self):
		""" Build the index database from the fetched pages in two passes:
		1, run the Indexer over the original database to index the pages.
		2, run the PageRanker: build the links from the original database,
		   then compute the page rank (per the original note, the scores are
		   stored so that queries can be answered faster)."""
		index_db  = self.config.indexdb
		origin_db = self.config.oridb
		sort_flag = int(self.config.sort)

		# Pass 1: index the pages; drop the indexer before ranking starts.
		page_indexer = Indexer(index_db)
		page_indexer.index(origin_db)
		del page_indexer

		# Pass 2: build the link data, run pagerank, then drop the ranker.
		ranker = PageRanker(index_db, sort_flag)
		ranker.build_links(origin_db)
		ranker.pagerank()
		del ranker