Ejemplo n.º 1
0
	def retrive_word(self, word):
		"""Return the DocIDs of every posting stored for ``word``.

		Element 0 of each posting in ``self.word_dictionary[word]`` is taken
		as the DocID and converted to ``int``.

		Args:
			word: key to look up in ``self.word_dictionary``.

		Returns:
			A list of ``int`` DocIDs, in the order the postings are stored.

		Raises:
			KeyError: propagated from the index lookup when ``word`` is not
				a key of ``self.word_dictionary`` (for a plain dict).
		"""
		# The DocIDs come straight from the in-memory index, so no database
		# round-trip is needed to build the result. The loop variable is
		# named ``posting`` so it does not shadow the ``word`` parameter.
		return [int(posting[0]) for posting in self.word_dictionary[word]]
Ejemplo n.º 2
0
	def caculate_BM25(self, query_words, b=0.5, k=10, avdl=800, total_docs=100):
		"""Rank documents against a query with a BM25-style score and print them.

		Candidate documents are all documents that appear in the posting list
		of at least one query term. For each candidate the per-term scores are
		summed, the candidates are sorted by descending score, and the value
		returned by ``self.DocID2Doc`` is printed for each one.

		Args:
			query_words: query string; terms are separated by single spaces.
			b: tuning factor used in the length normaliser.
			k: tuning factor used for term-frequency saturation.
			avdl: average document length used in the length normaliser.
			total_docs: numerator of the idf ratio (presumably the number of
				indexed documents -- confirm against the indexer).

		Returns:
			None. The ranking is written to standard output.
		"""
		# Keep only the terms that are indexed with a non-empty posting list.
		# Unknown terms (and the empty strings produced by repeated spaces)
		# would otherwise raise KeyError / ZeroDivisionError while scoring.
		terms = [
			term for term in query_words.split(' ')
			if term in self.word_dictionary and self.word_dictionary[term]
		]
		if not terms:
			# Nothing can match, so there is nothing to score or print.
			return

		collection = documentManager().connect_mongo()

		# All documents in which at least one query term appears.
		candidate_ids = set()
		for term in terms:
			for posting in self.word_dictionary[term]:
				candidate_ids.add(posting[0])

		score_dictionary = {}
		for doc_id in candidate_ids:
			# Fetch the document once per candidate, not once per query term.
			content = collection.find_one({"DocID": int(doc_id)})["content"]

			bm25_score = 0
			for term in terms:
				freq = self.get_wordcount_in_document(term, content)

				# Number of postings stored for the term.
				doc_len = len(self.word_dictionary[term])
				idf = math.log(float(total_docs) / doc_len)
				# float() keeps the ratio from being floored to an integer
				# under Python 2 division.
				# NOTE(review): the normaliser uses the posting count, not the
				# length of the document being scored -- confirm this is
				# intended before relying on the absolute score values.
				normalizer = 1 - b + b * (float(doc_len) / avdl)

				bm25_score += float((k + 1) * freq) / (freq + k * normalizer) * idf
			# BM25 score of this document for the whole query.
			score_dictionary[doc_id] = bm25_score

		ranking = sorted(score_dictionary.items(), key=lambda item: item[1], reverse=True)

		for ranked in ranking:
			print(self.DocID2Doc(int(ranked[0])))
Ejemplo n.º 3
0
	def process_all_documents(self, first_doc_id=1, last_doc_id=100):
		"""Pass the content of every document in a DocID range to ``count_words``.

		For each DocID from ``first_doc_id`` to ``last_doc_id`` (inclusive) the
		matching document is looked up and ``self.count_words`` is called with
		its "content" field and the DocID.

		Args:
			first_doc_id: first DocID to process (default 1).
			last_doc_id: last DocID to process, inclusive (default 100).
		"""
		collection = documentManager().connect_mongo()
		for doc_id in range(first_doc_id, last_doc_id + 1):
			document = collection.find_one({"DocID": doc_id})
			if document is None:
				# Assumes pymongo-style find_one, which yields None when no
				# document matches: skip gaps in the DocID range instead of
				# failing with a TypeError on None.
				continue
			self.count_words(document["content"], doc_id)
Ejemplo n.º 4
0
 def process_all_documents(self, first_doc_id=1, last_doc_id=100):
     """Pass the content of every document in a DocID range to ``count_words``.

     For each DocID from ``first_doc_id`` to ``last_doc_id`` (inclusive) the
     matching document is looked up and ``self.count_words`` is called with
     its "content" field and the DocID.

     Args:
         first_doc_id: first DocID to process (default 1).
         last_doc_id: last DocID to process, inclusive (default 100).
     """
     collection = documentManager().connect_mongo()
     for doc_id in range(first_doc_id, last_doc_id + 1):
         document = collection.find_one({"DocID": doc_id})
         if document is None:
             # Assumes pymongo-style find_one, which yields None when no
             # document matches: skip gaps in the DocID range instead of
             # failing with a TypeError on None.
             continue
         self.count_words(document["content"], doc_id)
Ejemplo n.º 5
0
	def DocID2Doc(self, DocID):
		"""Return the "url" field of the document whose "DocID" equals ``DocID``.

		Args:
			DocID: value matched against the "DocID" field of the collection.

		Returns:
			The value stored under "url" in the matching document.
		"""
		documents = documentManager().connect_mongo()
		matching_document = documents.find_one({"DocID": DocID})
		return matching_document["url"]