def retrive_word(self, word):
    """Return the DocIDs of every posting recorded for *word*.

    Args:
        word: Term to look up in ``self.word_dictionary``.

    Returns:
        A list of integer DocIDs, one per posting, in posting-list order.

    Raises:
        KeyError: If *word* is not a key of ``self.word_dictionary``.
    """
    # The first field of a posting is the DocID. The previous version also
    # fetched each document's URL from MongoDB and then threw it away, which
    # cost one database round-trip per posting without affecting the result
    # (and made the lookup fail whenever a document was missing from the
    # collection). Only the in-memory index is needed here.
    return [int(posting[0]) for posting in self.word_dictionary[word]]
def caculate_BM25(self, query_words, b=0.5, k=10, avdl=800, doc_count=100):
    """Rank documents against a query with BM25 and print their URLs.

    Every document that contains at least one indexed query term is scored;
    the URLs of the scored documents are then printed, best score first.

    Args:
        query_words: Query string; terms are separated by single spaces.
        b: Length-normalisation factor (previously hard-coded to 0.5).
        k: Term-frequency saturation factor (previously hard-coded to 10).
        avdl: Average document length (previously hard-coded to 800).
        doc_count: Number of documents in the collection, used for the IDF
            (previously hard-coded to 100).

    Returns:
        None. The ranking is written to standard output, one URL per line.
    """
    manager = documentManager()
    collection = manager.connect_mongo()

    # Only terms that are indexed with at least one posting can contribute.
    # The previous version skipped unknown terms when collecting candidates
    # but not when scoring, so a query mixing known and unknown terms
    # raised KeyError.
    terms = [
        term for term in query_words.split(' ')
        if term in self.word_dictionary and self.word_dictionary[term]
    ]

    # The IDF depends only on the term, so compute it once per term while
    # collecting the candidate documents.
    idf_by_term = {}
    candidate_ids = set()
    for term in terms:
        postings = self.word_dictionary[term]
        # The posting-list length is the term's document frequency.
        idf_by_term[term] = math.log(float(doc_count) / len(postings))
        for posting in postings:
            candidate_ids.add(posting[0])

    score_dictionary = {}
    for doc_id in candidate_ids:
        # One fetch per document instead of one per (document, term) pair.
        content = collection.find_one({"DocID": int(doc_id)})["content"]

        # BM25 normalises by the length of the document being scored. The
        # previous version used the posting-list length here, and integer
        # division truncated the ratio.
        # NOTE(review): length is measured in whitespace-separated tokens;
        # confirm this is the unit avdl was estimated in.
        doc_len = len(content.split())
        normalizer = 1 - b + b * (float(doc_len) / avdl)

        bm25_score = 0.0
        for term in terms:
            freq = self.get_wordcount_in_document(term, content)
            if not freq:
                # An absent term contributes nothing; skipping it also
                # avoids a zero denominator when k or the normaliser is 0.
                continue
            bm25_score += (
                float((k + 1) * freq) / (freq + k * normalizer)
                * idf_by_term[term]
            )
        # BM25 score of this document for the whole query.
        score_dictionary[doc_id] = bm25_score

    ranking = sorted(
        score_dictionary.items(), key=lambda item: item[1], reverse=True
    )
    for doc_id, _score in ranking:
        # Single-argument parenthesised form prints the same text whether
        # print is a statement or a function.
        print(self.DocID2Doc(int(doc_id)))
def process_all_documents(self):
    """Feed the documents with DocID 1 through 100 to ``self.count_words``.

    Each document is fetched from the MongoDB collection by its DocID and
    its ``content`` field is passed to ``count_words`` together with that
    DocID.
    """
    documents = documentManager().connect_mongo()
    first_id, last_id = 1, 100
    for doc_id in range(first_id, last_id + 1):
        record = documents.find_one({"DocID": doc_id})
        self.count_words(record["content"], doc_id)
def DocID2Doc(self, DocID):
    """Return the ``url`` field of the document stored under *DocID*.

    Args:
        DocID: Value matched against the ``DocID`` field of the collection.

    Returns:
        The ``url`` value of the first matching document.
    """
    documents = documentManager().connect_mongo()
    record = documents.find_one({"DocID": DocID})
    return record["url"]