def buildModel(jsonFile, fieldNames, query_str):
    # An iterable cannot be traversed twice, so create two of them.
    t1 = jsonutil.iterCutFieldList(jsonFile, fieldNames)
    t2 = jsonutil.iterCutFieldList(jsonFile, fieldNames)
    # Build the word-to-id dictionary.
    dictionary = corpora.Dictionary(t1)
    dictionary.save(DICTIONARY_PATH)
    # Build the bag-of-words corpus: convert each text from words to (id, count) pairs.
    corpus = [dictionary.doc2bow(text) for text in t2]
    print("bag-of-words size: %i" % len(corpus))
    bm25Model = bm25.BM25(corpus)
    # print("bm25 idf lens: %i " % len(bm25Model.f))
    average_idf = sum(float(bm25Model.idf[k]) for k in bm25Model.idf.keys()) / len(bm25Model.idf.keys())
    query = jiebautil.cutWords(query_str).split()
    query_bow = dictionary.doc2bow(query)
    scores = bm25Model.get_scores(query_bow, average_idf)
    # Print the records with the five highest BM25 scores. Sorting a copy keeps the
    # original line numbers valid; the previous approach of deleting the max from
    # `scores` shifted the indices after the first hit and returned the wrong lines.
    lineRead = LineReader(jsonFile)
    top5 = sorted(enumerate(scores), key=lambda item: -item[1])[:5]
    for idx, score in top5:
        s = lineRead.load(idx + 1)
        j = json.loads(s)
        print(jsonutil.recursive_get(j, fieldNames[0]))
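
# Usage sketch (hypothetical, not from the original source): buildModel() both builds
# the BM25 model and prints the top-5 matches for the query in a single call. The path,
# field names, and query text below are placeholders; the JSON file is assumed to hold
# one JSON record per line, which is what LineReader.load(lineNum) implies.
#
#     buildModel("data/qa.json", ["question", "answer"], "query text to match")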

def search(words):
    """
    Search for texts that contain all of the given keywords.
    :param words: keywords that must all appear in a record
    :return:
    """
    results = jsonutil.iterJsonValue(JSON_FILE, [QUESTION_FIELD, ANSWER_FIELD])
    i = 1
    for result in results:
        s = " ".join(result.values())
        strList = jiebautil.cutWords(s).split()
        if all(w in strList for w in words):
            print(i)
            for k in result:
                # Print the field value with all whitespace removed.
                print("".join(result[k].split()))
            print("\n")
        i += 1
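
# Hypothetical usage (not from the original source): search() expects an iterable of
# already-segmented keywords and prints every record whose question or answer fields
# contain all of them, prefixed with its 1-based record number.
#
#     search(["keyword1", "keyword2"])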

def getValley(self, jsonStr):  # parameter renamed from `str` to avoid shadowing the builtin
    print(jsonStr)
    data = json.loads(jsonStr)
    question = jsonutil.recursive_get(data, self.questionField)
    answer = jsonutil.recursive_get(data, self.answerField)
    cutStr = cutWords(question)
    reservedWords = self.valleyDict.extractTags(question)
    reservedWords, conceptsUsedStr = self.concepts.extractTags(reservedWords)
    self.valleyRecieved.emit(question + answer, cutStr, " ".join(reservedWords),
                             conceptsUsedStr, self.lineReader.currentLine)
    self.saveRunPoint()

def extract_keywords(self, sentence, topK=5):
    # Extract keywords from a sentence by TF-IDF.
    # Segment the sentence into words.
    seg_list = jiebautil.cutWords(sentence).split()
    freq = {}
    for w in seg_list:
        freq[w] = freq.get(w, 0.0) + 1.0  # count term frequencies
    if '' in freq:
        del freq['']
    total = sum(freq.values())  # total number of words
    for k in freq:
        # TF-IDF: term frequency times inverse document frequency,
        # falling back to the mean IDF for unseen words.
        freq[k] *= self.idf_freq.get(k, self.mean_idf) / total
    tags = sorted(freq, key=freq.__getitem__, reverse=True)  # sort by weight, descending
    if topK:
        return tags[:topK]  # return only the topK keywords
    else:
        return tags
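
# Hypothetical, self-contained illustration of the same TF-IDF weighting used in
# extract_keywords() above, with whitespace tokenization standing in for
# jiebautil.cutWords() and a toy IDF table standing in for self.idf_freq / self.mean_idf.
def _tfidf_demo(sentence, idf_freq, mean_idf, topK=3):
    freq = {}
    for w in sentence.split():
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    for k in freq:
        freq[k] *= idf_freq.get(k, mean_idf) / total
    return sorted(freq, key=freq.__getitem__, reverse=True)[:topK]

# Example: "policy" has the highest IDF, so it outranks the more frequent "the":
#     _tfidf_demo("the the policy claim", {"the": 0.1, "policy": 5.0, "claim": 3.0}, 1.0)
#     -> ['policy', 'claim', 'the']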

def querySimString(jsonFile, fieldName, sentence):
    """
    Find the sentences most similar to `sentence`.
    :param sentence:
    :return:
    """
    dictionary = corpora.Dictionary.load(DICTIONARY_PATH)
    lsi = models.LsiModel.load(LSI_MODEL, mmap='r')
    query = jiebautil.cutWords(sentence).split()
    # Convert the words to ids.
    query_bow = dictionary.doc2bow(query)
    query_lsi = lsi[query_bow]
    index = similarities.MatrixSimilarity.load(INDEX_PATH)
    sims = index[query_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    lineRead = LineReader(jsonFile)  # reuse one reader instead of re-opening it per hit
    for e in sims[:5]:
        s = lineRead.load(e[0] + 1)
        j = json.loads(s)
        print(jsonutil.recursive_get(j, fieldName))
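
# Hypothetical sketch (not part of the original source) of how the artifacts loaded by
# querySimString() could be produced with gensim. `texts` is assumed to be a list of
# token lists (e.g. jsonutil.iterCutFieldList() materialized into a list, since it can
# only be iterated once); num_topics=200 is an arbitrary placeholder. DICTIONARY_PATH,
# LSI_MODEL, and INDEX_PATH are the same module-level paths used above.
def buildLsiIndex(texts, num_topics=200):
    dictionary = corpora.Dictionary(texts)
    dictionary.save(DICTIONARY_PATH)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    lsi.save(LSI_MODEL)
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save(INDEX_PATH)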

def cutWords(sentence):
    print(jiebautil.cutWords(sentence))