class Chatbot_port2(object):
    def __init__(self):
        # Tokenizer based on jieba, with stop words removed
        seg = Seg()
        self.ss = SentenceSimilarity(seg)
        self.ss.restore_model()
        with open("dataset/answer.txt", 'r', encoding='utf-8') as file_answer:
            self.line = file_answer.readlines()

    def chat(self, question):
        question = question.strip()
        top_10 = self.ss.similarity(question)
        answer_index = top_10[0][0]
        answer = self.line[answer_index]
        return answer, top_10[0][1]
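# A minimal usage sketch for the class above, assuming the model restored by
# restore_model() and dataset/answer.txt already exist; the example question and
# variable names here are illustrative, not part of the original code.
if __name__ == '__main__':
    bot = Chatbot_port2()
    # chat() returns (answer line, similarity score of the best-matching question)
    reply, score = bot.chat("如何重置密码")
    print(score, reply)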
def dictTest():
    dict = {}
    seg = Seg()
    original_ss = SentenceSimilarity(seg)
    readDictData(original_ss, dict)
    original_ss.TfidfModel()
    # original_ss.LdaModel()
    # original_ss.LsiModel()
    total_data_len = len(X_test)
    success_len = 0
    f1 = open('ah_data_lsi.txt', 'w', encoding='utf-8')
    for i in range(len(X_test)):
        print("-------------------------------------")
        text = checkData(X_test[i])
        text = "".join(seg.cut_for_search(text))
        print("测试内容: " + text)
        try:
            sentences = original_ss.similarityArray(text)
            sentences = sorted(sentences, key=lambda e: e.get_score(), reverse=True)
            count = 0
            for sentence in sentences:
                if sentence.get_score() > 0.9:
                    print(sentence.get_score())
                if sentence.get_score() == 1.0:
                    count = count + 1
            sentence = original_ss.similarity(text)
            if count < 2 and dict.get(sentence.get_origin_sentence()) == Y_test[i]:
                success_len = success_len + 1
            else:
                y = Y_test[i]
                f1.writelines("-------------------------------------\n")
                f1.writelines("测试内容: " + text + "\n")
                for sentence in sentences:
                    f1.writelines("匹配标签: 【" + dict.get(sentence.get_origin_sentence()) +
                                  "】 真实标签:【" + y + "】 评分: " + str(sentence.get_score()) + "\n")
        except Exception as e:
            print(e)
    print(success_len / total_data_len)
def printInfo(event):
    seg = Seg()
    seg.load_userdict('../userdict/userdict.txt')
    # Read data
    List_kw, questionList, answerList = read_corpus1()
    # Initialize the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(questionList)
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    text2.delete(1.0, END)
    question = text1.get('1.0', END)
    # if question == 'q':
    #     break
    time1 = time.time()
    question_k = ss.similarity_k(question, 5)
    text2.insert("insert", ": {}".format(answerList[question_k[0][0]]))
    # print(": {}".format(answerList[question_k[0][0]]))
    # for idx, score in zip(*question_k):
    #     print("same questions: {}, score: {}".format(questionList[idx], score))
    # time2 = time.time()
    # cost = time2 - time1
    # print('Time cost: {} s'.format(cost))
    # entry2.insert(10, question)
    # Clear the entry2 widget
    text1.delete(1.0, END)
    syn(": {}".format(answerList[question_k[0][0]]))
class kuakuaChat():
    def __init__(self):
        """Initialize the kuakua (praise) topic-to-reply table."""
        self.qa_dict = {}
        self.q_list = []
        with open('./douban_kuakua_topic.txt', 'r', encoding='utf8') as in_file:
            for line in in_file.readlines():
                que = line.split('<######>')[0].strip()
                ans_list = []
                for ans in line.split('<######>')[-1].split('<$$$$$$>'):
                    if len(ans) > 2:
                        ans_list.append(ans)
                if len(que) > 5:
                    self.q_list.append(que)
                    self.qa_dict[que] = ans_list
        zhcn_seg = zhcnSeg()
        self.sent_sim = SentenceSimilarity(zhcn_seg)
        self.sent_sim.set_sentences(self.q_list)
        # tfidf is used by default
        self.sent_sim.TfidfModel()

    def answer_question(self, question_str):
        """
        Return the canned replies of the stored questions most similar to the input question.
        :param question_str:
        :return:
        """
        most_sim_questions = self.sent_sim.similarity_top_k(question_str, 4)
        answer_list = []
        for item in most_sim_questions:
            answer = self.qa_dict[item[0]]
            answer_list += answer
        return answer_list
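# A minimal usage sketch for kuakuaChat, assuming ./douban_kuakua_topic.txt is present;
# the example question is illustrative only.
if __name__ == '__main__':
    bot = kuakuaChat()
    # answer_question() pools the replies of the 4 most similar stored topics
    for reply in bot.answer_question("今天终于把论文写完了,求夸"):
        print(reply.strip())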
def tf():
    dt = {}
    # if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"train_data.txt")
    train_sentences = file_obj.read_lines()
    # Read the test set
    file_obj = FileObj(r"test_data.txt")
    test1_sentences = file_obj.read_lines()
    # Tokenizer: jieba with a thin wrapper of my own, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    # Test set 1
    right_count = 0
    # w = open("result510tf.txt", 'w')
    # w.write(str("source_id") + '\t' + str("target_id") + '\n')
    for i in range(len(test1_sentences)):
        print("*********************")
        print(i)
        print(test1_sentences[i])
        test = test1_sentences[i]
        t = test.split(',')[0]
        dict = ss.similarity(test1_sentences[i])
        # dict: key is the sentence index minus one, value is the computed distance
        for k, v in dict:
            print(t, k + 1, v)   # e.g. 2784 2784 1.0
            ind2 = k + 1
            if str(k + 1) == str(t):
                print("same")
            else:
                # w.write(str(t) + '\t' + str(k+1) + '\n')
                addtodict2(dt, int(t), int(ind2), v)
    # w.close()
    return dt
def run_prediction(input_file_path, output_file_path):
    # Read the training set
    file_obj = FileObj(r"./TFIDF_baseline/dataSet/trainQuestions.txt")
    train_sentences = file_obj.read_lines()
    # Read the test set
    file_obj = FileObj(input_file_path)
    test_sentences = file_obj.read_lines()
    # Tokenizer based on jieba, with stop words removed
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()   # tfidf model
    # Test set
    right_count = 0
    file_result = open(output_file_path, 'w')
    with open("./TFIDF_baseline/dataSet/trainAnswers.txt", 'r', encoding='utf-8') as file_answer:
        line = file_answer.readlines()
        for i in range(0, len(test_sentences)):
            top_15 = ss.similarity(test_sentences[i])
            '''
            for j in range(0, len(top_15)):
                answer_index = top_15[j][0]
                answer = line[answer_index]
                file_result.write(str(top_15[j][1]) + '\t' + str(answer))
            file_result.write("\n")
            '''
            file_result.write(line[top_15[0][0]] + '\n')
    file_result.close()
def main(question, top_k, task='faq'):
    # Read data
    if task == 'chat':
        qList_kw, questionList, answerList = read_corpus2()
    else:
        qList_kw, questionList, answerList = read_corpus1()
    """Simple inverted index"""
    # Build the inverted table
    invertTable = invert_idxTable(qList_kw)
    inputQuestionKW = seg.cut(question)
    # Use keyword matching to narrow the candidates down to questions similar to the input
    questionList_s, answerList_s = filter_questionByInvertTab(
        inputQuestionKW, questionList, answerList, invertTable)
    # Initialize the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(questionList_s)
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    question_k = ss.similarity_k(question, top_k)
    return question_k, questionList_s, answerList_s
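# invert_idxTable() and filter_questionByInvertTab() are referenced above but not shown.
# The sketch below is one plausible shape, assuming qList_kw is a list of keyword lists
# (one per question); it is an illustration, not the original implementation.
def invert_idxTable(qList_kw):
    # Map each keyword to the set of question indices containing it
    table = {}
    for idx, kws in enumerate(qList_kw):
        for kw in kws:
            table.setdefault(kw, set()).add(idx)
    return table


def filter_questionByInvertTab(inputQuestionKW, questionList, answerList, invertTable):
    # Keep only the questions sharing at least one keyword with the input
    idxs = set()
    for kw in inputQuestionKW:
        idxs |= invertTable.get(kw, set())
    if not idxs:
        # Fall back to the full lists when nothing matches
        return questionList, answerList
    idxs = sorted(idxs)
    return [questionList[i] for i in idxs], [answerList[i] for i in idxs]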
from cutWords import Seg
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
import time
from time import ctime
import threading

file_obj = FileObj(r"dataSet/train_q.txt")
train_sentences = file_obj.read_lines()

with open("dataSet/train_a.txt", 'r', encoding='utf-8') as file_answer:
    line = file_answer.readlines()

seg = Seg()

# Train the models
ss1 = SentenceSimilarity(seg)
ss1.set_sentences(train_sentences)
ss1.TfidfModel()   # tfidf model

ss2 = SentenceSimilarity(seg)
ss2.set_sentences(train_sentences)
ss2.LsiModel()     # LSI model


def tfidf_model(sentence):
    top = ss1.similarity(sentence)
    answer_index = top[0][0]
    answer = line[answer_index]
    return top[0][1], answer
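# ss2 (the LSI model) is built above but not used in this excerpt. A symmetric helper,
# mirroring tfidf_model() and assuming ss2.similarity() also returns (index, score)
# pairs, might look like this; it is a sketch, not the original code.
def lsi_model(sentence):
    top = ss2.similarity(sentence)
    answer_index = top[0][0]
    answer = line[answer_index]
    return top[0][1], answer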
from tqdm import tqdm

if __name__ == '__main__':
    # Read the training set
    # file_obj = FileObj(r"dataSet/trainQuestions.txt")
    # train_sentences = file_obj.read_lines()
    # Read the test set
    file_obj = FileObj(r"dataSet/devQuestions.txt")
    test_sentences = file_obj.read_lines()
    # Tokenizer based on jieba, with stop words removed
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(test_sentences)
    ss.TfidfModel()   # tfidf model
    # Test set
    right_count = 0
    file_result = open('dataSet/result.txt', 'w')
    with open("dataSet/trainAnswers.txt", 'r', encoding='utf-8') as file_answer:
        line = file_answer.readlines()
        for i in tqdm(range(0, len(test_sentences))):
            top_15 = ss.similarity(test_sentences[i])
            for j in range(0, len(top_15)):
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("单词总数: ", fDist.N())
    print("不同单词数: ", fDist.B())
    fDist.plot(10)


if __name__ == '__main__':
    # Load the external user dictionary
    seg = Seg()
    seg.load_userdict('./userdict/userdict.txt')
    # Read data
    List_kw, questionList, answerList = read_corpus()
    # Initialize the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(questionList)
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    while True:
        question = input("请输入问题(q退出): ")
        if question == 'q':
            break
        time1 = time.time()
        question_k = ss.similarity_k(question, 5)
        print("亲,我们给您找到的答案是: {}".format(answerList[question_k[0][0]]))
        for idx, score in zip(*question_k):
            print("same questions: {}, score: {}".format(questionList[idx], score))
        time2 = time.time()
answer_list = [x[1] for x in list_qa_dataset]

numQuestions_train = len(question_list) - 100
numQuestions_test = 100
question_list_train = question_list[:numQuestions_train]
question_list_test = question_list[numQuestions_train:]
answer_list_train = answer_list[:numQuestions_train]
answer_list_test = answer_list[numQuestions_train:]

# Tokenizer: jieba with a thin wrapper of my own, mainly to remove stop words
seg = Seg()

# Train the model
ss = SentenceSimilarity(seg)
ss.set_sentences(question_list_train)
# ss.TfidfModel()   # tfidf model
ss.LsiModel()       # lsi model
# ss.LdaModel()     # lda model

# Test set 1
right_count = 0
for i in range(0, len(question_list_test)):
    sentenceK = ss.similarityK(question_list_test[i])
    print(' ')
    print('question: %s' % question_list_test[i])
    for k in range(len(sentenceK)):
        sentence_k = sentenceK[k]
        org_sentence = sentence_k.origin_sentence
        sentence_id = sentence_k.id
file_obj = FileObj(r"testSet/data2") train_sentences = file_obj.read_lines() # 读入测试集1 file_obj = FileObj(r"testSet/testSet3") test1_sentences = file_obj.read_lines() # 读入测试集2 #file_obj = FileObj(r"testSet/testSet2.txt") #test2_sentences = file_obj.read_lines() # 分词工具,基于jieba分词,我自己加了一次封装,主要是去除停用词 seg = Seg() # 训练模型 ss = SentenceSimilarity(seg) ss.set_sentences(train_sentences) #ss.TfidfModel() # tfidf模型 #ss.LsiModel() # lsi模型 #ss.LdaModel() # lda模型 ss.FasttxModel() # 测试集1 right_count = 0 for i in range(0, len(train_sentences)): print(test1_sentences[i]) ss.similarity2(test1_sentences[i]) print("\r\n") # 测试集2 # right_count = 0
file_obj = FileObj(r"testSet/trainSet.txt") train_sentences = file_obj.read_lines() # 读入测试集1 file_obj = FileObj(r"testSet/testSet1.txt") test1_sentences = file_obj.read_lines() # 读入测试集2 file_obj = FileObj(r"testSet/testSet2.txt") test2_sentences = file_obj.read_lines() # 分词工具,基于jieba分词,我自己加了一次封装,主要是去除停用词 seg = Seg() # 训练模型 ss = SentenceSimilarity(seg) ss.set_sentences(train_sentences) ss.TfidfModel() # tfidf模型 # ss.LsiModel() # lsi模型 # ss.LdaModel() # lda模型 # 测试集1 right_count = 0 for i in range(0,len(train_sentences)): sentence = ss.similarity(test1_sentences[i]) if i != sentence.id: print str(i) + " wrong! score: " + str(sentence.score) else: right_count += 1 print str(i) + " right! score: " + str(sentence.score)
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("单词总数: ", fDist.N())
    print("不同单词数: ", fDist.B())
    fDist.plot(10)


if __name__ == '__main__':
    # Load the external user dictionary
    seg = Seg()
    seg.load_userdict('userdict/userdict.txt')   # merge a custom vocabulary into the default dictionary
    # Read data
    _, questionList, answerList = read_corpus()
    # Initialize the model
    ss = SentenceSimilarity(seg)                 # sets the self.reg attribute
    ss.set_sentences(questionList)               # sets self.sentences, a list of Sentence objects
    ss.TfidfModel()   # tfidf model
    # ss.LsiModel()   # lsi model
    # ss.LdaModel()   # lda model
    while True:
        question = input("请输入问题(q退出): ")
        if question == 'q':
            break
        time1 = time.time()
        question_k = ss.similarity_k(question, 5)
        print("亲,我们给您找到的答案是: {}".format(answerList[question_k[0][0]]))
        for idx, score in zip(*question_k):
            print("same questions: {}, score: {}".format(
                questionList[idx], score))
file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/trainSet.txt") train_sentences = file_obj.read_lines() # 读入测试集1 file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/testSet1.txt") test1_sentences = file_obj.read_lines() # 读入测试集2 file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/testSet2.txt") test2_sentences = file_obj.read_lines() # 分词工具,基于jieba分词,我自己加了一次封装,主要是去除停用词 seg = Seg() # 训练模型 ss = SentenceSimilarity(seg) ss.set_sentences(train_sentences) ss.TfidfModel() # tfidf模型 # ss.LsiModel() # lsi模型 # ss.LdaModel() # lda模型 # 测试集1 right_count = 0 for i in range(0,len(train_sentences)): sentence = ss.similarity(test1_sentences[i]) if i != sentence.id: print (str(i) + " wrong! score: " + str(sentence.score)) else: right_count += 1 print (str(i) + " right! score: " + str(sentence.score))
# Compute the similarity between each sentence in important_sentence and the question
# Read the training set
# TODO: the training set is currently the original wiki content plus the question;
#       change the training set later?
train_sentence = list_sentence
train_sentence.append(question)

# Build the test set
test_sentence = important_sentence

# Tokenizer: jieba with a thin wrapper, mainly to remove stop words
seg = Seg()

# Train the model
ss = SentenceSimilarity(seg)
ss.set_sentences(train_sentence)
ss.TfidfModel()   # tfidf model
# ss.LsiModel()   # lsi model
# ss.LdaModel()   # lda model

# Score each candidate sentence against the question
score_sentence = []
for i in range(0, len(test_sentence)):
    score = ss.MYsimilarity(question, test_sentence[i])
    score_sentence.append(score)

new_score = ss.MYsimilarity2(question, test_sentence)

# Write the results to a file
WriteFile("sentence.txt", important_sentence)
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/tjmsnew.txt")
    train_sentences = file_obj.read_lines()

    file_obj = FileObj(r"testSet/zhenduanxx-utf.txt")
    test1_sentences = file_obj.read_lines()
    # test1_sentences = "子宫 肌瘤"

    # Tokenizer based on jieba, mainly to remove stop words
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    # ss.TfidfModel()   # tfidf model
    # ss.LsiModel()     # lsi model
    ss.LdaModel()       # lda model
    # ss.W2Vmodel()

    for j in range(0, len(test1_sentences)):
        sentence = ss.similarity(test1_sentences[j], j)
    '''
    # Test set 1
    right_count = 0
    file = open("result6.txt", "a")
    for j in range(0, len(test1_sentences)):
        sentence = ss.similarity(test1_sentences[j])
        file.write(str(sentence.origin_sentence) + str(sentence.score) + "\n")
        file.flush()
if __name__ == '__main__':
    start = clock()

    # Read the second half of the corpus
    file_obj = FileObj(r"sentence2.txt")
    train_sentences = file_obj.read_lines()

    # Read the first half of the corpus
    file_obj = FileObj(r"sentence1.txt")
    test1_sentences = file_obj.read_lines()

    # Tokenizer: jieba with a thin wrapper, mainly to remove stop words
    seg = Seg()

    # Build the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    # ss.TfidfModel()   # tfidf model
    ss.LsiModel()       # lsi model
    # ss.LdaModel()     # lda model

    # Compute sentence similarities
    # for i in range(0, len(train_sentences) / 100):
    #     mysims = ss.mysimilarity(test1_sentences[i * 100])
    #     # Treat every hundred lines as one group
    #     sims_divided = mysims[i * 100:(i + 1) * 100]
    #     # Sort the similarities within the group
    #     sort_sims = sorted(enumerate(sims_divided), key=lambda item: -item[1])
    #     # Output the five highest similarities
    #     chosen_sims = sort_sims[:5]
    #     for j in range(0, 5):
train_sentences = rf.readlines()

# Read the test set
with open('dataset/test_input.txt', 'r', encoding='utf-8') as rf:
    raw_test_sentences = rf.readlines()
test_sentences = []
for sen in raw_test_sentences:
    test_sentences.append(sen.strip())
for sen in test_sentences:
    print(sen)

# Tokenizer based on jieba, with stop words removed
seg = Seg()

# Train or restore the model
ss = SentenceSimilarity(seg)
if train:
    ss.set_sentences(train_sentences)
    ss.TfidfModel()   # tfidf model
    ss.save_model()
else:
    ss.restore_model()

# Test set
right_count = 0
print(os.getcwd())
file_result = open('dataset/test_output.txt', 'w', encoding='utf-8')
with open("dataset/answer.txt", 'r', encoding='utf-8') as file_answer:
    line = file_answer.readlines()