def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word):
    """Create word-level sentence vectors for the questions of a QA corpus.

    Vectors are buffered and flushed to disk every ``sen_count`` rows, so the
    in-memory buffer never holds more than one chunk (the original author
    recommends ``sen_count=10000`` to avoid running out of memory).

    If ``matrix_ques_path_word`` already exists it is treated as a pickle
    cache: its content is loaded and returned, and nothing is rebuilt.

    Chunk files are written with ``np.savetxt`` to
    ``projectdir + "/Data/sentence_vec_encode_word/<k>.txt"``, ``k`` counting
    up from 1.

    :param sen_count: int, number of sentence vectors written per chunk file
    :param word2vec_model: model handed to ``encoding_question``
    :param qa_path: str, corpus file read with ``txtRead``; the text before
        the first tab of each line is used as the question
    :param matrix_ques_path_word: str, path of the pickle cache checked on entry
    :return: the unpickled object when the cache file exists, otherwise None
    """
    if os.path.exists(matrix_ques_path_word):
        # NOTE(review): pickle executes arbitrary code on load; this path must
        # only ever point at a file produced by this project.
        # The context manager closes the handle even if unpickling fails.
        with open(matrix_ques_path_word, 'rb') as file_matrix_ques:
            return pickle.load(file_matrix_ques)

    print('create_matrix_org_pkl start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    out_dir = projectdir + "/Data/sentence_vec_encode_word/"
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        # Only the question column (before the first tab) is encoded.
        ques = getChinese(qa_dail_one.split('\t')[0])
        word_list, flag_list = word_segment_process(ques)
        sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
        matrix_ques.append(sentence_vec)
        # The buffer was just appended to, so it is never empty here; it is
        # flushed each time it reaches a multiple of sen_count rows.
        if len(matrix_ques) % sen_count == 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(out_dir + str(count) + ".txt", matrix_ques)
            matrix_ques = []

    # Write whatever is left in the buffer (possibly nothing) as the last chunk.
    count += 1
    np.savetxt(out_dir + str(count) + ".txt", matrix_ques)
    print('create_matrix_org_np ok!')
def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
    """Create character-level sentence vectors for the questions of a QA corpus.

    If ``matrix_ques_path`` already exists it is treated as a pickle cache:
    its content is loaded and returned, and nothing is rebuilt.

    Otherwise each question is split into single characters, encoded with
    ``question_encoding`` and buffered. As soon as the buffer holds
    ``sen_count`` vectors it is written with ``np.savetxt`` to
    ``projectdir + "/Data/sentence_vec_encode_char/1.txt"`` and the loop
    stops, so at most the first ``sen_count`` questions are encoded. After the
    loop the remaining buffer (empty when the loop stopped early) is written
    to the next numbered file.

    :param sen_count: int, number of sentence vectors that triggers the write
    :param word2vec_model: gensim model handed to ``question_encoding``
    :param qa_path: str, corpus file read with ``txtRead``; the text before
        the first tab of each line is used as the question
    :param matrix_ques_path: str, path of the pickle cache checked on entry
    :return: the unpickled object when the cache file exists, otherwise None
    """
    if os.path.exists(matrix_ques_path):
        # NOTE(review): pickle executes arbitrary code on load; this path must
        # only ever point at a file produced by this project.
        # The context manager closes the handle even if unpickling fails.
        with open(matrix_ques_path, 'rb') as file_matrix_ques:
            return pickle.load(file_matrix_ques)

    print('create_matrix_org_pkl start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    out_dir = projectdir + "/Data/sentence_vec_encode_char/"
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        # Only the question column (before the first tab) is encoded.
        ques = getChinese(qa_dail_one.split('\t')[0])
        char_list = list(ques)
        sentence_vec = question_encoding(word2vec_model, char_list)
        matrix_ques.append(sentence_vec)
        # The buffer was just appended to, so it is never empty here.
        if len(matrix_ques) % sen_count == 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(out_dir + str(count) + ".txt", matrix_ques)
            matrix_ques = []
            # Only the first full chunk is produced; stop reading the corpus.
            break

    # Write the remaining buffer (empty after an early stop) as the next file.
    count += 1
    np.savetxt(out_dir + str(count) + ".txt", matrix_ques)
    print('create_matrix_org_pkl ok!')
# Build the sentence vectors of the standard QA questions and store them on
# disk, unless the matrix file is already there.
if not os.path.exists(matrix_ques_part_path):
    create_matrix_org_np(
        sen_count=100000,
        word2vec_model=word2vec_model,
        qa_path=chicken_and_gossip_path,
        matrix_ques_path_word=matrix_ques_part_path,
    )

# Load the question matrix.
print("np.loadtxt(matrix_ques_part_path) start!")
matrix_ques = np.loadtxt(matrix_ques_part_path)
print("np.loadtxt(matrix_ques_part_path) end!")

# Interactive loop: read a question, encode it the same way as the corpus
# questions (clean -> segment -> encode) and print the closest matches.
while True:
    print("你: ")
    ques_ask = input()
    ques_clean = getChinese(ques_ask)
    word_list, flag_list = word_segment_process(ques_clean)
    sentence_vic = encoding_question(word2vec_model, word_list, flag_list)
    top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
    try:
        # Each corpus line is "question<TAB>answer"; the best hit's answer is
        # printed first, then every candidate as a (question, answer) pair.
        best_line = syn_qa_dails[top_20_qid[0][0]]
        print("小姜机器人: " + best_line.strip().split("\t")[1])
        candidate_fields = (
            syn_qa_dails[hit[0]].strip().split("\t") for hit in top_20_qid
        )
        print([(fields[0], fields[1]) for fields in candidate_fields])
    except Exception as e:
        # Best-effort output: some characters may not be printable, so the
        # error is reported and the loop carries on with the next question.
        print(str(e))
# Build the character-level question vectors only when the matrix file is
# missing. The builder unpickles an existing path, while this script reads the
# same path as text with np.loadtxt below, so calling it again once the file
# exists would try to unpickle a text file.
if not os.path.exists(matrix_ques_part_path_char):
    create_matrix_org_np(
        sen_count=100000,
        word2vec_model=word2vec_model,
        qa_path=chicken_and_gossip_path,
        matrix_ques_path=matrix_ques_part_path_char,
    )

# Load the standard-question matrix.
print("np.loadtxt(matrix_ques_part_path) start!")
matrix_ques = np.loadtxt(matrix_ques_part_path_char)
print("np.loadtxt(matrix_ques_part_path) end!")

# Initialise and preprocess the standard-question matrix once, up front.
matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(
    matrix_ques, top_vec=20)

# Try a single example query, encoded character by character.
ques_clean = getChinese("小姜机器人是谁呀")
char_list = list(ques_clean)
sentence_vec = question_encoding(word2vec_model, char_list)
top_20_qid = calculate_text_similar(
    sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
try:
    # Each corpus line is "question<TAB>answer"; the best hit's answer is
    # printed first, then every candidate as a (question, answer) pair.
    best_line = syn_qa_dails[top_20_qid[0][0]]
    print("小姜机器人: " + best_line.strip().split("\t")[1])
    candidate_fields = (
        syn_qa_dails[hit[0]].strip().split("\t") for hit in top_20_qid
    )
    print([(fields[0], fields[1]) for fields in candidate_fields])
except Exception as e:
    # Best-effort output: some characters may not be printable, so the error
    # is reported instead of aborting the script.
    print(str(e))