import os
import pickle

import numpy as np

# Project-local helpers (txtRead, getChinese, word_flag_cut, encoding_basic_question,
# question_encoding, load_word2vec_model) and path constants (projectdir and the
# *_path variables used below) are assumed to be imported from the repo's
# utils/conf modules.


def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word):
    """
    Build sentence vectors for the standard questions. sen_count (e.g. 10000)
    batches the writes to disk so the full matrix never has to sit in memory.
    :param sen_count: int, number of sentence encodings to write per batch
    :param word2vec_model: model
    :param qa_path: str
    :param matrix_ques_path_word: str
    :return: the cached matrix if a pickle already exists, else None
    """
    # If a pickled matrix was built earlier, reuse it instead of re-encoding.
    if os.path.exists(matrix_ques_path_word):
        with open(matrix_ques_path_word, 'rb') as file_matrix_ques:
            matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques

    print('create_matrix_org_np start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        # Keep only the Chinese text of the question column.
        ques = getChinese(qa_dail_one.split('\t')[0])
        word_list, flag_list = word_flag_cut(ques)
        sentence_vec = encoding_basic_question(word2vec_model, word_list, flag_list)
        matrix_ques.append(sentence_vec)
        # Flush a full batch to disk and free the in-memory list.
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count) + ".txt", matrix_ques)
            matrix_ques = []

    # Write whatever is left after the last full batch.
    if matrix_ques:
        count += 1
        np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count) + ".txt", matrix_ques)
    print('create_matrix_org_np ok!')
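# encoding_basic_question is project-local; for readers without the repo, here is
# a minimal sketch of what a word-averaging encoder of this shape might look like.
# The name encode_by_word_average and the POS-based weighting are assumptions for
# illustration, not the project's actual implementation (assumes a gensim
# KeyedVectors-style model).
def encode_by_word_average(w2v, word_list, flag_list):
    """Weighted average of in-vocabulary word vectors; zeros if none are found."""
    vecs, weights = [], []
    for word, flag in zip(word_list, flag_list):
        if word in w2v:
            vecs.append(w2v[word])
            # Assumed heuristic: up-weight content words (nouns/verbs) via the POS flag.
            weights.append(1.5 if flag and flag[0] in ('n', 'v') else 1.0)
    if not vecs:
        return np.zeros(w2v.vector_size, dtype=np.float32)
    return np.average(np.asarray(vecs), axis=0, weights=weights)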
# Char-level variant of the word-level encoder above: encodes each question
# character by character and writes batches under sentence_vec_encode_char.
def create_matrix_org_np_char(sen_count, word2vec_model, qa_path, matrix_ques_path):
    """
    Build char-level sentence vectors for the standard questions.
    :param sen_count: int, number of sentence encodings to write per batch
    :param word2vec_model: gensim model
    :param qa_path: str
    :param matrix_ques_path: str
    :return: the cached matrix if a pickle already exists, else None
    """
    if os.path.exists(matrix_ques_path):
        with open(matrix_ques_path, 'rb') as file_matrix_ques:
            matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques

    print('create_matrix_org_np_char start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        ques = getChinese(qa_dail_one.split('\t')[0])
        char_list = list(ques)
        sentence_vec = question_encoding(word2vec_model, char_list)
        matrix_ques.append(sentence_vec)
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count) + ".txt", matrix_ques)
            matrix_ques = []

    if matrix_ques:
        count += 1
        np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count) + ".txt", matrix_ques)
    print('create_matrix_org_np_char ok!')
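# Both encoders leave the vectors scattered across numbered batch files
# (1.txt, 2.txt, ...). A small helper (hypothetical, not part of the repo)
# can stitch them back into a single matrix for retrieval:
def load_question_matrix(encode_dir):
    """Load numbered batch files in order and stack them row-wise."""
    parts = []
    i = 1
    while os.path.exists(os.path.join(encode_dir, str(i) + ".txt")):
        # atleast_2d keeps single-row batches stackable.
        parts.append(np.atleast_2d(np.loadtxt(os.path.join(encode_dir, str(i) + ".txt"))))
        i += 1
    return np.vstack(parts)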
if __name__ == '__main__':
    # Read the QA corpus.
    syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8')

    # Load word vectors: w2v_model_wiki_word_path is self-trained;
    # w2v_model_merge_short_path is a smaller subset you can download instead.
    if os.path.exists(w2v_model_wiki_word_path):
        word2vec_model = load_word2vec_model(w2v_model_wiki_word_path, limit=None)
        print("load w2v_model_wiki_word_path ok!")
    else:
        word2vec_model = load_word2vec_model(w2v_model_merge_short_path, limit=None)
        print("load w2v_model_merge_short_path ok!")

    # Encode the standard questions as sentence vectors and persist them
    # to matrix_ques_part_path.
    if not os.path.exists(matrix_ques_part_path):
        create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model,
                             qa_path=chicken_and_gossip_path,
                             matrix_ques_path_word=matrix_ques_part_path)
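# With the matrix on disk, answering a user query is a nearest-neighbour lookup.
# A minimal cosine-similarity sketch, assuming the matrix rows line up with
# syn_qa_dails and the query is encoded the same way as the corpus
# (top_k_questions is a hypothetical helper, not part of the repo):
def top_k_questions(query_vec, matrix_ques, k=5):
    """Return indices and scores of the k most similar stored questions."""
    q = query_vec / (np.linalg.norm(query_vec) + 1e-12)
    m = matrix_ques / (np.linalg.norm(matrix_ques, axis=1, keepdims=True) + 1e-12)
    scores = m.dot(q)
    top_idx = np.argsort(-scores)[:k]
    return top_idx, scores[top_idx]

# Example wiring (hypothetical):
#   mat = load_question_matrix(projectdir + "/Data/sentence_vec_encode_word")
#   words, flags = word_flag_cut("your query here")
#   idx, sims = top_k_questions(encoding_basic_question(word2vec_model, words, flags), mat)
#   for i, s in zip(idx, sims):
#       print(s, syn_qa_dails[i])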