import os
import pickle

import numpy as np

# txtRead, getChinese, word_flag_cut, encoding_basic_question, question_encoding,
# load_word2vec_model, projectdir and the path constants are assumed to come
# from the repository's utils and config modules.

def create_matrix_org_np(sen_count, word2vec_model, qa_path,
                         matrix_ques_path_word):
    """
      Create sentence vectors for the questions; set sen_count=10000 to keep
      memory usage bounded and avoid crashing.
    :param sen_count: int, number of sentence encodings to write per dump
    :param word2vec_model: model
    :param qa_path: str
    :param matrix_ques_path_word: str
    :return: the cached matrix if it already exists, otherwise None
    """
    if os.path.exists(matrix_ques_path_word):
        with open(matrix_ques_path_word, 'rb') as file_matrix_ques:
            matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques
    print('create_matrix_org_pkl start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    # questions = []
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        ques = getChinese(qa_dail_one.split('\t')[0])
        # questions.append(ques)
        word_list, flag_list = word_flag_cut(ques)
        sentence_vec = encoding_basic_question(word2vec_model, word_list,
                                               flag_list)
        matrix_ques.append(sentence_vec)
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(
                projectdir + "/Data/sentence_vec_encode_word/" + str(count) +
                ".txt", matrix_ques)
            matrix_ques = []
            # break

    count += 1
    np.savetxt(
        projectdir + "/Data/sentence_vec_encode_word/" + str(count) + ".txt",
        matrix_ques)
    # matrix_ques = []
    # file_matrix_ques = open(matrix_ques_path, 'wb')
    # pickle.dump(matrix_ques, file_matrix_ques)
    print('create_matrix_org_np ok!')
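For orientation, here is a minimal sketch of what the segmentation and encoding steps above could look like, assuming jieba POS segmentation and simple mean pooling of word vectors. The names word_flag_cut_sketch and mean_pool_encode are hypothetical; the project's actual word_flag_cut / encoding_basic_question helpers may differ, for example by weighting words by their POS flags.

import jieba.posseg as pseg
import numpy as np


def word_flag_cut_sketch(sentence):
    """Segment a sentence into parallel (word, POS-flag) lists with jieba."""
    words, flags = [], []
    for pair in pseg.cut(sentence):
        words.append(pair.word)
        flags.append(pair.flag)
    return words, flags


def mean_pool_encode(word2vec_model, word_list):
    """Average the vectors of in-vocabulary words into one sentence vector."""
    # assumes word2vec_model behaves like gensim KeyedVectors
    vecs = [word2vec_model[w] for w in word_list if w in word2vec_model]
    if not vecs:
        # no known word: fall back to a zero vector of the model's size
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vecs, axis=0)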
Example #2
def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
    """
      创建问题句向量
    :param sen_count: int
    :param word2vec_model: gensim model
    :param qa_path: str
    :param matrix_ques_path:str 
    :return: None
    """
    if os.path.exists(matrix_ques_path):
        with open(matrix_ques_path, 'rb') as file_matrix_ques:
            matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques
    print('create_matrix_org_pkl start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    # questions = []
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        ques = getChinese(qa_dail_one.split('\t')[0])
        char_list = list(ques)
        sentence_vec = question_encoding(word2vec_model, char_list)
        matrix_ques.append(sentence_vec)
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(
                projectdir + "/Data/sentence_vec_encode_char/" + str(count) +
                ".txt", matrix_ques)
            matrix_ques = []
            # break

    count += 1
    np.savetxt(
        projectdir + "/Data/sentence_vec_encode_char/" + str(count) + ".txt",
        matrix_ques)

    print('create_matrix_org_pkl ok!')
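Because the question vectors are written out in numbered chunks, reading them back means loading every numbered .txt file in order and stacking the rows. A sketch of such a loader, with load_question_matrix as a hypothetical helper name:

import glob
import os

import numpy as np


def load_question_matrix(vec_dir):
    """Stack every numbered chunk written by create_matrix_org_np."""
    paths = sorted(glob.glob(os.path.join(vec_dir, "*.txt")),
                   key=lambda p: int(os.path.basename(p).split(".")[0]))
    parts = []
    for path in paths:
        chunk = np.loadtxt(path)
        if chunk.size:  # the final chunk can be empty
            parts.append(np.atleast_2d(chunk))
    return np.vstack(parts)

# e.g. matrix_ques = load_question_matrix(projectdir + "/Data/sentence_vec_encode_char")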


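The __main__ block below relies on load_word2vec_model; assuming the vectors are stored in standard word2vec text or binary format, a thin wrapper over gensim's KeyedVectors would behave roughly like this sketch (not the project's actual loader):

from gensim.models import KeyedVectors


def load_word2vec_model_sketch(path, limit=None, binary=False):
    # limit caps how many vectors are read, which bounds memory usage
    return KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)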
if __name__ == '__main__':
    # Read the question-answer corpus
    syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8')

    # Load the word vectors; w2v_model_wiki_word_path points to self-trained
    # vectors, while w2v_model_merge_short_path only keeps part of the data and
    # can be downloaded separately.
    if os.path.exists(w2v_model_wiki_word_path):
        word2vec_model = load_word2vec_model(w2v_model_wiki_word_path,
                                             limit=None)
        print("load w2v_model_wiki_word_path ok!")
    else:
        word2vec_model = load_word2vec_model(w2v_model_merge_short_path,
                                             limit=None)
        print("load w2v_model_merge_short_path ok!")

    # Create sentence vectors for the questions in the standard QA set and
    # save them to matrix_ques_path
    if not os.path.exists(matrix_ques_part_path):
        create_matrix_org_np(sen_count=100000,
                             word2vec_model=word2vec_model,