import pickle

import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences  # or: tensorflow.keras.preprocessing.sequence

# Project-local helpers (split_word, split_word_entity, make_deepLearn_data,
# generate_token, generate_count, clear_entity, generate_co_occurrence, GloVe)
# are assumed to be imported from elsewhere in this project.


def generate_training_data(data_train_file, output_file, word2idx):
    '''
    Generate the tokenized training data.

    Args:
        data_train_file: training-set file
        output_file: output file for the tokenized data
        word2idx: word-to-index mapping
    '''
    data_train = pd.read_pickle(data_train_file)
    x_train_txt0 = data_train.txt_split.apply(split_word)
    X_train_txt, _ = make_deepLearn_data(x_train_txt0, word2idx)
    x_train_title0 = data_train.title_split.apply(split_word)
    X_train_title, _ = make_deepLearn_data(x_train_title0, word2idx)
    x_entity = data_train.entity.apply(split_word_entity).apply(
        generate_token, args=(word2idx, ))
    y_train = data_train.negative.values
    train_data = dict(
        zip(['txt', 'title', 'entity', 'y_train'],
            [X_train_txt, X_train_title, x_entity.values, y_train]))
    with open(output_file, 'wb') as f:
        pickle.dump(train_data, f)
    shape_dic = {
        'txt_shape': X_train_txt.shape[1],
        'title_shape': X_train_title.shape[1],
    }
    return shape_dic
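# Note: generate_token is not defined in this file. A minimal sketch of what it
# is assumed to do here -- map each entity token to its word2idx index, with 0
# (the slot reserved for UNK) for out-of-vocabulary tokens. The implementation
# below is hypothetical and shown only for illustration:
#
#     def generate_token(word_list, word2idx):
#         return [word2idx.get(word, 0) for word in word_list]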
def generate_test_data(data_test_file, output_file, shape_dic, word2idx):
    '''
    Generate the tokenized test data.

    Args:
        data_test_file: test-set file
        output_file: output file for the tokenized data
        shape_dic: txt/title shapes produced by generate_training_data
        word2idx: word-to-index mapping
    '''
    data_test = pd.read_pickle(data_test_file)
    x_test_txt0 = data_test.txt_split.apply(split_word)
    X_test_txt, _ = make_deepLearn_data(x_test_txt0, word2idx)
    x_test_title0 = data_test.title_split.apply(split_word)
    X_test_title, _ = make_deepLearn_data(x_test_title0, word2idx)
    x_entity = data_test.entity.apply(split_word_entity).apply(
        generate_token, args=(word2idx, ))
    # Keep the test-set padding length consistent with the training set
    if shape_dic['txt_shape'] > X_test_txt.shape[1]:
        X_test_txt = pad_sequences(X_test_txt, shape_dic['txt_shape'],
                                   padding='post')
    else:
        X_test_txt = X_test_txt[:, :shape_dic['txt_shape']]
    if shape_dic['title_shape'] > X_test_title.shape[1]:
        X_test_title = pad_sequences(X_test_title, shape_dic['title_shape'],
                                     padding='post')
    else:
        X_test_title = X_test_title[:, :shape_dic['title_shape']]
    # Output file
    test_data = dict(
        zip(['txt', 'title', 'entity'],
            [X_test_txt, X_test_title, x_entity.values]))
    with open(output_file, 'wb') as f:
        pickle.dump(test_data, f)
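# Example usage (the file names below are placeholders): the shape_dic returned
# by generate_training_data is what keeps the test-set padding aligned with the
# training set.
#
#     word2idx = ...  # built further down in this script
#     shape_dic = generate_training_data('data_train.pkl', 'train_token.pkl',
#                                        word2idx)
#     generate_test_data('data_test.pkl', 'test_token.pkl', shape_dic, word2idx)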
# Read all sentences, covering both the test set and the training set
with open('all_word_seg.txt', 'r', encoding='UTF-8') as f:
    sentences = f.readlines()
sentences = [item[:-1].split(' ') for item in sentences]

# Remove low-frequency entities
entities_all_count = generate_count(entities_all, 'all_word_seg.txt')
entities_all_count_keys = clear_entity(entities_all_count, limit_count=3)

# Start indices from 1, reserving 0 for UNK
word2idx = dict(
    zip(entities_all_count_keys, range(1, len(entities_all_count_keys) + 1)))

# Tokenize the sentences into a token matrix
sentences_tokens, _ = make_deepLearn_data(sentences, word2idx)

# Build the co-occurrence matrix
cooccurrence_matrix = generate_co_occurrence(
    sentences_tokens,
    len(word2idx.keys()),
    window_size=5,
)

# Train GloVe and write out the result
glove_model = GloVe(n=dim_n, max_iter=itter_n)
embedMatrix = glove_model.fit(cooccurrence_matrix)

print('load word2vec model...')
model = KeyedVectors.load_word2vec_format('train_vec_byTencent_word.bin',
                                           binary=True)
print('build embedding matrix...')
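# Sketch only: one common way to assemble the embedding matrix from the loaded
# Tencent vectors, assuming row 0 stays reserved for UNK/padding and
# out-of-vocabulary words keep zero rows. The name embedding_matrix is
# illustrative and not taken from the original code.
import numpy as np

embedding_matrix = np.zeros((len(word2idx) + 1, model.vector_size))
for word, idx in word2idx.items():
    if word in model:
        embedding_matrix[idx] = model[word]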