def train(params):
    """Train a PGN model, restoring the latest checkpoint when one exists.

    Args:
        params: Run configuration dict. Reads ``gpu_memory``, ``vocab_path``,
            ``max_vocab_size`` and ``checkpoint_dir``; writes ``vocab_size``
            (set to ``vocab.count``) before the model is constructed.
    """
    # GPU resource configuration.
    config_gpu(use_cpu=False, gpu_memory=params['gpu_memory'])

    # Load the vocabulary and publish its real size for the model to use.
    print("Building the model ...")
    vocab = Vocab(params["vocab_path"], params["max_vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model.
    print("Building the model ...")
    model = PGN(params)

    print("Creating the batcher ...")
    dataset = batcher(vocab, params)

    # Checkpoint bookkeeping: keep at most five checkpoints on disk.
    print("Creating the checkpoint manager")
    ckpt = tf.train.Checkpoint(PGN=model)
    manager = tf.train.CheckpointManager(
        ckpt,
        params['checkpoint_dir'],
        max_to_keep=5,
    )
    latest = manager.latest_checkpoint
    ckpt.restore(latest)
    if not latest:
        print("Initializing from scratch.")
    else:
        print("Restored from {}".format(latest))

    # Train the model.
    print("Starting the training ...")
    train_model(model, dataset, params, manager)
def train(params):
    """Train a PGN model, handing the vocabulary straight to ``train_model``.

    Args:
        params: Run configuration dict. Reads ``vocab_path`` and
            ``vocab_size``.

    Note:
        ``checkpoint_dir`` is resolved from module scope; it is not defined
        in this function.
    """
    # GPU resource configuration.
    config_gpu()

    # Load the vocabulary used for training.
    print("Building the model ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    # Build the model.
    print("Building the model ...")
    model = PGN(params)

    # No batcher is built here; train_model receives the vocab instead.
    print("Creating the batcher ...")

    # The checkpoint key stays ``Seq2Seq`` even though the model is a PGN.
    print("Creating the checkpoint manager")
    ckpt = tf.train.Checkpoint(Seq2Seq=model)
    manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=5)
    latest = manager.latest_checkpoint
    ckpt.restore(latest)
    print("Restored from {}".format(latest)
          if latest else "Initializing from scratch.")

    # Train the model.
    print("Starting the training ...")
    train_model(model, vocab, params, manager)
def test(params):
    """Run PGN inference on CPU and print a sample of the predictions.

    The vocabulary is loaded and ``params['vocab_size']`` is updated to
    ``vocab.count`` *before* the model is constructed, so the model is sized
    from the same value the training entry points use.

    Args:
        params: Run configuration dict. Reads ``mode``, ``decode_mode``,
            ``beam_size`` and ``batch_size`` (beam mode only), ``vocab_path``,
            ``vocab_size`` and ``result_save_path``; writes ``vocab_size``.

    Raises:
        AssertionError: If ``mode`` is not "test"/"eval" (case-insensitive),
            or if beam decoding is selected and ``beam_size`` differs from
            ``batch_size``.

    Note:
        ``checkpoint_dir`` is resolved from module scope; it is not defined
        in this function.
    """
    assert params["mode"].lower() in [
        "test", "eval"
    ], "change training mode to 'test' or 'eval'"
    if params['decode_mode'] == 'beam':
        assert params["beam_size"] == params[
            "batch_size"], "Beam size must be equal to batch_size, change the params"

    # GPU resource configuration (inference runs on CPU).
    config_gpu(use_cpu=True)

    # Build the vocab first: the model must see the real vocabulary size.
    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Building the model ...")
    model = PGN(params)

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
    latest = checkpoint_manager.latest_checkpoint
    checkpoint.restore(latest)
    if latest:
        print("Restored from {}".format(latest))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    results = predict_result(model, params, vocab, params['result_save_path'])
    print('save result to :{}'.format(params['result_save_path']))
    # Show only the first few predictions as a sanity check.
    print('save result :{}'.format(results[:5]))
def test(params):
    """Run Seq2Seq inference with greedy or beam-search decoding.

    Two adjustments compared with a naive ordering:

    * the ``beam_size == batch_size`` requirement is only enforced when beam
      search is actually selected (``params['greedy_decode']`` is falsy);
    * the vocabulary is loaded and ``params['vocab_size']`` is updated to
      ``vocab.count`` before the model is constructed.

    Args:
        params: Run configuration dict. Reads ``mode``, ``greedy_decode``,
            ``beam_size``, ``batch_size``, ``vocab_path``, ``vocab_size`` and
            ``result_save_path``; writes ``vocab_size``.

    Raises:
        AssertionError: If ``mode`` is not "test"/"eval" (case-insensitive),
            or if beam decoding is selected and ``beam_size`` differs from
            ``batch_size``.

    Note:
        ``checkpoint_dir`` is resolved from module scope; it is not defined
        in this function.
    """
    assert params["mode"].lower() in ["test", "eval"], "change training mode to 'test' or 'eval'"
    use_greedy = params['greedy_decode']
    if not use_greedy:
        # Only the beam-search path consumes beam_size.
        assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"

    # GPU resource configuration (inference runs on CPU).
    config_gpu(use_cpu=True)

    # Build the vocab first: the model must see the real vocabulary size.
    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Building the model ...")
    model = Seq2Seq(params)

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
    latest = checkpoint_manager.latest_checkpoint
    checkpoint.restore(latest)
    if latest:
        print("Restored from {}".format(latest))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    if use_greedy:
        predict_result(model, params, vocab, params['result_save_path'])
    else:
        batches = bream_test_batch_generator(params["beam_size"])
        # Keep the abstract of the best hypothesis for every batch.
        results = [
            beam_decode(model, batch, vocab, params).abstract
            for batch in batches
        ]
        save_predict_result(results, params['result_save_path'])
        # NOTE(review): assumed to belong to the beam branch, next to the
        # explicit save call — confirm against the original layout.
        print('save result to :{}'.format(params['result_save_path']))
def test(params):
    """Beam-decode every batch with a PGN model and print each result.

    Args:
        params: Run configuration dict. Reads ``mode``, ``beam_size``,
            ``batch_size``, ``vocab_path`` and ``vocab_size``.

    Raises:
        AssertionError: If ``mode`` is not "test" (case-insensitive) or if
            ``beam_size`` differs from ``batch_size``.

    Note:
        ``checkpoint_dir`` is resolved from module scope; it is not defined
        in this function.
    """
    # Only "test" is accepted here, so the message names exactly that.
    assert params["mode"].lower() == "test", "change training mode to 'test'"
    assert params["beam_size"] == params[
        "batch_size"], "Beam size must be equal to batch_size, change the params"

    print("Building the model ...")
    model = PGN(params)

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    print("Creating the batcher ...")
    b = batcher(vocab, params)

    # The checkpoint key stays ``Seq2Seq`` even though the model is a PGN.
    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
    latest = checkpoint_manager.latest_checkpoint
    checkpoint.restore(latest)
    if latest:
        print("Restored from {}".format(latest))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    for batch in b:
        print(beam_decode(model, batch, vocab, params))
def test(params):
    """Yield the beam-search decoding result of every batch (generator).

    Because this function contains ``yield``, nothing below runs until the
    returned generator is iterated — including the assertions.

    ``mode`` is normalised to lower case once and that value is used both for
    validation and for choosing the branch, so "Test" and "test" behave the
    same. The vocabulary is loaded and ``params['vocab_size']`` is updated to
    ``vocab.count`` before the model is constructed.

    Args:
        params: Run configuration dict. Reads ``mode``, ``beam_size``,
            ``batch_size``, ``vocab_path`` and ``vocab_size``; writes
            ``vocab_size``.

    Yields:
        The value returned by ``beam_decode`` for each batch.

    Raises:
        AssertionError: If ``mode`` is not "test"/"eval" (case-insensitive)
            or if ``beam_size`` differs from ``batch_size``.

    Note:
        ``checkpoint_dir`` is resolved from module scope; it is not defined
        in this function.
    """
    mode = params["mode"].lower()
    assert mode in ["test", "eval"], "change training mode to 'test' or 'eval'"
    assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"

    # GPU resource configuration (inference runs on CPU).
    config_gpu(use_cpu=True)

    # Build the vocab first: the model must see the real vocabulary size.
    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Building the model ...")
    model = PGN(params)

    print("Creating the batcher ...")
    b = batcher(vocab, params)

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
    latest = checkpoint_manager.latest_checkpoint
    checkpoint.restore(latest)
    if latest:
        print("Restored from {}".format(latest))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    if mode in ("eval", "test"):
        for batch in b:
            yield beam_decode(model, batch, vocab, params)
    else:
        # Only reachable when assertions are disabled (python -O).
        for batch in b:
            print(beam_decode(model, batch, vocab, params))
def save_example_dataset(params):
    """Dump batches produced by the batcher to numbered pickle files.

    Files are named ``<step>.pickle`` inside ``params['train_pickle_dir']``,
    starting at step 0. Iteration stops after the batch whose index equals
    ``params['max_train_steps']`` has been written, or when the dataset is
    exhausted, whichever comes first.

    Args:
        params: Run configuration dict. Reads ``train_pickle_dir`` and
            ``max_train_steps``; also forwarded to ``batcher``.

    Note:
        ``vocab_path`` is resolved from module scope; it is not defined in
        this function.
    """
    # Build the vocab and the batch source.
    dataset = batcher(Vocab(vocab_path), params)

    # Save one round of batches.
    for step, batch_data in enumerate(dataset):
        # Destination file for this batch.
        pickle_path = os.path.join(params['train_pickle_dir'],
                                   str(step) + '.pickle')
        save_pickle(batch_data, pickle_path)
        if step + 1 > params['max_train_steps']:
            break
def train(params):
    """Train a Seq2Seq model from its initial weights.

    No checkpoint is restored here; the checkpoint manager is only created
    and handed to ``train_model``.

    Args:
        params: Run configuration dict. Reads ``vocab_path``, ``vocab_size``
            and ``checkpoint_dir``; writes ``vocab_size`` (set to
            ``vocab.count``) before the model is constructed.
    """
    # GPU resource configuration.
    config_gpu()

    # Load the vocabulary and publish its real size for the model to use.
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model.
    print("Building the model ...")
    model = Seq2Seq(params)

    # Checkpoint bookkeeping: keep at most five checkpoints on disk.
    manager = tf.train.CheckpointManager(
        tf.train.Checkpoint(Seq2Seq=model),
        params['checkpoint_dir'],
        max_to_keep=5,
    )

    # Train the model.
    train_model(model, vocab, params, manager)
def train(params):
    """Train a PGN model with a learning rate decayed by resumed epoch.

    When a checkpoint exists, the number of trained epochs is taken from the
    full run of digits at the end of the checkpoint path (e.g. ``ckpt-12``
    gives 12), not just its last character. The learning rate is then
    multiplied by ``0.95 ** trained_epoch``.

    Args:
        params: Run configuration dict. Reads ``vocab_path``,
            ``max_vocab_size``, ``checkpoint_dir`` and ``learning_rate``;
            writes ``vocab_size``, ``steps_per_epoch``, ``trained_epoch`` and
            ``learning_rate``.

    Raises:
        ValueError: If the latest checkpoint path does not end in a digit.
    """
    # GPU resource configuration.
    config_gpu()

    # Load the vocabulary and publish its real size for the model to use.
    print("Building the model ...")
    vocab = Vocab(params["vocab_path"], params["max_vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model.
    print("Building the model ...")
    model = PGN(params)

    print("Creating the batcher ...")
    dataset, params['steps_per_epoch'] = batcher(vocab, params)

    # Checkpoint bookkeeping: keep at most five checkpoints on disk.
    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, params['checkpoint_dir'], max_to_keep=5)
    latest = checkpoint_manager.latest_checkpoint
    checkpoint.restore(latest)
    if latest:
        print("Restored from {}".format(latest))
        # Take every trailing digit, so "ckpt-12" resumes at 12 rather than 2.
        trailing_digits = latest[len(latest.rstrip('0123456789')):]
        params["trained_epoch"] = int(trailing_digits)
    else:
        print("Initializing from scratch.")
        params["trained_epoch"] = 1

    # Learning-rate decay based on the epochs already trained.
    params["learning_rate"] *= np.power(0.95, params["trained_epoch"])
    print('learning_rate:{}'.format(params["learning_rate"]))

    # Train the model.
    print("Starting the training ...")
    train_model(model, dataset, params, checkpoint_manager)
def train(params):
    """Train a PGN model on the first visible GPU, if any.

    When at least one GPU is present, only the first one is made visible to
    TensorFlow and memory growth is enabled on it.

    Args:
        params: Run configuration dict. Reads ``vocab_path``,
            ``max_vocab_size`` and ``checkpoint_dir``; writes ``vocab_size``
            (set to ``vocab.count``) before the model is constructed.
    """
    # GPU resource configuration.
    gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
    if gpus:
        primary_gpu = gpus[0]
        tf.config.experimental.set_visible_devices(devices=primary_gpu, device_type='GPU')
        tf.config.experimental.set_memory_growth(primary_gpu, enable=True)

    # Load the vocabulary and publish its real size for the model to use.
    print("Building the model ...")
    vocab = Vocab(params["vocab_path"], params["max_vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model.
    print("Building the model ...")
    model = PGN(params)

    print("Creating the batcher ...")
    dataset = batcher(vocab, params)

    # Checkpoint bookkeeping: keep at most five checkpoints on disk.
    print("Creating the checkpoint manager")
    ckpt = tf.train.Checkpoint(PGN=model)
    manager = tf.train.CheckpointManager(
        ckpt,
        params['checkpoint_dir'],
        max_to_keep=5,
    )
    latest = manager.latest_checkpoint
    ckpt.restore(latest)
    if not latest:
        print("Initializing from scratch.")
    else:
        print("Restored from {}".format(latest))

    # Train the model.
    print("Starting the training ...")
    train_model(model, dataset, params, manager)
"dec_target": entry["target"], "dec_len": entry["dec_len"], "abstract": entry["abstract"], "sample_decoder_pad_mask": entry["sample_decoder_pad_mask"] }) dataset = dataset.map(update) return dataset def batcher(vocab, params): if params['mode'] == 'train' and params['load_batch_train_data']: dataset = load_batch_generator(params) else: dataset = batch_generator(example_generator, params, vocab, params["max_enc_len"], params["max_dec_len"], params["batch_size"], params["mode"]) return dataset if __name__ == '__main__': # 获取参数 params = get_params() params['mode'] = 'test' # vocab 对象 vocab = Vocab(vocab_path) # 获取batch data dataset = batcher(vocab, params) next(iter(dataset))
# output shape == (batch_size * 1, hidden_size) output = tf.reshape(output, (-1, output.shape[2])) # output shape == (batch_size, vocab) prediction = self.fc(output) return prediction, state if __name__ == '__main__': # GPU资源配置 config_gpu() # 获得参数 params = get_params() # 读取vocab训练 vocab = Vocab(params["vocab_path"], params["vocab_size"]) # 计算vocab size vocab_size = vocab.count # 使用GenSim训练好的embedding matrix embedding_matrix = load_word2vec_file(save_wv_model_path) input_sequence_len = 250 BATCH_SIZE = 64 embedding_dim = 500 units = 1024 # 编码器结构 encoder = Encoder(vocab_size, embedding_dim, embedding_matrix, units, BATCH_SIZE) # example_input example_input_batch = tf.ones(shape=(BATCH_SIZE, input_sequence_len),
def build_dataset(train_data_path, test_data_path):
    '''
    Load and preprocess the data, train word vectors, and build index arrays.

    Along the way this writes the segmented CSVs, the train/validation
    split, the padded text, the word2vec model, the vocab dictionaries, the
    embedding matrix and the final ``.npy`` arrays. All output paths are
    module-level names that are not defined in this function.

    :param train_data_path: path of the training-set CSV
    :param test_data_path: path of the test-set CSV
    :return: ``(train_X, train_Y, test_X)`` numpy arrays built from the
        index sequences
    '''
    # 1. Load the data.
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(
        len(train_df), len(test_df)))
    # 2. Drop training rows without a Report; blank out remaining NaNs.
    train_df.dropna(subset=['Report'], inplace=True)
    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)
    # 3. Process the rows in parallel batches.
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)
    # 4. Merge train and test text into one corpus column.
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))
    # 5. Save the processed train and test sets (without the helper column).
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)
    train_df.to_csv(train_seg_path, index=None, header=False)
    test_df.to_csv(test_seg_path, index=None, header=False)
    # 6. Save the merged corpus; it is re-read from disk in step 7.
    merged_df.to_csv(merger_seg_path, index=None, header=False)
    # 7. Train the word vectors.
    # NOTE(review): `size=` / `iter=` here, and `wv.vocab` / `index2word`
    # below, look like pre-4.0 gensim names — confirm the pinned version.
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(merger_seg_path), size=embedding_dim, sg=1, workers=cores, iter=wv_train_epochs, window=5, min_count=5)
    # 8. Separate inputs (X) from labels (Report).
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    # Train / validation split.
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['X'], train_df['Report'],
        test_size=0.002,  # original note: 80k * 0.002
    )
    X_train.to_csv(train_x_seg_path, index=None, header=False)
    y_train.to_csv(train_y_seg_path, index=None, header=False)
    X_val.to_csv(val_x_seg_path, index=None, header=False)
    y_val.to_csv(val_y_seg_path, index=None, header=False)
    test_df['X'].to_csv(test_x_seg_path, index=None, header=False)
    # 9. Add start/end markers, replace unknown words with OOV, pad to length.
    # Use the vocab produced by the gensim training run.
    vocab = wv_model.wv.vocab
    # Training X: one shared maximum length covering both train and test.
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))
    # Test X: padded to the same shared length.
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))
    # Training Y: padded to its own maximum length.
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))
    # 10. Save the padded / OOV-processed data and labels.
    train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
    test_df['X'].to_csv(test_x_pad_path, index=None, header=False)
    # print('train_x_max_len:{} ,train_y_max_len:{}'.format(X_max_len, train_y_max_len))
    # 11. Word-vector retraining (currently disabled).
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # Save the word-vector model. The message below still says "retrain"
    # although the retraining calls above are commented out.
    wv_model.save(save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))
    # 12. Rebuild the vocab as word -> index and index -> word mappings.
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {
        index: word
        for index, word in enumerate(wv_model.wv.index2word)
    }
    # Save both dictionaries.
    save_dict(vocab_path, vocab)
    save_dict(reverse_vocab_path, reverse_vocab)
    # 13. Save the embedding matrix.
    embedding_matrix = wv_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)
    # 14. Convert words to indices,
    # e.g. [<START> 方向机 重 ...] -> [32800, 403, 986, 246, 231].
    # `vocab` is rebound here from the dict above to a Vocab object.
    vocab = Vocab()
    train_ids_x = train_df['X'].apply(
        lambda x: transform_data(x, vocab.word2id))
    train_ids_y = train_df['Y'].apply(
        lambda x: transform_data(x, vocab.word2id))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab.word2id))
    # 15. Convert the index lists to numpy arrays,
    # e.g. [32800, 403, 986, 246, 231] -> array([[32800, 403, 986, ...]]).
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())
    # Save the arrays.
    np.save(train_x_path, train_X)
    np.save(train_y_path, train_Y)
    np.save(test_x_path, test_X)
    return train_X, train_Y, test_X
# the final distribution for that decoder timestep # Note that for decoder timesteps and examples corresponding to a [PAD] token, this is junk - ignore. final_dists = [ vocab_dist + copy_dist for (vocab_dist, copy_dist) in zip(vocab_dists_extended, attn_dists_projected) ] return final_dists if __name__ == '__main__': # GPU资源配置 config_gpu() # 读取vocab训练 vocab = Vocab(save_vocab_path) # 计算vocab size vocab_size = vocab.count # 使用GenSim训练好的embedding matrix embedding_matrix = load_embedding_matrix() params = defaultdict() params["vocab_size"] = vocab_size params["embed_size"] = 300 params["enc_units"] = 512 params["attn_units"] = 512 params["dec_units"] = 512 params["batch_size"] = 64 params["max_enc_len"] = 200 params["max_dec_len"] = 41
# the final distribution for that decoder timestep # Note that for decoder timesteps and examples corresponding to a [PAD] token, this is junk - ignore. final_dists = [ vocab_dist + copy_dist for (vocab_dist, copy_dist) in zip(vocab_dists_extended, attn_dists_projected) ] return final_dists if __name__ == '__main__': # GPU资源配置 config_gpu() # 读取vocab训练 vocab, reverse_vocab = Vocab.load_vocab(save_wv_model_path) # 计算vocab size vocab_size = len(vocab) batch_size = 128 input_sequence_len = 200 params = {} params["vocab_size"] = vocab_size params["embed_size"] = 500 params["enc_units"] = 512 params["attn_units"] = 512 params["dec_units"] = 512 params["batch_size"] = batch_size model = Seq2Seq(params)
def generate_dataset_cache(train_df, test_df, wv_model):
    """Persist padded text, vocab, embeddings and index arrays for both sets.

    Both DataFrames are modified in place: an ``X`` column (Question and
    Dialogue joined by a space) and a padded ``Y`` column are added. Output
    locations come from the module-level ``config`` object.

    The directory for the word2vec model is created with
    ``os.makedirs(..., exist_ok=True)``, and creation is skipped when the
    model path has no directory component.

    Args:
        train_df: DataFrame with ``Question``, ``Dialogue`` and ``Report``
            columns.
        test_df: DataFrame with ``Question``, ``Dialogue`` and ``Report``
            columns.
        wv_model: Trained gensim Word2Vec model (Gensim >= 4 attribute names
            ``key_to_index`` / ``index_to_key`` are used).

    Returns:
        ``(test_X, test_Y, train_X, train_Y)`` numpy arrays built from the
        index sequences — note the test arrays come first.
    """
    # 8. Separate inputs (X) from labels (Report) and save the raw text.
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(' '.join, axis=1)
    train_df['X'].to_csv(config.train_x_seg_path, index=False, header=False)
    train_df['Report'].to_csv(config.train_y_seg_path, index=False, header=False)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(' '.join, axis=1)
    # The test split is written to the *val* segment paths.
    test_df['X'].to_csv(config.val_x_seg_path, index=False, header=False)
    test_df['Report'].to_csv(config.val_y_seg_path, index=False, header=False)

    # 9. Add start/end markers, replace unknown words with OOV, pad to length.
    # Use the vocab from the gensim model ('vocab' was removed from
    # KeyedVectors in Gensim 4.0.0, hence key_to_index).
    vocab = wv_model.wv.key_to_index
    # X uses one shared maximum length covering both train and test.
    X_max_len = max(get_max_len(train_df['X']), get_max_len(test_df['X']))
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))
    test_df['X'] = test_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))
    # Y is padded per split, each to its own maximum length.
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))
    test_y_max_len = get_max_len(test_df['Report'])
    test_df['Y'] = test_df['Report'].apply(
        lambda x: pad_proc(x, test_y_max_len, vocab))

    # 10. Save the padded / OOV-processed data and labels.
    train_df['X'].to_csv(config.train_x_pad_path, index=False, header=False)
    train_df['Y'].to_csv(config.train_y_pad_path, index=False, header=False)
    test_df['X'].to_csv(config.test_x_pad_path, index=False, header=False)
    test_df['Y'].to_csv(config.test_y_pad_path, index=False, header=False)

    # 11. Word-vector retraining is disabled; the model is saved as received.
    model_dir = os.path.dirname(config.save_wv_model_path)
    if model_dir:
        # exist_ok avoids the check-then-create race; an empty dirname means
        # the model is saved into the current directory.
        os.makedirs(model_dir, exist_ok=True)
    wv_model.save(config.save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.key_to_index))

    # 12. Rebuild the vocab as word -> index and index -> word mappings
    # ('index2word' was replaced by 'index_to_key' in Gensim 4.0.0).
    index_to_key = wv_model.wv.index_to_key
    vocab = {word: index for index, word in enumerate(index_to_key)}
    reverse_vocab = dict(enumerate(index_to_key))
    # Save both dictionaries.
    save_dict(config.vocab_path, vocab)
    save_dict(config.reverse_vocab_path, reverse_vocab)

    # 13. Save the embedding matrix.
    embedding_matrix = wv_model.wv.vectors
    np.save(config.embedding_matrix_path, embedding_matrix)

    # 14. Convert words to indices,
    # e.g. [<START> 方向机 重 ...] -> [2, 403, 986, 246, 231].
    # `vocab` is rebound here from the dict above to a Vocab object.
    vocab = Vocab()
    train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_ids_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))
    test_ids_y = test_df['Y'].apply(lambda x: transform_data(x, vocab))

    # 15. Convert the index lists to numpy arrays,
    # e.g. [2, 403, 986, 246, 231] -> array([[2, 403, 986, 246, 231]]).
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())
    test_Y = np.array(test_ids_y.tolist())
    # Save the arrays.
    np.save(config.train_x_path, train_X)
    np.save(config.train_y_path, train_Y)
    np.save(config.test_x_path, test_X)
    np.save(config.test_y_path, test_Y)
    return test_X, test_Y, train_X, train_Y