def test(params): assert params["mode"].lower() == "test", "change training mode to 'test' or 'eval'" # assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params" vocab = Vocab(params["vocab_path"], params["vocab_size"]) params['vocab_size'] = vocab.count print("Creating the batcher ...") dataset, params['steps_per_epoch'] = batcher(vocab, params) print("Building the model ...") model = PGN_TRANSFORMER(params) print("Creating the checkpoint manager") ckpt = tf.train.Checkpoint(PGN_TRANSFORMER=model) ckpt_manager = tf.train.CheckpointManager(ckpt, params['transformer_model_dir'], max_to_keep=5) # path = params["model_path"] if params["model_path"] else ckpt_manager.latest_checkpoint # path = ckpt_manager.latest_checkpoint ckpt.restore(ckpt_manager.latest_checkpoint) print("Model restored") for batch in dataset: if params['decode_mode'] == "greedy": yield greedy_decode(model, dataset, vocab, params) else: yield beam_decode(model, batch, vocab, params, params['print_info'])
def train(params):
    # GPU configuration
    # config_gpu(use_cpu=True, gpu_memory=params['gpu_memory'])

    # Build the vocabulary
    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model
    print("Building the model ...")
    # model = Seq2Seq(params)
    model = PGN(params)

    print("Creating the batcher ...")
    dataset = batcher(vocab, params)
    # print('dataset is ', dataset)

    # Create the checkpoint manager
    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, params['checkpoint_dir'], max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    # Train the model
    print("Starting the training ...")
    train_model(model, dataset, params, checkpoint_manager)
def test(params):
    assert params["mode"].lower() in ["test", "eval"], "change training mode to 'test' or 'eval'"
    if params['decode_mode'] == 'beam':
        assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"

    # GPU configuration
    config_gpu()

    print("Building the model ...")
    model = PGN(params)

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, pgn_checkpoint_dir, max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    results = predict_result(model, params, vocab, params['result_save_path'])
def train(params):
    assert params["mode"].lower() == "train", "change training mode to 'train'"

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Creating the batcher ...")
    batch, params['steps_per_epoch'] = batcher(vocab, params)

    print("Building the model ...")
    model = PGN_TRANSFORMER(params)

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN_TRANSFORMER=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, params['transformer_model_dir'], max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
        # NOTE: only the last character of the checkpoint name is used here,
        # so this assumes fewer than 10 saved checkpoints.
        params["trained_epoch"] = int(checkpoint_manager.latest_checkpoint[-1])
    else:
        print("Initializing from scratch.")
        params["trained_epoch"] = 1

    print("Starting the training ...")
    train_model(model, batch, params, checkpoint_manager)
def test(params):
    assert params["mode"].lower() in ["test", "eval"], "change training mode to 'test' or 'eval'"
    assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"

    # GPU configuration (for PyTorch, just select the device)
    # config_gpu()
    device = torch.device('cuda:0' if torch.cuda.is_available() and params['device'] == 'cuda' else 'cpu')
    params['device'] = device

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Building the model ...")
    model = Seq2Seq(params, vocab).to(device)
    # print(model.state_dict())

    print("Creating the checkpoint manager")
    path_checkpoint = Path(params['checkpoint_dir']).joinpath('epoch10.bin')
    checkpoint = torch.load(path_checkpoint)
    # Load the saved weights (assumes the checkpoint dict stores them under 'model_state_dict')
    model.load_state_dict(checkpoint['model_state_dict'])
    # checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    # checkpoint_manager = tf.train.CheckpointManager(checkpoint, seq2seq_checkpoint_dir, max_to_keep=5)
    # checkpoint.restore(checkpoint_manager.latest_checkpoint)
    # checkpoint.restore('../../data/checkpoints/training_checkpoints_seq2seq/ckpt-6')
    # if checkpoint_manager.latest_checkpoint:
    #     print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    # else:
    #     print("Initializing from scratch.")
    # print(model.state_dict())
    print("Model restored")

    if params['greedy_decode']:
        print('Using greedy search for decoding ...')
        predict_result(model, params, vocab)
    else:
        print('Using beam search for decoding ...')
        b = Beam_Test_Dataloader(test_x_path, test_y_path, params["beam_size"])
        # b = beam_test_batch_generator(params["beam_size"])
        results = []
        for batch in b.loader:
            batch = batch.to(params['device'])
            best_hyp = beam_decode(model, batch, vocab, params)
            results.append(best_hyp.abstract)
        get_rouge(results)
        print('save result to: {}'.format(params['result_save_path']))
def train(params):
    # GPU configuration
    config_gpu()

    # Load the vocabulary
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model
    print("Building the model ...")
    model = Seq2Seq(params, vocab)

    # Create the checkpoint manager
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, params['checkpoint_dir'], max_to_keep=5)

    # Train the model
    train_model(model, vocab, params, checkpoint_manager)
def train(params):
    # GPU configuration (for PyTorch, just select the device)
    # config_gpu()
    device = torch.device('cuda:0' if torch.cuda.is_available() and params['device'] == 'cuda' else 'cpu')

    # Load the vocabulary
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model
    print("Building the model ...")
    model = Seq2Seq(params, vocab).to(device)

    # Checkpoint manager (TensorFlow version, kept for reference)
    # checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    # checkpoint_manager = tf.train.CheckpointManager(checkpoint, params['checkpoint_dir'], max_to_keep=5)

    # Train the model
    train_model(model, vocab, params, device)
def test(params):
    assert params["mode"].lower() in ["test", "eval"], "change training mode to 'test' or 'eval'"
    assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"

    # GPU configuration
    config_gpu()

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Building the model ...")
    model = Seq2Seq(params, vocab)

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, seq2seq_checkpoint_dir, max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    # checkpoint.restore('../../data/checkpoints/training_checkpoints_seq2seq/ckpt-6')
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    if params['greedy_decode']:
        print('Using greedy search for decoding ...')
        predict_result(model, params, vocab)
    else:
        print('Using beam search for decoding ...')
        b = beam_test_batch_generator(params["beam_size"])
        results = []
        for batch in b:
            best_hyp = beam_decode(model, batch, vocab, params)
            results.append(best_hyp.abstract)
        get_rouge(results)
        print('save result to: {}'.format(params['result_save_path']))
def test(params):
    assert params["mode"].lower() in ["test", "eval"], "change training mode to 'test' or 'eval'"
    assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"

    # GPU configuration
    config_gpu(use_cpu=True)

    print("Building the model ...")
    model = Seq2Seq(params)

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)

    # Restore the most recently saved checkpoint
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    if params['greedy_decode']:
        # Greedy decoding
        predict_result(model, params, vocab, params['result_save_path'])
    else:
        # Beam-search decoding
        b = beam_test_batch_generator(params["beam_size"])
        results = []
        for batch in b:
            best_hyp = beam_decode(model, batch, vocab, params)
            results.append(best_hyp.abstract)
        save_predict_result(results, params['result_save_path'])
        print('save result to: {}'.format(params['result_save_path']))
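# `config_gpu` is called throughout these scripts but not shown here. Below is a
# plausible sketch, assuming it only toggles CPU-only execution and TF GPU memory
# settings; the keyword arguments mirror the calls in this file (config_gpu(),
# config_gpu(use_cpu=True), config_gpu(use_cpu=True, gpu_memory=...)), and everything
# else is an assumption, not the project's confirmed implementation.
def config_gpu(use_cpu=False, gpu_memory=None):
    if use_cpu:
        # Hide all GPUs so TensorFlow falls back to the CPU.
        tf.config.set_visible_devices([], 'GPU')
        return
    for gpu in tf.config.list_physical_devices('GPU'):
        if gpu_memory:
            # Cap per-GPU memory (in MB) when an explicit limit is given.
            tf.config.set_logical_device_configuration(
                gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=gpu_memory)])
        else:
            # Otherwise just enable on-demand memory growth.
            tf.config.experimental.set_memory_growth(gpu, True)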
def train(params):
    # GPU configuration
    config_gpu()

    # Build the vocabulary
    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model
    print("Building the model ...")
    model = PGN(params)

    print("Creating the batcher ...")
    train_dataset, params['train_steps_per_epoch'] = batcher(vocab, params)
    params["mode"] = 'val'
    val_dataset, params['val_steps_per_epoch'] = batcher(vocab, params)
    params["mode"] = 'train'

    # Create the checkpoint manager
    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, params['checkpoint_dir'], max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
        # NOTE: only the last character of the checkpoint name is used here,
        # so this assumes fewer than 10 saved checkpoints.
        params["trained_epoch"] = int(checkpoint_manager.latest_checkpoint[-1])
    else:
        print("Initializing from scratch.")
        params["trained_epoch"] = 1

    # Learning-rate decay based on the number of epochs already trained
    params["learning_rate"] *= np.power(0.95, params["trained_epoch"])
    print('learning_rate:{}'.format(params["learning_rate"]))

    # Train the model
    print("Starting the training ...")
    train_model(model, train_dataset, val_dataset, params, checkpoint_manager)
        predictions.append(pred)

    # return shape == (batch_size, sen_len - 1, vocab_size)
    return torch.stack(predictions, 1), dec_hidden


if __name__ == '__main__':
    # GPU configuration (not needed for PyTorch; just select a device)
    # config_gpu()
    # Parse parameters
    params = get_params()

    # Select the training device
    device = torch.device('cuda:0' if torch.cuda.is_available() and params['device'] == 'cuda' else 'cpu')

    # Load the vocabulary; its size (vocab.count) is used below
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    input_sequence_len = 200
    params = {
        "vocab_size": vocab.count,
        # "embed_size": 500,
        "enc_units": 512,
        "attn_units": 512,
        "dec_units": 512,
        "batch_size": 128,
        "input_sequence_len": input_sequence_len
    }
    model = Seq2Seq(params, vocab).to(device)
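    # A quick smoke test of the freshly built model. This is a hedged sketch: the
    # forward signature model(enc_input, dec_target) and the returned
    # (predictions, dec_hidden) pair are assumptions read off the fragment above,
    # not the project's confirmed interface; dec_len=40 is an arbitrary choice.
    enc_input = torch.zeros((params["batch_size"], params["input_sequence_len"]), dtype=torch.long).to(device)
    dec_target = torch.zeros((params["batch_size"], 40), dtype=torch.long).to(device)
    with torch.no_grad():
        predictions, dec_hidden = model(enc_input, dec_target)
    # Expected shape: (batch_size, dec_len - 1, vocab_size)
    print(predictions.shape)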
def generate_dataset_cache(train_df, test_df, wv_model):
    # 8. Separate the features (X) from the labels (Report)
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    train_df['X'].to_csv(config.train_x_seg_path, index=None, header=False)
    train_df['Report'].to_csv(config.train_y_seg_path, index=None, header=False)

    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'].to_csv(config.val_x_seg_path, index=None, header=False)
    test_df['Report'].to_csv(config.val_y_seg_path, index=None, header=False)

    # 9. Add start/end tokens, replace unknown words with OOV markers, and pad to a fixed length
    # Use the vocab produced by the Gensim training run
    # (the 'vocab' attribute was removed from KeyedVectors in Gensim 4.0.0; use key_to_index instead)
    vocab = wv_model.wv.key_to_index

    # Training-set X: find a suitable maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Test-set X: pad to the same maximum length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Training-set Y: find a suitable maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, vocab))

    test_y_max_len = get_max_len(test_df['Report'])
    test_df['Y'] = test_df['Report'].apply(lambda x: pad_proc(x, test_y_max_len, vocab))

    # 10. Save the padded/OOV-processed data and labels
    train_df['X'].to_csv(config.train_x_pad_path, index=False, header=False)
    train_df['Y'].to_csv(config.train_y_pad_path, index=False, header=False)
    test_df['X'].to_csv(config.test_x_pad_path, index=False, header=False)
    test_df['Y'].to_csv(config.test_y_pad_path, index=False, header=False)
    # print('train_x_max_len:{} ,train_y_max_len:{}'.format(X_max_len, train_y_max_len))

    # 11. (Optional) retrain the word vectors on the padded data
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)

    # Save the word-vector model
    if not os.path.exists(os.path.dirname(config.save_wv_model_path)):
        os.makedirs(os.path.dirname(config.save_wv_model_path))
    wv_model.save(config.save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.key_to_index))

    # 12. Rebuild vocab and reverse_vocab from the word-vector model
    # ('index2word' was replaced by 'index_to_key' in Gensim 4.0.0)
    vocab = {word: index for index, word in enumerate(wv_model.wv.index_to_key)}
    reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index_to_key)}

    # Save the dictionaries
    save_dict(config.vocab_path, vocab)
    save_dict(config.reverse_vocab_path, reverse_vocab)

    # 13. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(config.embedding_matrix_path, embedding_matrix)

    # 14. Convert words to indices, e.g. [<START> 方向机 重 ...] -> [2, 403, 986, 246, 231, ...]
    vocab = Vocab()
    train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_ids_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))
    test_ids_y = test_df['Y'].apply(lambda x: transform_data(x, vocab))

    # 15. Convert the index lists to numpy arrays,
    # e.g. [2, 403, 986, 246, 231] -> array([[2, 403, 986, 246, 231]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())
    test_Y = np.array(test_ids_y.tolist())

    # Save the datasets
    np.save(config.train_x_path, train_X)
    np.save(config.train_y_path, train_Y)
    np.save(config.test_x_path, test_X)
    np.save(config.test_y_path, test_Y)
    return test_X, test_Y, train_X, train_Y
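# `get_max_len` and `pad_proc` are assumed to live in the project's utility module.
# A minimal sketch of what they might look like, given how they are called above
# (pad_proc(text, max_len, vocab) on whitespace-tokenised sentences); the
# mean + 2*std length heuristic and the <START>/<STOP>/<UNK>/<PAD> marker strings
# are assumptions, not confirmed project choices.
def get_max_len(data):
    """Pick a padding length from the token-count distribution (mean + 2 * std)."""
    token_counts = data.apply(lambda x: x.count(' ') + 1)
    return int(np.mean(token_counts) + 2 * np.std(token_counts))


def pad_proc(sentence, max_len, vocab):
    """Truncate to max_len, map OOV words to <UNK>, wrap with <START>/<STOP>, pad with <PAD>."""
    words = sentence.strip().split(' ')[:max_len]
    words = [word if word in vocab else '<UNK>' for word in words]
    tokens = ['<START>'] + words + ['<STOP>']
    tokens = tokens + ['<PAD>'] * (max_len + 2 - len(tokens))
    return ' '.join(tokens)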
    final_dists = [vocab_dist + copy_dist
                   for (vocab_dist, copy_dist) in zip(vocab_dists_extended, attn_dists_projected)]
    return final_dists


if __name__ == '__main__':
    # GPU configuration
    config_gpu()

    # Load the vocabulary and compute its size
    vocab = Vocab(vocab_path)
    vocab_size = vocab.count

    # Embedding matrix pre-trained with Gensim
    embedding_matrix = load_embedding_matrix()

    params = defaultdict()
    params["vocab_size"] = vocab_size
    params["embed_size"] = 300
    params["enc_units"] = 512
    params["attn_units"] = 20
    params["dec_units"] = 1024
    params["batch_size"] = 64
    params["max_enc_len"] = 200
    params["max_dec_len"] = 41
    params["max_vocab_size"] = 32000
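# For context on the fragment returning `final_dists`: in a pointer-generator network
# the two lists it sums are usually built as in the sketch below (the standard recipe
# from See et al., "Get To The Point", 2017). This is a hedged illustration, not
# necessarily this project's exact code; `p_gens`, `enc_batch_extend_vocab` and
# `batch_oov_len` (a plain int) are assumed to come from the decoder and the batcher.
def calc_final_dist_sketch(vocab_dists, attn_dists, p_gens,
                           enc_batch_extend_vocab, batch_oov_len,
                           vocab_size, batch_size):
    # Weight the generation and copy distributions by p_gen and (1 - p_gen).
    vocab_dists = [p_gen * dist for (p_gen, dist) in zip(p_gens, vocab_dists)]
    attn_dists = [(1 - p_gen) * dist for (p_gen, dist) in zip(p_gens, attn_dists)]

    # Extend the vocab distribution with zero slots for the in-article OOV words.
    extra_zeros = tf.zeros((batch_size, batch_oov_len))
    vocab_dists_extended = [tf.concat([dist, extra_zeros], axis=-1) for dist in vocab_dists]

    # Scatter the attention weights onto the extended vocabulary using the source-token
    # ids, so copied words contribute probability mass at their vocabulary positions.
    batch_nums = tf.expand_dims(tf.range(0, limit=batch_size), 1)
    attn_len = tf.shape(enc_batch_extend_vocab)[1]
    batch_nums = tf.tile(batch_nums, [1, attn_len])
    indices = tf.stack((batch_nums, enc_batch_extend_vocab), axis=2)
    shape = [batch_size, vocab_size + batch_oov_len]
    attn_dists_projected = [tf.scatter_nd(indices, dist, shape) for dist in attn_dists]

    # Element-wise sum of the two distributions, as in the fragment above.
    return [v + c for (v, c) in zip(vocab_dists_extended, attn_dists_projected)]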
def build_dataset(train_data_path, test_data_path):
    """Load and preprocess the data.

    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: train_X, train_Y, test_X, test_Y as numpy arrays
    """
    # 1. Load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(len(train_df), len(test_df)))

    # 2. Drop rows with an empty Report and fill the remaining NaNs
    train_df.dropna(subset=['Report'], inplace=True)
    test_df.dropna(subset=['Report'], inplace=True)
    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)

    # 3. Preprocess the data in parallel batches
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)

    # 4. Merge the training and test sets into one corpus
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))

    # 5. Save the processed training and test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)
    train_df.to_csv(config.train_seg_path, index=False, header=False)
    test_df.to_csv(config.test_seg_path, index=False, header=False)

    # 6. Save the merged corpus
    merged_df.to_csv(config.merger_seg_path, index=False, header=False)

    # 7. Train the word vectors
    # (this uses the Gensim 3.x API; in Gensim 4.x 'size' became 'vector_size' and 'iter' became 'epochs')
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(config.merger_seg_path),
                        size=config.embedding_dim,
                        sg=1,
                        workers=cores,
                        iter=config.wv_train_epochs,
                        window=5,
                        min_count=5)

    # 8. Separate the features (X) from the labels (Report)
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    # 9. Add start/end tokens, replace unknown words with OOV markers, and pad to a fixed length
    # Use the vocab produced by the Gensim training run
    vocab = wv_model.wv.vocab

    # Training-set X: find a suitable maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Test-set X: pad to the same maximum length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Training-set Y: find a suitable maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, vocab))

    test_y_max_len = get_max_len(test_df['Report'])
    test_df['Y'] = test_df['Report'].apply(lambda x: pad_proc(x, test_y_max_len, vocab))

    # 10. Save the padded/OOV-processed data and labels
    train_df['X'].to_csv(config.train_x_pad_path, index=False, header=False)
    train_df['Y'].to_csv(config.train_y_pad_path, index=False, header=False)
    test_df['X'].to_csv(config.test_x_pad_path, index=False, header=False)
    test_df['Y'].to_csv(config.test_y_pad_path, index=False, header=False)
    # print('train_x_max_len:{} ,train_y_max_len:{}'.format(X_max_len, train_y_max_len))

    # 11. (Optional) retrain the word vectors on the padded data
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)

    # Save the word-vector model
    if not os.path.exists(os.path.dirname(config.save_wv_model_path)):
        os.makedirs(os.path.dirname(config.save_wv_model_path))
    wv_model.save(config.save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

    # 12. Rebuild vocab and reverse_vocab from the word-vector model
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index2word)}

    # Save the dictionaries
    save_dict(config.vocab_path, vocab)
    save_dict(config.reverse_vocab_path, reverse_vocab)

    # 13. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(config.embedding_matrix_path, embedding_matrix)

    # 14. Convert words to indices, e.g. [<START> 方向机 重 ...] -> [2, 403, 986, 246, 231, ...]
    vocab = Vocab()
    train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_ids_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))
    test_ids_y = test_df['Y'].apply(lambda x: transform_data(x, vocab))

    # 15. Convert the index lists to numpy arrays,
    # e.g. [2, 403, 986, 246, 231] -> array([[2, 403, 986, 246, 231]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())
    test_Y = np.array(test_ids_y.tolist())

    # Save the datasets
    np.save(config.train_x_path, train_X)
    np.save(config.train_y_path, train_Y)
    np.save(config.test_x_path, test_X)
    np.save(config.test_y_path, test_Y)
    return train_X, train_Y, test_X, test_Y
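# `transform_data` and `save_dict` are assumed to come from the project's utility
# module. A minimal sketch of plausible implementations, matching how they are used
# above; the Vocab interface (`word2id` plus an '<UNK>' entry) is an assumption.
def transform_data(sentence, vocab):
    """Map a space-separated, already padded sentence to a list of word ids."""
    unk_id = vocab.word2id['<UNK>']
    return [vocab.word2id.get(word, unk_id) for word in sentence.split(' ')]


def save_dict(save_path, dict_data):
    """Write a {key: value} dict to disk as tab-separated lines."""
    with open(save_path, 'w', encoding='utf-8') as f:
        for k, v in dict_data.items():
            f.write('{}\t{}\n'.format(k, v))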
                                             enc_output)
        dec_input = torch.unsqueeze(dec_target[:, t], 1)
        predictions.append(pred)

    return torch.stack(predictions, 1), dec_hidden


if __name__ == '__main__':
    # GPU configuration (not needed for PyTorch)
    # config_gpu()
    # Parse parameters
    params = get_params()

    # Load the vocabulary and compute its size
    vocab = Vocab(params['vocab_path'], params['vocab_size'])
    vocab_size = vocab.count

    input_seq_len = 200
    params = {
        'vocab_size': vocab_size,
        'embed_size': 500,
        'hidden_dim': 512,
        'batch_size': 128,
        'input_sequence_len': input_seq_len
    }
    model = Seq2Seq(params=params, vocab=vocab)