# Standard-library and third-party imports used by the scripts in this listing.
# Project-local names (Model, Optim, SentenceProcessor, DataProcessor,
# prepare_feed_data, train, valid, test, and the parsed `args`/`config` objects)
# are assumed to come from the repository's own modules and are not shown here.
import json
import os
import time

import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter  # or: from tensorboardX import SummaryWriter


def main():
    trainset, validset, testset = [], [], []
    if args.inference:  # at test time, only load the test set
        with open(args.testset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                testset.append(json.loads(line))
        print(f'Loaded {len(testset)} test examples')
    else:  # at training time, load the training and validation sets
        with open(args.trainset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                trainset.append(json.loads(line))
        print(f'Loaded {len(trainset)} training examples')
        with open(args.validset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                validset.append(json.loads(line))
        print(f'Loaded {len(validset)} validation examples')

    vocab, embeds = [], []
    with open(args.embed_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            word = line[:line.find(' ')]
            vec = line[line.find(' ') + 1:].split()
            embed = [float(v) for v in vec]
            assert len(embed) == config.embedding_size  # check the word embedding dimension
            vocab.append(word)
            embeds.append(embed)
    print(f'Loaded vocabulary: {len(vocab)} words')
    print(f'Word embedding size: {config.embedding_size}')

    vads = []
    with open(args.vad_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            vad = line[line.find(' ') + 1:].split()
            vad = [float(item) for item in vad]
            assert len(vad) == config.affect_embedding_size  # check the VAD dimension
            vads.append(vad)
    print(f'Loaded VAD lexicon: {len(vads)} entries')
    print(f'VAD embedding size: {config.affect_embedding_size}')

    # build word2index / index2word utilities from the vocabulary
    sentence_processor = SentenceProcessor(vocab, config.pad_id, config.start_id,
                                           config.end_id, config.unk_id)

    model = Model(config)
    model.print_parameters()  # print the number of model parameters
    epoch = 0  # number of passes over the training set
    global_step = 0  # number of parameter updates

    # load the model
    if os.path.isfile(args.model_path):  # if a checkpoint exists at model_path, load it
        epoch, global_step = model.load_model(args.model_path)
        model.affect_embedding.embedding.weight.requires_grad = False
        print('Model loaded')
        log_dir = os.path.split(args.model_path)[0]
    elif args.inference:  # inference without a trained checkpoint is meaningless
        print('Please test a trained model!')
        return
    else:  # no checkpoint: start training from scratch with pretrained embeddings
        model.embedding.embedding.weight = torch.nn.Parameter(torch.FloatTensor(embeds))
        model.affect_embedding.embedding.weight = torch.nn.Parameter(torch.FloatTensor(vads))
        model.affect_embedding.embedding.weight.requires_grad = False
        print('Model initialized')
        log_dir = os.path.join(args.log_path, 'run' + str(int(time.time())))
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

    if args.gpu:
        model.to('cuda')  # move the model parameters to the GPU

    # set up the optimizer
    optim = Optim(config.method, config.lr, config.lr_decay,
                  config.weight_decay, config.eps, config.max_grad_norm)
    optim.set_parameters(model.parameters())  # hand the model parameters to the optimizer
    optim.update_lr(epoch)  # update the learning rate each epoch

    # training
    if not args.inference:
        summary_writer = SummaryWriter(os.path.join(log_dir, 'summary'))  # directory for tensorboard logs
        dp_train = DataProcessor(trainset, config.batch_size, sentence_processor)  # data iterator
        dp_valid = DataProcessor(validset, config.batch_size, sentence_processor, shuffle=False)

        while epoch < args.max_epoch:  # maximum number of training epochs
            model.train()  # switch to training mode
            for data in dp_train.get_batch_data():
                start_time = time.time()
                feed_data = prepare_feed_data(data)
                rl_loss, reward, loss, nll_loss, kld_loss, kld_weight, ppl = \
                    train(model, feed_data, global_step)
                if args.reinforce:
                    rl_loss.mean().backward()
                else:
                    loss.mean().backward()  # backpropagation
                optim.step()  # update parameters
                optim.optimizer.zero_grad()  # clear gradients
                use_time = time.time() - start_time

                global_step += 1  # parameter update count +1
                if global_step % args.print_per_step == 0:
                    print('epoch: {:d}, global_step: {:d}, lr: {:g}, rl_loss: {:.2f}, reward: {:.2f},'
                          ' nll_loss: {:.2f}, kld_loss: {:.2f}, kld_weight: {:g}, ppl: {:.2f},'
                          ' time: {:.2f}s/step'
                          .format(epoch, global_step, optim.lr, rl_loss.mean().item(),
                                  reward.mean().item(), nll_loss.mean().item(),
                                  kld_loss.mean().item(), kld_weight,
                                  ppl.mean().exp().item(), use_time))
                    summary_writer.add_scalar('train_rl', rl_loss.mean().item(), global_step)
                    summary_writer.add_scalar('train_reward', reward.mean().item(), global_step)
                    summary_writer.add_scalar('train_nll', nll_loss.mean().item(), global_step)
                    summary_writer.add_scalar('train_kld', kld_loss.mean().item(), global_step)
                    summary_writer.add_scalar('train_weight', kld_weight, global_step)
                    summary_writer.add_scalar('train_ppl', ppl.mean().exp().item(), global_step)
                    summary_writer.flush()  # flush the buffer to disk

                if global_step % args.log_per_step == 0 \
                        or (global_step % (2 * config.kl_step) - config.kl_step) == config.kl_step // 2:
                    log_file = os.path.join(log_dir, '{:03d}{:012d}.model'.format(epoch, global_step))
                    model.save_model(epoch, global_step, log_file)

                    model.eval()
                    reward, nll_loss, kld_loss, ppl = valid(model, dp_valid, global_step - 1)
                    model.train()
                    print('Validation reward: {:g}, NLL loss: {:g}, KL loss: {:g}, PPL: {:g}'
                          .format(reward, nll_loss, kld_loss, np.exp(ppl)))
                    summary_writer.add_scalar('valid_reward', reward, global_step)
                    summary_writer.add_scalar('valid_nll', nll_loss, global_step)
                    summary_writer.add_scalar('valid_kld', kld_loss, global_step)
                    summary_writer.add_scalar('valid_ppl', np.exp(ppl), global_step)
                    summary_writer.flush()  # flush the buffer to disk

            epoch += 1  # epoch count +1
            optim.update_lr(epoch)  # adjust the learning rate
            log_file = os.path.join(log_dir, '{:03d}{:012d}.model'.format(epoch, global_step))
            model.save_model(epoch, global_step, log_file)

            model.eval()
            reward, nll_loss, kld_loss, ppl = valid(model, dp_valid, global_step - 1)
            print('Validation reward: {:g}, NLL loss: {:g}, KL loss: {:g}, PPL: {:g}'
                  .format(reward, nll_loss, kld_loss, np.exp(ppl)))
            summary_writer.add_scalar('valid_reward', reward, global_step)
            summary_writer.add_scalar('valid_nll', nll_loss, global_step)
            summary_writer.add_scalar('valid_kld', kld_loss, global_step)
            summary_writer.add_scalar('valid_ppl', np.exp(ppl), global_step)
            summary_writer.flush()  # flush the buffer to disk

        summary_writer.close()
    else:  # testing
        if not os.path.exists(args.result_path):  # create the results directory
            os.makedirs(args.result_path)
        result_file = os.path.join(args.result_path,
                                   '{:03d}{:012d}.txt'.format(epoch, global_step))  # name of the result file
        fw = open(result_file, 'w', encoding='utf8')
        dp_test = DataProcessor(testset, config.batch_size, sentence_processor, shuffle=False)

        model.eval()  # switch to evaluation mode; disables dropout etc.
        reward, nll_loss, kld_loss, ppl = valid(model, dp_test, global_step - 1)
        print('Test reward: {:g}, NLL loss: {:g}, KL loss: {:g}, PPL: {:g}'
              .format(reward, nll_loss, kld_loss, np.exp(ppl)))

        len_results = []  # lengths of the generated results (for averaging)
        for data in dp_test.get_batch_data():
            posts = data['str_posts']
            responses = data['str_responses']
            feed_data = prepare_feed_data(data, inference=True)
            results = test(model, feed_data)  # model output [batch, len_decoder]

            for idx, result in enumerate(results):
                new_data = dict()
                new_data['post'] = posts[idx]
                new_data['response'] = responses[idx]
                new_data['result'] = sentence_processor.index2word(result)  # convert output indices back to words
                len_results.append(len(new_data['result']))
                fw.write(json.dumps(new_data, ensure_ascii=False) + '\n')

        fw.close()
        print(f'Average length of generated sentences: {1.0 * sum(len_results) / len(len_results)}')
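# The script above reads every runtime option from a module-level `args` object
# (and model hyperparameters from `config`). A minimal argparse sketch of the
# flags actually referenced by this script is given below; the flag names match
# the attributes used above, but the types and default values are illustrative
# assumptions, not the repository's real defaults.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--trainset_path', type=str, default='data/trainset.txt')  # assumed default
parser.add_argument('--validset_path', type=str, default='data/validset.txt')  # assumed default
parser.add_argument('--testset_path', type=str, default='data/testset.txt')  # assumed default
parser.add_argument('--embed_path', type=str, default='data/embed.txt')  # word embedding file
parser.add_argument('--vad_path', type=str, default='data/vad.txt')  # VAD lexicon file
parser.add_argument('--model_path', type=str, default='log/model.model')  # checkpoint to load
parser.add_argument('--log_path', type=str, default='log')  # where new runs are logged
parser.add_argument('--result_path', type=str, default='result')  # where test outputs are written
parser.add_argument('--inference', action='store_true')  # test instead of train
parser.add_argument('--reinforce', action='store_true')  # optimize rl_loss instead of loss
parser.add_argument('--gpu', action='store_true')  # move the model to CUDA
parser.add_argument('--max_epoch', type=int, default=30)  # assumed default
parser.add_argument('--print_per_step', type=int, default=100)  # assumed default
parser.add_argument('--log_per_step', type=int, default=10000)  # assumed default
args = parser.parse_args()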
def main():
    # load the datasets
    trainset, validset, testset = [], [], []
    if args.inference:  # at test time, only load the test set
        with open(args.testset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                testset.append(json.loads(line))
        print('Loaded %d test examples' % len(testset))
    else:  # at training time, load the training and validation sets
        with open(args.trainset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                trainset.append(json.loads(line))
        print('Loaded %d training examples' % len(trainset))
        with open(args.validset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                validset.append(json.loads(line))
        print('Loaded %d validation examples' % len(validset))

    # load the vocabulary and word embeddings
    vocab, embeds = [], []
    with open(args.embed_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            word = line[:line.find(' ')]
            vec = line[line.find(' ') + 1:].split()
            embed = [float(v) for v in vec]
            assert len(embed) == config.embedding_size  # check the word embedding dimension
            vocab.append(word)
            embeds.append(embed)
    print('Loaded vocabulary: %d words' % len(vocab))
    print('Word embedding size: %d' % config.embedding_size)

    # build word2index / index2word utilities from the vocabulary
    sentence_processor = SentenceProcessor(vocab, config.pad_id, config.start_id,
                                           config.end_id, config.unk_id)

    # create the model
    model = Model(config)
    epoch = 0  # number of passes over the training set
    global_step = 0  # number of parameter updates

    # load the model
    if os.path.isfile(args.model_path):  # if a checkpoint exists at model_path, load it
        epoch, global_step = model.load_model(args.model_path)
        print('Model loaded')
        log_dir = os.path.split(args.model_path)[0]  # directory for checkpoints
    elif args.inference:  # inference without a trained checkpoint is meaningless
        print('Please test a trained model!')
        return
    else:  # no checkpoint: start training from scratch with pretrained word embeddings
        model.embedding.embedding.weight = torch.nn.Parameter(torch.FloatTensor(embeds))
        print('Model initialized')
        log_dir = os.path.join(args.log_path, 'run' + str(int(time.time())))  # directory for checkpoints
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

    if args.gpu:
        model.to('cuda')  # move the model parameters to the GPU
    model.print_parameters()  # print the number of model parameters

    # set up the optimizer
    optim = Optim(config.method, config.lr, config.lr_decay,
                  config.weight_decay, config.max_grad_norm)
    optim.set_parameters(model.parameters())  # hand the model parameters to the optimizer
    optim.update_lr(epoch)  # update the learning rate each epoch

    # training
    if not args.inference:
        summary_writer = SummaryWriter(os.path.join(log_dir, 'summary'))  # directory for tensorboard logs
        dp_train = DataProcessor(trainset, config.batch_size, sentence_processor)  # data iterator
        dp_valid = DataProcessor(validset, config.batch_size, sentence_processor, shuffle=False)

        while epoch < args.max_epoch:  # maximum number of training epochs
            model.train()  # switch to training mode
            for data in dp_train.get_batch_data():
                start_time = time.time()
                feed_data = prepare_feed_data(data)
                nll_loss, ppl = train(model, feed_data)

                optim.optimizer.zero_grad()  # clear gradients
                nll_loss.mean().backward()  # backpropagation
                optim.step()  # update parameters
                use_time = time.time() - start_time

                global_step += 1  # parameter update count +1
                # log the current state
                if global_step % args.print_per_step == 0:
                    print('epoch: %d, global_step: %d, lr: %g, nll_loss: %.2f, ppl: %.2f, time: %.2fs'
                          % (epoch, global_step, optim.lr, nll_loss.mean().item(),
                             ppl.mean().exp().item(), use_time))
                    summary_writer.add_scalar('train_nll', nll_loss.mean().item(), global_step)
                    summary_writer.add_scalar('train_ppl', ppl.mean().exp().item(), global_step)
                    summary_writer.flush()  # flush the buffer to disk

                if global_step % args.log_per_step == 0:
                    # save the model
                    log_file = os.path.join(log_dir, '%03d%012d.model' % (epoch, global_step))
                    model.save_model(epoch, global_step, log_file)

                    # compute perplexity on the validation set
                    model.eval()
                    nll_loss, ppl = valid(model, dp_valid)
                    model.train()
                    print('Validation NLL loss: %g, perplexity: %g' % (nll_loss, np.exp(ppl)))
                    summary_writer.add_scalar('valid_nll', nll_loss, global_step)
                    summary_writer.add_scalar('valid_ppl', np.exp(ppl), global_step)
                    summary_writer.flush()  # flush the buffer to disk

            epoch += 1  # epoch count +1
            optim.update_lr(epoch)  # adjust the learning rate

            # save the model
            log_file = os.path.join(log_dir, '%03d%012d.model' % (epoch, global_step))
            model.save_model(epoch, global_step, log_file)

            # compute perplexity on the validation set
            model.eval()
            nll_loss, ppl = valid(model, dp_valid)
            print('Validation NLL loss: %g, perplexity: %g' % (nll_loss, np.exp(ppl)))
            summary_writer.add_scalar('valid_nll', nll_loss, global_step)
            summary_writer.add_scalar('valid_ppl', np.exp(ppl), global_step)
            summary_writer.flush()  # flush the buffer to disk

        summary_writer.close()
    else:  # testing
        if not os.path.exists(args.result_path):  # create the results directory
            os.makedirs(args.result_path)
        result_file = os.path.join(args.result_path,
                                   '%03d%012d.txt' % (epoch, global_step))  # name of the result file
        fw = open(result_file, 'w', encoding='utf8')
        dp_test = DataProcessor(testset, config.batch_size, sentence_processor, shuffle=False)

        model.eval()  # switch to evaluation mode; disables dropout etc.
        nll_loss, ppl = valid(model, dp_test)  # evaluate perplexity
        print('Test NLL loss: %g, perplexity: %g' % (nll_loss, np.exp(ppl)))

        len_results = []  # lengths of the generated results (for averaging)
        for data in dp_test.get_batch_data():
            posts = data['str_posts']
            responses = data['str_responses']
            feed_data = prepare_feed_data(data, inference=True)
            results = test(model, feed_data)  # model output [batch, len_decoder]

            for idx, result in enumerate(results):
                new_data = {}
                new_data['post'] = posts[idx]
                new_data['response'] = responses[idx]
                new_data['result'] = sentence_processor.index2word(result)  # convert output indices back to words
                len_results.append(len(new_data['result']))
                fw.write(json.dumps(new_data, ensure_ascii=False) + '\n')

        fw.close()
        print('Average length of generated sentences: %g' % (1.0 * sum(len_results) / len(len_results)))
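# `Optim` is the project's optimizer wrapper; its source is not part of this
# listing. The sketch below is an assumed reconstruction based only on how it is
# called above (constructor, set_parameters, update_lr, step, and the .optimizer
# and .lr attributes), not the repository's actual implementation. It follows the
# five-argument constructor used by this script; the first script additionally
# passes an `eps` argument, which the real class presumably accepts as well.
import torch


class Optim(object):
    def __init__(self, method, lr, lr_decay, weight_decay, max_grad_norm):
        self.method = method                # e.g. 'adam' or 'sgd' (assumed values)
        self.init_lr = lr                   # initial learning rate
        self.lr = lr                        # current learning rate
        self.lr_decay = lr_decay            # per-epoch multiplicative decay
        self.weight_decay = weight_decay    # L2 regularization
        self.max_grad_norm = max_grad_norm  # gradient clipping threshold
        self.params = []
        self.optimizer = None

    def set_parameters(self, parameters):
        # keep only trainable parameters and build the underlying torch optimizer
        self.params = [p for p in parameters if p.requires_grad]
        if self.method == 'adam':
            self.optimizer = torch.optim.Adam(self.params, lr=self.lr,
                                              weight_decay=self.weight_decay)
        else:
            self.optimizer = torch.optim.SGD(self.params, lr=self.lr,
                                             weight_decay=self.weight_decay)

    def update_lr(self, epoch):
        # anneal the learning rate as a function of the epoch count
        self.lr = self.init_lr * (self.lr_decay ** epoch)
        if self.optimizer is not None:
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr

    def step(self):
        # clip gradients, then apply one parameter update
        torch.nn.utils.clip_grad_norm_(self.params, self.max_grad_norm)
        self.optimizer.step()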
def main():
    trainset, validset, testset = [], [], []
    if args.inference:  # at test time, only load the test set
        with open(args.testset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                testset.append(json.loads(line))
        print(f'Loaded {len(testset)} test examples')
    else:  # at training time, load the training and validation sets
        with open(args.trainset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                trainset.append(json.loads(line))
        print(f'Loaded {len(trainset)} training examples')
        with open(args.validset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                validset.append(json.loads(line))
        print(f'Loaded {len(validset)} validation examples')

    vocab, embeds = [], []
    with open(args.embed_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            word = line[:line.find(' ')]
            vec = line[line.find(' ') + 1:].split()
            embed = [float(v) for v in vec]
            assert len(embed) == config.embedding_size  # check the word embedding dimension
            vocab.append(word)
            embeds.append(embed)
    print(f'Loaded vocabulary: {len(vocab)} words')
    print(f'Word embedding size: {config.embedding_size}')

    vads = []
    with open(args.vad_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            vad = line[line.find(' ') + 1:].split()
            vad = [float(item) for item in vad]
            assert len(vad) == config.affect_embedding_size  # check the VAD dimension
            vads.append(vad)
    print(f'Loaded VAD lexicon: {len(vads)} entries')
    print(f'VAD embedding size: {config.affect_embedding_size}')

    # build word2index / index2word utilities from the vocabulary
    sentence_processor = SentenceProcessor(vocab, config.pad_id, config.start_id,
                                           config.end_id, config.unk_id)

    model = Model(config)
    model.print_parameters()  # print the number of model parameters
    epoch = 0  # number of passes over the training set
    global_step = 0  # number of parameter updates

    # load the model
    if os.path.isfile(args.model_path):  # if a checkpoint exists at model_path, load it
        epoch, global_step = model.load_model(args.model_path)
        model.affect_embedding.embedding.weight.requires_grad = False
        print('Model loaded')
        log_dir = os.path.split(args.model_path)[0]
    elif args.inference:  # inference without a trained checkpoint is meaningless
        print('Please test a trained model!')
        return
    else:  # no checkpoint: start training from scratch with pretrained embeddings
        model.embedding.embedding.weight = torch.nn.Parameter(torch.FloatTensor(embeds))
        model.affect_embedding.embedding.weight = torch.nn.Parameter(torch.tensor(vads).float())
        model.affect_embedding.embedding.weight.requires_grad = False
        print('Model initialized')
        log_dir = os.path.join(args.log_path, 'run' + str(int(time.time())))
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

    if args.gpu:
        model.to('cuda')  # move the model parameters to the GPU

    # set up the optimizer
    optim = Optim(config.method, config.lr, config.lr_decay,
                  config.weight_decay, config.max_grad_norm)
    optim.set_parameters(model.parameters())  # hand the model parameters to the optimizer
    optim.update_lr(epoch)  # update the learning rate each epoch

    # training
    if not args.inference:
        dp_train = DataProcessor(trainset, config.batch_size, sentence_processor)  # data iterator
        dp_valid = DataProcessor(validset, config.batch_size, sentence_processor, shuffle=False)

        while epoch < args.max_epoch:  # maximum number of training epochs
            model.train()  # switch to training mode
            for data in dp_train.get_batch_data():
                start_time = time.time()
                feed_data = prepare_feed_data(data)
                nll_loss, precision = train(model, feed_data)

                nll_loss.mean().backward()  # backpropagation
                optim.step()  # update parameters
                optim.optimizer.zero_grad()  # clear gradients
                use_time = time.time() - start_time

                global_step += 1  # parameter update count +1
                if global_step % args.print_per_step == 0:
                    print('epoch: {:d}, global_step: {:d}, lr: {:g}, nll_loss: {:.2f},'
                          ' precision: {:.2%}, time: {:.2f}s/step'
                          .format(epoch, global_step, optim.lr, nll_loss.mean().item(),
                                  precision.mean().item(), use_time))

            epoch += 1  # epoch count +1
            optim.update_lr(epoch)  # adjust the learning rate
            log_file = os.path.join(log_dir, '{:03d}{:012d}.model'.format(epoch, global_step))
            model.save_model(epoch, global_step, log_file)

            model.eval()
            nll_loss, precision = valid(model, dp_valid)
            print('Validation NLL loss: {:g}, accuracy: {:.2%}'.format(nll_loss, precision))
    else:  # testing
        if not os.path.exists(args.result_path):  # create the results directory
            os.makedirs(args.result_path)
        result_file = os.path.join(args.result_path,
                                   '{:03d}{:012d}.txt'.format(epoch, global_step))  # name of the result file
        fw = open(result_file, 'w', encoding='utf8')
        dp_test = DataProcessor(testset, config.batch_size, sentence_processor, shuffle=False)

        model.eval()  # switch to evaluation mode; disables dropout etc.
        nll_loss, precision = valid(model, dp_test)
        print('Test NLL loss: {:g}, accuracy: {:.2%}'.format(nll_loss, precision))

        for data in dp_test.get_batch_data():
            texts = data['str_texts']
            emotions = data['emotions']
            feed_data = prepare_feed_data(data)
            results = test(model, feed_data)  # model output [batch]

            for idx, result in enumerate(results):
                new_data = dict()
                new_data['text'] = texts[idx]
                new_data['emotion'] = emotions[idx]
                new_data['result'] = result  # predicted emotion label for this example
                fw.write(json.dumps(new_data, ensure_ascii=False) + '\n')

        fw.close()
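# `SentenceProcessor` converts between words and vocabulary indices; its source
# is not included in this listing. The sketch below is an assumed minimal
# implementation consistent with how it is constructed above and with the
# index2word() call in the first script; how the pad/start/end/unk ids relate to
# the `vocab` list (in particular whether the special tokens are already part of
# it) is an assumption.
class SentenceProcessor(object):
    def __init__(self, vocab, pad_id, start_id, end_id, unk_id):
        self.vocab = vocab
        self.pad_id = pad_id
        self.start_id = start_id
        self.end_id = end_id
        self.unk_id = unk_id
        self.word2index_dict = {word: idx for idx, word in enumerate(vocab)}

    def word2index(self, sentence):
        # map a list of words to indices, falling back to <unk> for OOV words
        return [self.word2index_dict.get(word, self.unk_id) for word in sentence]

    def index2word(self, indices):
        # map decoder output indices back to words, stopping at the end token
        words = []
        for idx in indices:
            idx = int(idx)
            if idx == self.end_id:
                break
            if idx in (self.pad_id, self.start_id):
                continue
            if 0 <= idx < len(self.vocab):
                words.append(self.vocab[idx])
        return words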