class Decoder(nn.Module):
    """ The Decoder consists of 6 stacked DecoderLayer modules. """

    def __init__(self, target_vocab_num, max_seq_len, pad_idx=0):
        """
        :param target_vocab_num: vocabulary size of the target sequences
        :param max_seq_len: maximum length over all target sequences
        :param pad_idx: index of the padding token, 0 by default
        """
        super(Decoder, self).__init__()
        self.word_embedding = nn.Embedding(target_vocab_num, config.d_model)  # word embedding layer, M*D
        self.pos_encoding = PositionalEncoding(max_seq_len + 1, config.d_model, pad_idx)  # positional encoding layer, (M+1)*D
        self.decoder_layers = nn.ModuleList(
            [DecoderLayer() for _ in range(config.layers)])  # stack of n DecoderLayer
        self.mask_obj = Mask()  # mask helper
        self.tool = Tools()     # utility helper

    def forward(self, tgt_seq, src_seq, enc_out):
        """
        :param tgt_seq: batch of target sequences, B*L
        :param src_seq: batch of source sequences, B*L
        :param enc_out: encoder output, B*L*(d*h=d_model)
        :return: decoder output, B*L*(d*h)
        """
        tgt_pos = self.tool.seq_2_pos(tgt_seq)  # position indices of the target batch, B*L
        no_pad_mask = self.mask_obj.no_padding_mask(tgt_seq)  # mask over the target padding positions, B*L*1
        pad_mask = self.mask_obj.padding_mask(tgt_seq, tgt_seq)  # attention mask for the padding positions, B*L*L
        seq_mask = self.mask_obj.sequence_mask(tgt_seq)  # subsequent mask (upper triangle), B*L*L
        pad_seq_mask = (pad_mask + seq_mask).gt(0)  # the decoder combines both masks, B*L*L
        # Mask for the second (encoder-decoder) multi-head attention sub-layer, B * tgt_L * src_L
        dec_enc_mask = self.mask_obj.padding_mask(src_seq, tgt_seq)
        # The first decoder input is word embedding + positional encoding, B*L*(h*d=d_model)
        dec_in = self.word_embedding(tgt_seq) + self.pos_encoding(tgt_pos)
        dec_out = None
        for decoder_layer in self.decoder_layers:  # run the layers in sequence
            dec_out = decoder_layer(dec_in, enc_out, no_pad_mask, pad_seq_mask, dec_enc_mask)
            dec_in = dec_out  # the output of one layer is the input of the next
        return dec_out
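# A minimal sketch of the subsequent (upper-triangular) mask that
# Mask.sequence_mask presumably builds for pad_seq_mask above. The Mask class
# itself is not part of this excerpt, so this is an assumption about its
# behaviour, not the repository's actual implementation.
import torch

def sequence_mask_sketch(seq):
    """ seq: B*L index tensor -> B*L*L mask where 1 blocks attention to future positions. """
    batch_size, seq_len = seq.size()
    mask = torch.triu(
        torch.ones((seq_len, seq_len), dtype=torch.uint8, device=seq.device),
        diagonal=1)  # strict upper triangle: position i must not attend to j > i
    return mask.unsqueeze(0).expand(batch_size, -1, -1)  # B*L*L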
class Encoder(nn.Module):
    """ The Encoder consists of 6 stacked EncoderLayer modules. """

    def __init__(self, input_vocab_num, max_seq_len, pad_idx=0):
        """
        :param input_vocab_num: vocabulary size of the source sequences
        :param max_seq_len: maximum length of the source sequences
        :param pad_idx: index of the padding token, 0 by default
        """
        super(Encoder, self).__init__()
        self.word_embedding = nn.Embedding(input_vocab_num, config.d_model,
                                           padding_idx=pad_idx)  # word embedding layer, N*D
        self.pos_encoding = PositionalEncoding(max_seq_len + 1, config.d_model, pad_idx)  # positional encoding layer, (N+1)*D
        self.encoder_layers = nn.ModuleList(
            [EncoderLayer() for _ in range(config.layers)])  # stack of n EncoderLayer
        self.pad_obj = Mask()  # mask helper
        self.tool = Tools()    # utility helper

    def forward(self, src_seq):
        """
        :param src_seq: batch of source sequences, B*L
        :return: output of the last EncoderLayer, B*L*d_model
        """
        src_pos = self.tool.seq_2_pos(src_seq)  # position indices of the source batch, B*L
        # The first encoder input is word embedding + positional encoding, B*L*D
        enc_in = self.word_embedding(src_seq) + self.pos_encoding(src_pos)
        pad_mask = self.pad_obj.padding_mask(src_seq, src_seq)  # attention mask for the padding positions, B*L*L
        no_pad_mask = self.pad_obj.no_padding_mask(src_seq)  # mask over the source padding positions, B*L*1
        enc_out = None
        for encoder_layer in self.encoder_layers:  # run the layers in sequence
            enc_out = encoder_layer(enc_in, pad_mask, no_pad_mask)
            enc_in = enc_out  # the output of one layer is the input of the next
        return enc_out
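# A minimal sketch of the sinusoidal table that PositionalEncoding presumably
# precomputes: the standard formulation PE(pos, 2i) = sin(pos / 10000^(2i/d)),
# PE(pos, 2i+1) = cos(pos / 10000^(2i/d)). The repository's own
# PositionalEncoding class is not shown here, so its exact signature and
# internals are assumptions.
import numpy as np
import torch
import torch.nn as nn

def sinusoid_table_sketch(n_positions, d_model, pad_idx=0):
    table = np.array([
        [pos / np.power(10000, 2 * (i // 2) / d_model) for i in range(d_model)]
        for pos in range(n_positions)])
    table[:, 0::2] = np.sin(table[:, 0::2])  # even dimensions use sine
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd dimensions use cosine
    table[pad_idx] = 0.0  # the padding position gets an all-zero vector
    return nn.Embedding.from_pretrained(torch.FloatTensor(table), freeze=True)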
def main():
    print('\n=== Starting inference test ===\n\n')

    def index_2_word(lang, seq):
        """ Convert indices back to words, dropping SOS/EOS/PAD. """
        seq = [int(idx.detach()) for idx in seq]
        new_seq = [i for i in seq if i not in (config.sos, config.eos, config.pad)]
        return [lang['index2word'][i] for i in new_seq]

    # Command-line arguments
    parser = argparse.ArgumentParser(description='Inference test for the Transformer model.')
    parser.add_argument('-save_model', default='./results/best_model.chkpt',
                        help='path of the trained model')
    parser.add_argument('-save_data', default='./results/words_data.pt',
                        help='path of the saved train/validation vocabulary data')
    parser.add_argument('-infer_data_input', default='./data/test.en',
                        help='path of the source test set')
    parser.add_argument('-infer_data_target', default='./data/test.de',
                        help='path of the target test set')
    parser.add_argument('-pre_target', default='./results/input_target_infer.txt',
                        help='path where the predictions are written')
    parser.add_argument('-infer_batch_size', default=32, type=int,
                        help='batch size during inference')
    parser.add_argument('-beam_search_size', default=5, type=int,
                        help='beam width for beam search')
    parser.add_argument('-infer_n_best', default=1, type=int,
                        help='emit the top-n sentences found by beam search')
    args = parser.parse_args()
    config.args_2_variable_infer(args)  # override the default config values

    # Load the vocabulary information saved during training/validation
    words_data = torch.load(args.save_data)
    source_lang = words_data['src_lang']
    target_lang = words_data['tgt_lang']
    data_obj = words_data['data_obj']

    checkpoint = torch.load(
        args.save_model,
        map_location='cuda' if torch.cuda.is_available() else 'cpu')
    transformer = Transformer(input_vocab_num=source_lang['n_words'],
                              target_vocab_num=target_lang['n_words'],
                              src_max_len=data_obj['src_max_len'],
                              tgt_max_len=data_obj['tgt_max_len'])
    transformer.load_state_dict(checkpoint['model'])
    print('Loaded the pretrained model parameters!')

    infer = Translator(model=transformer, tgt_max_len=data_obj['tgt_max_len'])  # inference model

    data_obj = DataProcess()  # the name is reused here for the data processing helper
    *_, src_tgt_seq = data_obj.word_2_index(args.infer_data_input,
                                            args.infer_data_target,
                                            source_lang, target_lang)  # test data
    # Batch the data
    data_loader = DataLoader(dataset=src_tgt_seq,
                             batch_size=args.infer_batch_size,
                             shuffle=True,
                             drop_last=False)

    with open(args.pre_target, 'w', encoding='utf-8') as f:
        for batch_dat in tqdm(data_loader, desc='Inferring...', leave=True):  # iterate over the batches
            src_seq, tgt_seq = Tools().batch_2_tensor(batch_dat)  # source and gold target sequences
            src_pos = Tools().seq_2_pos(src_seq)  # position indices of the source batch
            all_pre_seq, all_pre_seq_p = infer.translate_batch(src_seq, src_pos)  # all predictions and their probabilities
            for index, pre_seq in enumerate(all_pre_seq):
                src_word_seq = index_2_word(source_lang, src_seq[index])  # clean and detokenise the source
                tgt_word_seq = index_2_word(target_lang, tgt_seq[index])  # clean and detokenise the target
                for seq in pre_seq:  # clean and detokenise each prediction
                    new_seq = [i for i in seq if i not in (config.sos, config.eos, config.pad)]
                    pre_word_seq = [target_lang['index2word'][idx] for idx in new_seq]
                    f.write('input ->: ' + ' '.join(src_word_seq) + '\n')      # write the source sequence
                    f.write('-> predicted: ' + ' '.join(pre_word_seq) + '\n')  # write the predicted sequence
                    f.write('== target: ' + ' '.join(tgt_word_seq) + '\n\n')   # write the gold sequence
    print('Inference finished!')
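# Example invocation, assuming this file is saved as infer.py (the script name
# is an assumption; the flags match the argparse definitions above):
#
#   python infer.py -save_model ./results/best_model.chkpt \
#                   -save_data ./results/words_data.pt \
#                   -infer_data_input ./data/test.en \
#                   -infer_data_target ./data/test.de \
#                   -beam_search_size 5 -infer_n_best 1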
class Sequence2Sequence():
    def __init__(self, transformer, optimizer, criterion):
        self.transformer = transformer  # the Transformer model
        self.optimizer = optimizer      # optimizer
        self.criterion = criterion      # loss function
        self.tool = Tools()             # utility helper
        self.val_acc_all = [0]          # accuracy of every validation run
        self.save_checkpoint = 0        # number of times the best checkpoint was updated
        self.val_all_batch = 0
        self.train_all_batch = 0
        self.train_loss = []
        self.val_loss = []
        self.train_ppl = []
        self.val_ppl = []
        self.train_acc = []
        self.val_acc = []

        if config.visual and config.use_visdom:  # Visdom visualisation
            # loss plots
            viz_loss = Visdom(env='Transformer_Train_Val_Loss')
            self.viz_loss = viz_loss
            t_loss_x, t_loss_y = 0, 0
            self.win_loss_1 = viz_loss.line(np.array([t_loss_x]), np.array([t_loss_y]),
                                            opts=dict(title='Training loss', legend=['train'],
                                                      markers=True, markersize=5))
            v_loss_x, v_loss_y = 0, 0
            self.win_loss_2 = viz_loss.line(np.array([v_loss_x]), np.array([v_loss_y]),
                                            opts=dict(title='Validation loss', legend=['val'],
                                                      markers=True, markersize=5))
            # PPL plots
            viz_ppl = Visdom(env='Transformer_Train_Val_PPL')
            self.viz_ppl = viz_ppl
            t_ppl_x, t_ppl_y = 0, 0
            self.win_ppl_1 = viz_ppl.line(np.array([t_ppl_x]), np.array([t_ppl_y]),
                                          opts=dict(title='Training PPL', legend=['train'],
                                                    markers=True, markersize=5))
            v_ppl_x, v_ppl_y = 0, 0
            self.win_ppl_2 = viz_ppl.line(np.array([v_ppl_x]), np.array([v_ppl_y]),
                                          opts=dict(title='Validation PPL', legend=['val'],
                                                    markers=True, markersize=5))
            # ACC plots
            viz_acc = Visdom(env='Transformer_Train_Val_ACC')
            self.viz_acc = viz_acc
            t_acc_x, t_acc_y = 0, 0
            self.win_acc_1 = viz_acc.line(np.array([t_acc_x]), np.array([t_acc_y]),
                                          opts=dict(title='Training ACC',  # the original title said 'Validation ACC', a copy-paste slip
                                                    legend=['train'],
                                                    markers=True, markersize=5))
            v_acc_x, v_acc_y = 0, 0
            self.win_acc_2 = viz_acc.line(np.array([v_acc_x]), np.array([v_acc_y]),
                                          opts=dict(title='Validation ACC', legend=['val'],
                                                    markers=True, markersize=5))

    # training & validation
    def train_val(self, train_loader, val_loader):
        print('\n=== Starting training & validation ===\n')
        train_log = None
        val_log = None
        if config.log:  # write logs to file
            train_log = open((os.path.abspath('.') + '/results/train.log'), 'w', encoding='utf-8')
            train_log.write('epoch: {epoch:3.0f}, '
                            'step: {step:3.0f}, '
                            'total steps: {total_step:7.0f}, '
                            'batch size: {batch_size:3.0f}, '
                            'loss: {loss:10.6f}, '
                            'learning rate: {lr:15.10f}\n\n'.format(
                                epoch=0, step=0, total_step=0,
                                batch_size=0, loss=0, lr=0))
            val_log = open((os.path.abspath('.') + '/results/val.log'), 'w', encoding='utf-8')
            val_log.write('epoch: {epoch:3.0f}, '
                          'batch size: {batch_size:3.0f}, '
                          'loss: {loss:10.6f}\n\n'.format(epoch=0, batch_size=0, loss=0))

        for epoch in range(config.epochs):  # train epoch by epoch
            print('[training epoch {}]'.format(epoch))
            step = 0                  # batch counter within this epoch
            each_batch_loss = []      # loss of every batch in this epoch
            each_batch_lr = []        # learning rate of every batch in this epoch
            self.transformer.train()  # switch the model to training mode
            total_loss = 0            # summed training loss over this epoch
            total_word_num = 0        # number of words seen in this epoch
            total_word_correct = 0    # number of correctly predicted words in this epoch (originally initialised to 1, which inflates accuracy)
            epoch_start_time = datetime.datetime.now()

            for batch_data in tqdm(train_loader, mininterval=1,
                                   desc='Training...', leave=False):  # iterate over the batches
                self.train_all_batch += 1   # global batch counter
                step += 1                   # per-epoch batch counter
                self.optimizer.zero_grad()  # clear the gradients
                src_seq, tgt_seq = self.tool.batch_2_tensor(batch_data)  # source and target batches, B*L each
                pre_tgt = self.transformer(src_seq, tgt_seq)  # model prediction
                real_tgt = tgt_seq[:, 1:].contiguous().view(-1)  # gold target tokens (shifted by one)
                loss, correct = self.criterion.cal_loss(real_tgt, pre_tgt)
                loss.backward(retain_graph=True)  # backpropagate, keeping the graph alive
                learn_rate = self.optimizer.step_update_lrate()  # update the learning rate and step the optimizer
                total_loss += loss.item()  # accumulate the loss
                each_batch_loss.append(loss.detach())  # record this batch's loss
                each_batch_lr.append(learn_rate)
                non_pad_mask = real_tgt.ne(config.pad)
                word_num = non_pad_mask.sum().item()
                total_word_num += word_num
                total_word_correct += correct

                # write the training log
                if train_log:
                    train_log.write('epoch: {epoch:3.0f}, '
                                    'step: {step:3.0f}, '
                                    'total steps: {total_step:7.0f}, '
                                    'batch size: {batch_size:3.0f}, '
                                    'loss: {loss:10.6f}, '
                                    'learning rate: {lr:15.10f}\n'.format(
                                        epoch=epoch,
                                        step=step,
                                        total_step=self.train_all_batch,
                                        batch_size=len(batch_data[0]),
                                        loss=loss.detach(),
                                        lr=learn_rate))
                if config.visual and config.use_visdom:
                    self.viz_loss.line(np.array([loss.cpu().detach()]),
                                       np.array([self.train_all_batch]),
                                       win=self.win_loss_1,
                                       update='append')
                self.train_loss.append(loss.cpu().detach())

            loss_per_word = total_loss / total_word_num      # average loss per word
            ppl = math.exp(min(loss_per_word, 100))          # perplexity, lower is better
            acc = 100 * total_word_correct / total_word_num  # word-level accuracy in percent, higher is better
            epoch_end_time = datetime.datetime.now()
            self.train_ppl.append(ppl)
            self.train_acc.append(acc)
            if config.visual and config.use_visdom:
                self.viz_ppl.line(np.array([ppl]), np.array([epoch]),
                                  win=self.win_ppl_1, update='append')
                self.viz_acc.line(np.array([acc]), np.array([epoch]),
                                  win=self.win_acc_1, update='append')
            print('steps %4.0f' % step,
                  '| total steps %8.0f' % self.train_all_batch,
                  '| batch size %3.0f' % config.batch_size,
                  '| elapsed', epoch_end_time - epoch_start_time)
            print('train',
                  '| PPL (lower is better) %10.6f' % ppl,
                  '| ACC (higher is better) %10.5f' % acc,
                  '| first batch loss %10.5f' % each_batch_loss[0],
                  '| last full batch loss %10.5f' % each_batch_loss[-2])  # the final batch may be smaller than batch_size, so it is skipped
            if train_log:  # the original wrote unconditionally, crashing when logging is off
                train_log.write('PPL: {PPL:10.5f}, ACC: {ACC:10.5f}\n\n'.format(PPL=ppl, ACC=acc))
            self.evaluate(val_loader, epoch, self.train_all_batch, val_log)  # validate once per training epoch

        pic.train_loss(self.train_loss)
        pic.val_loss(self.val_loss)
        pic.ppl(self.train_ppl, self.val_ppl)
        pic.acc(self.train_acc, self.val_acc)
        # TODO save the final model when training ends
        print('Training & validation finished!')

    # validate the model at the end of every training epoch
    def evaluate(self, data_loader, epoch, batch, val_log):
        self.transformer.eval()  # switch the model to evaluation mode
        total_loss = 0           # summed validation loss of this run
        total_word_num = 0       # number of words seen in this run
        total_word_correct = 0   # number of correctly predicted words in this run
        with torch.no_grad():    # validation must not update the model
            for batch_data in tqdm(data_loader, mininterval=1,
                                   desc='Validating...', leave=False):  # iterate over the batches
                src_seq, tgt_seq = self.tool.batch_2_tensor(batch_data)  # source and validation target batches
                pre_tgt = self.transformer(src_seq, tgt_seq)  # model prediction
                real_tgt = tgt_seq[:, 1:].contiguous().view(-1)  # gold target tokens
                loss, correct = self.criterion.cal_loss(real_tgt, pre_tgt)
                total_loss += loss.item()  # accumulate the loss
                self.val_all_batch += 1    # global validation batch counter
                non_pad_mask = real_tgt.ne(config.pad)
                word_num = non_pad_mask.sum().item()
                total_word_num += word_num
                total_word_correct += correct
                if val_log:
                    val_log.write('epoch: {epoch:3.0f}, '
                                  'batch size: {batch_size:3.0f}, '
                                  'loss: {loss:10.6f}\n'.format(
                                      epoch=epoch,
                                      batch_size=len(batch_data[0]),
                                      loss=loss.detach()))
                if config.visual and config.use_visdom:  # Visdom visualisation
                    self.viz_loss.line(np.array([loss.cpu().detach()]),
                                       np.array([self.val_all_batch]),
                                       win=self.win_loss_2,
                                       update='append')
                self.val_loss.append(loss.cpu().detach())

        loss_per_word = total_loss / total_word_num      # average loss per word
        ppl = math.exp(min(loss_per_word, 100))          # perplexity, lower is better
        acc = 100 * total_word_correct / total_word_num  # word-level accuracy in percent, higher is better
        self.val_acc_all.append(acc)  # record the accuracy of every validation run
        self.val_ppl.append(ppl)
        self.val_acc.append(acc)
        if config.visual and config.use_visdom:
            self.viz_ppl.line(np.array([ppl]), np.array([epoch]),
                              win=self.win_ppl_2, update='append')
            self.viz_acc.line(np.array([acc]), np.array([epoch]),
                              win=self.win_acc_2, update='append')
        print('val',
              '| PPL (lower is better) %10.5f' % ppl,
              '| ACC (higher is better) %10.5f' % acc)
        if val_log:
            val_log.write('PPL: {PPL:10.5f}, ACC: {ACC:10.5f}\n\n'.format(PPL=ppl, ACC=acc))

        # model checkpointing
        if config.save_trained_model:
            model_state_dict = self.transformer.state_dict()  # trained model state
            checkpoint = {  # checkpoint metadata
                'model': model_state_dict,
                'settings': config,
                'epoch': epoch,
                'total batch': batch
            }
            if config.save_trained_model_type == 'all':
                # acc is already in percent; the original multiplied by 100 again here
                model_name = os.path.abspath('.') + '/all' + '_acc_{acc:3.3f}.chkpt'.format(acc=acc)
                torch.save(checkpoint, model_name)
            elif config.save_trained_model_type == 'best':
                model_name = os.path.abspath('.') + '/results/best_model.chkpt'
                if acc >= max(self.val_acc_all):
                    torch.save(checkpoint, model_name)
                    self.save_checkpoint += 1
                    print('Best checkpoint updated ({} times so far)!'.format(self.save_checkpoint))

    # batch inference
    def infer(self, data_loader, source_lang, target_lang, tgt_max_len):
        print('\n=== Starting inference test ===\n\n')

        def index_2_word(lang, seq):
            """ Convert indices back to words, dropping SOS/EOS/PAD. """
            seq = [int(idx.detach()) for idx in seq]
            new_seq = [i for i in seq if i not in (config.sos, config.eos, config.pad)]
            if type(lang) != dict:
                return [lang.index2word[i] for i in new_seq]
            return [lang['index2word'][i] for i in new_seq]

        if config.save_trained_model:  # a pretrained checkpoint is available
            # load the vocabulary information saved during training/validation
            words_data = torch.load(config.save_data)
            source_lang = words_data['src_lang']
            target_lang = words_data['tgt_lang']
            data_obj = words_data['data_obj']
            checkpoint = torch.load(
                config.checkpoint,
                map_location='cuda' if torch.cuda.is_available() else 'cpu')
            transformer = Transformer(input_vocab_num=source_lang['n_words'],
                                      target_vocab_num=target_lang['n_words'],
                                      src_max_len=data_obj['src_max_len'],
                                      tgt_max_len=data_obj['tgt_max_len'])
            transformer.load_state_dict(checkpoint['model'])
            self.transformer = transformer
            print('Loaded the pretrained model parameters!')

        infer = Translator(self.transformer, tgt_max_len)  # batch inference helper
        with open((os.path.abspath('.') + '/results/input_target_infer.txt'), 'w', encoding='utf-8') as f:
            for batch_dat in tqdm(data_loader, desc='Inferring...', leave=False):  # iterate over the batches
                src_seq, tgt_seq = self.tool.batch_2_tensor(batch_dat)  # source and gold target sequences
                src_pos = self.tool.seq_2_pos(src_seq)  # position indices of the source batch
                all_pre_seq, all_pre_seq_p = infer.translate_batch(src_seq, src_pos)  # all predictions and their probabilities
                for index, pre_seq in enumerate(all_pre_seq):
                    src_word_seq = index_2_word(source_lang, src_seq[index])  # clean and detokenise the source
                    tgt_word_seq = index_2_word(target_lang, tgt_seq[index])  # clean and detokenise the target
                    for seq in pre_seq:  # clean and detokenise each prediction
                        new_seq = [i for i in seq if i not in (config.sos, config.eos, config.pad)]
                        if type(target_lang) != dict:
                            pre_word_seq = [target_lang.index2word[idx] for idx in new_seq]
                        else:
                            pre_word_seq = [target_lang['index2word'][idx] for idx in new_seq]
                        f.write('input ->: ' + ' '.join(src_word_seq) + '\n')      # write the source sequence
                        f.write('-> predicted: ' + ' '.join(pre_word_seq) + '\n')  # write the predicted sequence
                        f.write('== target: ' + ' '.join(tgt_word_seq) + '\n\n')   # write the gold sequence
        print('Inference finished!')
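# A minimal sketch of what criterion.cal_loss above plausibly computes: summed
# token-level cross-entropy that ignores padding, plus the number of correctly
# predicted tokens. reduction='sum' matches the per-word averaging done in
# train_val (total_loss / total_word_num). The repository's real criterion is
# not shown and may differ (e.g. it could apply label smoothing), so treat this
# as an interface illustration only.
import torch
import torch.nn.functional as F

def cal_loss_sketch(real_tgt, pre_tgt, pad_idx=0):
    """ real_tgt: (B*L,) gold token indices; pre_tgt: (B*L, V) logits. """
    loss = F.cross_entropy(pre_tgt, real_tgt, ignore_index=pad_idx, reduction='sum')
    pred = pre_tgt.max(dim=1)[1]  # greedy token prediction
    non_pad = real_tgt.ne(pad_idx)
    correct = pred.eq(real_tgt).masked_select(non_pad).sum().item()
    return loss, correct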
def main():
    """
    Special test (validation) mode for generating Concept sequences from Diagnosis
    sequences: with one fixed input sequence and several different target sequences,
    compute the loss between each predicted sequence and its target sequence.
    :param candidate_k: number of candidate concepts
    :param train: generate losses on the train data to prepare inputs for the binary ensemble classifier
    :param val: generate losses on the val data to prepare inputs for the binary ensemble classifier
    :return: writes out a loss file
    """
    print('Dataset path:', config.data_path)
    print('Output path:', config.save_path)
    word_data = torch.load(config.save_data)  # load the saved source/target token info of the train/val data
    src_lang = word_data['src_lang']  # source token info
    tgt_lang = word_data['tgt_lang']  # target token info
    data_obj = word_data['data_obj']  # data object
    checkpoint = torch.load(  # load the saved training checkpoint
        config.save_model_checkpoint,
        map_location=lambda storage, loc: storage.cuda(0))
    transformer = Transformer(  # build the Transformer model
        input_vocab_num=src_lang['n_words'],
        target_vocab_num=tgt_lang['n_words'],
        src_max_len=data_obj['src_max_len'],
        tgt_max_len=data_obj['tgt_max_len']).cuda()
    transformer.load_state_dict(checkpoint['model'])  # load the trained parameters
    transformer.eval()  # evaluation mode, the parameters stay fixed
    print('Loaded the pretrained model parameters!\n')

    data_object = DataProcess()  # data processing helper
    *_, src_tgt_seq = data_object.word_2_index(config.infer_input_k,
                                               config.infer_target_k,
                                               src_lang, tgt_lang)  # test data
    data_loader = DataLoader(
        dataset=src_tgt_seq,
        batch_size=config.k,  # may also be 2*k; it does not affect the result
        shuffle=False,
        drop_last=False)  # batch the data

    all_batch_loss = []  # the loss of every item in every batch
    with torch.no_grad():  # test (validate) without updating gradients
        for candidate_k_data in tqdm(data_loader, mininterval=1, ncols=1,
                                     desc='Special testing', leave=True):
            src_seq, tgt_seq = Tools().batch_2_tensor(candidate_k_data)  # the k candidate source/target pairs
            _, pre_seq = transformer.forward(src_seq, tgt_seq)  # model predictions
            tgt_seq = tgt_seq[:, 1:]  # gold target sequence
            assert pre_seq.size()[0] == tgt_seq.size()[0], \
                'Prediction and target counts differ, cannot compute the loss!'
            for i in range(pre_seq.size()[0]):  # compute the loss of every item in this batch
                seq_len = 0
                for j in tgt_seq[i]:
                    if j != config.pad:
                        seq_len += 1  # actual length of the current target sequence
                loss = F.cross_entropy(  # loss between the current prediction and its target
                    pre_seq[i],
                    tgt_seq[i],
                    ignore_index=config.pad,
                    reduction='mean')  # 'elementwise_mean' in older PyTorch, deprecated
                loss = loss.detach().cpu().tolist()
                # if config.loss_cal == 'sum':  # with sum-mode loss, divide by the actual target length
                #     loss /= seq_len
                # FIXME 'mean' works far better than 'sum' here; find out why
                all_batch_loss.append(round(loss, 7))

    infer_target_k = open(config.infer_target_k, 'r', encoding='utf-8').readlines()
    assert len(all_batch_loss) == len(infer_target_k), \
        'The number of losses and the number of target lines differ!!!'
    with open(config.result_loss, 'w', encoding='utf-8') as f:  # write out the per-sequence loss file
        for idx, i in enumerate(all_batch_loss):
            if idx != len(all_batch_loss) - 1:
                f.write(str(i) + '\t' + infer_target_k[idx].split('\n')[0])
                f.write('\n')
            else:
                f.write(str(i) + '\t' + infer_target_k[idx].split('\n')[0])
    end_time = time.time()
    take_time = end_time - start_time
    per_item_time = round(take_time / (len(all_batch_loss) / config.k), 7)
    print('Took {} in total, {} per item'.format(take_time, per_item_time))
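# Toy illustration (made-up numbers) of how the per-candidate losses written by
# this script are meant to be used downstream: among the k candidate targets for
# one fixed input, the candidate with the lowest cross-entropy is the best fit.
losses = [0.91, 0.27, 1.40, 0.33]  # one loss per candidate target sequence
best = min(range(len(losses)), key=losses.__getitem__)
print('best candidate index:', best)  # -> 1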
def main(candidate_k):
    show_keyword(config, candidate_k)
    print('=== Starting inference test ===\n')

    def index_2_word(lang, seq):
        """ Convert indices back to words, dropping SOS/EOS/PAD. """
        seq = [int(idx.detach()) for idx in seq]
        new_seq = [i for i in seq if i not in (config.sos, config.eos, config.pad)]
        return [lang['index2word'][i] for i in new_seq]

    # Load the vocabulary information saved during training/validation
    words_data = torch.load(config.save_data)  # saved source/target token info
    source_lang = words_data['src_lang']  # source token info
    target_lang = words_data['tgt_lang']  # target token info
    data_obj = words_data['data_obj']
    checkpoint = torch.load(
        config.save_model_checkpoint,
        map_location='cuda' if torch.cuda.is_available() else 'cpu')
    transformer = Transformer(  # build the Transformer model
        input_vocab_num=source_lang['n_words'],
        target_vocab_num=target_lang['n_words'],
        src_max_len=data_obj['src_max_len'],
        tgt_max_len=data_obj['tgt_max_len'])
    transformer.load_state_dict(checkpoint['model'])  # load the pretrained parameters
    transformer.eval()  # evaluation mode, the parameters stay fixed
    print('Loaded the pretrained model parameters!\n')

    infer = Translator(model=transformer, tgt_max_len=data_obj['tgt_max_len'])  # inference model
    data_obj = DataProcess()  # the name is reused here for the data processing helper
    *_, src_tgt_seq = data_obj.word_2_index(
        config.test_input, config.test_target, source_lang, target_lang,
        father=config.infer_father if config.enc_father else None)  # test data
    # Batch the data
    data_loader = DataLoader(dataset=src_tgt_seq,
                             batch_size=candidate_k,
                             shuffle=False,
                             drop_last=False)

    all_sent_gene_p = []  # beam generation probability of every sentence
    all_sent_scores, all_sent_scores_sf = [], []  # scores of every generated sentence
    with open(config.infer_result, 'w', encoding='utf-8') as f:
        for batch_data in tqdm(data_loader, ncols=1, desc='Inferring...', leave=True):  # iterate over the batches
            if config.has_father:  # encode the father concept
                src_seq, tgt_seq, father_seq, none_seq = Tools().batch_2_tensor(
                    batch_data, source_lang['word2index']['none'])
                src_pos = Tools().seq_2_pos(src_seq)  # position indices of the source batch
                father_pos = Tools().seq_2_pos(father_seq)  # position indices of the father-concept batch
                batch_pre_seq, batch_sent_scores, batch_sent_scores_sf, batch_gene_p = \
                    infer.translate_batch(src_seq, src_pos,
                                          father_seq=father_seq,
                                          father_pos=father_pos,
                                          none_mask=none_seq)  # predictions and their probabilities
            else:  # no father concept
                src_seq, tgt_seq = Tools().batch_2_tensor(batch_data)  # source and gold target sequences
                src_pos = Tools().seq_2_pos(src_seq)  # position indices of the source batch
                batch_pre_seq, batch_sent_scores, batch_sent_scores_sf, batch_gene_p = \
                    infer.translate_batch(src_seq, src_pos)  # predictions and their probabilities
            batch_sent_scores = [round(i.cpu().item(), 3) for i in batch_sent_scores]
            batch_sent_scores_sf = [round(i.cpu().item(), 3) for i in batch_sent_scores_sf]
            all_sent_gene_p += batch_gene_p
            all_sent_scores += batch_sent_scores
            all_sent_scores_sf += batch_sent_scores_sf

            pre_word_seq = None  # predicted word sequence
            for index, pre_seq in enumerate(batch_pre_seq):
                src_word_seq = index_2_word(source_lang, src_seq[index])  # clean and detokenise the source
                tgt_word_seq = index_2_word(target_lang, tgt_seq[index])  # clean and detokenise the target
                for seq in pre_seq:  # clean and detokenise each prediction
                    new_seq = [i for i in seq if i not in (config.sos, config.eos, config.pad)]
                    pre_word_seq = [target_lang['index2word'][idx] for idx in new_seq]
                    f.write('input ->: ' + ' '.join(src_word_seq) + '\n')      # write the source sequence
                    f.write('-> predicted: ' + ' '.join(pre_word_seq) + '\n')  # write the predicted sequence
                    f.write('== target: ' + ' '.join(tgt_word_seq) + '\n\n')   # write the gold sequence

    with open(config.generate_pro, 'w', encoding='utf-8') as gene_f:  # write the generation probability of every sentence
        for idx, p in enumerate(all_sent_scores_sf):
            gene_f.write(str(idx))
            gene_f.write(' ')
            gene_f.write(str(p))
            gene_f.write('\n')
    print('Inference finished!')
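# Reading the per-sentence scores back, for reference: each line written above
# is "<sentence index> <softmax-normalised score>". The path literal below is
# illustrative; in the script it comes from config.generate_pro.
with open('./results/generate_pro.txt', encoding='utf-8') as gene_f:
    scores = {int(i): float(p) for i, p in (line.split() for line in gene_f if line.strip())}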