def train_one_batch(self, batch):
    enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
        get_input_from_batch(batch)
    dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
        get_output_from_batch(batch)

    self.optimizer.zero_grad()

    # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
    encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
        enc_batch, enc_lens)
    s_t_1 = self.model.reduce_state(
        encoder_hidden)  # (h, c) = ([1, B, hid_dim], [1, B, hid_dim])

    step_losses = []
    for di in range(min(max_dec_len, config.max_dec_steps)):
        y_t_1 = dec_batch[:, di]  # one summary token per example: the word id at the same position in every sentence of the batch
        # print("y_t_1:", y_t_1, y_t_1.size())
        final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
            y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
            c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
        target = target_batch[:, di]  # id of the next summary word (the prediction target)
        # print("target-iter:", target, target.size())
        # print("final_dist:", final_dist, final_dist.size())
        # input("go on>>")
        # final_dist holds a probability for every word in the extended vocabulary,
        # i.e. a vocabulary larger than the preset 50_000
        gold_probs = torch.gather(
            final_dist, 1, target.unsqueeze(1)).squeeze()  # probability assigned to the gold word
        step_loss = -torch.log(
            gold_probs + config.eps)  # maximizing gold_probs == minimizing step_loss (hence the minus sign)
        if config.is_coverage:
            step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
            step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
            coverage = next_coverage
        step_mask = dec_padding_mask[:, di]
        step_loss = step_loss * step_mask
        step_losses.append(step_loss)

    sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
    batch_avg_loss = sum_losses / dec_lens_var
    loss = torch.mean(batch_avg_loss)

    loss.backward()

    self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
    clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
    clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

    self.optimizer.step()

    return loss.item()
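If torch.gather looks opaque, here is a standalone toy sketch (made-up numbers, not from the repo) of how the gold-token probability and the per-step negative log-likelihood are obtained:

import torch

final_dist = torch.tensor([[0.1, 0.6, 0.3],
                           [0.2, 0.2, 0.6]])   # [B=2, extended_vocab=3]
target = torch.tensor([1, 2])                  # gold token ids, [B]
# pick, for each row, the probability at the gold token's column
gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()  # tensor([0.6, 0.6])
step_loss = -torch.log(gold_probs + 1e-12)     # a small eps keeps log() finite for zero probabilities
print(step_loss)                               # tensor([0.5108, 0.5108])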
def eval_one_batch(self, batch):
    enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
        get_input_from_batch(batch)
    dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
        get_output_from_batch(batch)

    encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
        enc_batch, enc_lens)
    s_t_1 = self.model.reduce_state(encoder_hidden)

    step_losses = []
    for di in range(min(max_dec_len, config.max_dec_steps)):
        y_t_1 = dec_batch[:, di]  # Teacher forcing
        final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
            y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
            c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
        target = target_batch[:, di]
        gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
        step_loss = -torch.log(gold_probs + config.eps)
        if config.is_coverage:
            step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
            step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
            coverage = next_coverage
        step_mask = dec_padding_mask[:, di]
        step_loss = step_loss * step_mask
        step_losses.append(step_loss)

    sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
    batch_avg_loss = sum_step_losses / dec_lens_var
    loss = torch.mean(batch_avg_loss)

    return loss.item()
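The coverage penalty used in both loops can also be checked in isolation. A toy sketch with assumed shapes (batch of 1, source length 3), not repo code:

import torch

attn_dist = torch.tensor([[0.7, 0.2, 0.1]])    # attention at the current step, [B=1, src_len=3]
coverage  = torch.tensor([[0.9, 0.05, 0.05]])  # sum of attentions from all previous steps, [B, src_len]
# positions that were already heavily attended and are attended again are penalised
step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
print(step_coverage_loss)                      # tensor([0.8000]) -> high, position 0 is being re-attended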
def train_one_batch(self, batch):
    # enc_batch is the sequence that still contains UNK ids
    # c_t_1 is the initial context vector
    # extra_zeros: probability slots for OOV words, [batch_size, batch.max_art_oovs]
    enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
        get_input_from_batch(batch)
    # dec_batch is the plain summary sequence and still contains UNK; target_batch is the target word
    # sequence without UNK - OOV words are replaced by len(vocab) + their relative OOV position
    dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
        get_output_from_batch(batch)

    self.optimizer.zero_grad()

    # [batch, max(seq_lens), 2*hid_dim], [batch*max(seq_lens), 2*hid_dim], [2, batch, hid_dim]
    encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
        enc_batch, enc_lens)

    # (h, c) = ([1, batch, hid_dim], [1, batch, hid_dim])
    # the encoder hidden state is bidirectional, [2, batch, hid_dim]; it has to be reduced to
    # [1, batch, hid_dim] so it can serve as the decoder's initial hidden state
    s_t_1 = self.model.reduce_state(encoder_hidden)  # (h, c)

    step_losses = []
    for di in range(min(max_dec_len, config.max_dec_steps)):
        # one summary token: the word id at the same position in every sentence of the batch
        y_t_1 = dec_batch[:, di]
        # final_dist holds a probability for every word in the extended vocabulary,
        # i.e. a vocabulary larger than the preset vocab size
        final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
            y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
            c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)

        # id of the next summary word (the prediction target), [B]
        target = target_batch[:, di]
        # [B, 1]
        target_i = target.unsqueeze(1)
        # pick out of final_dist the probability assigned to the gold word at each position
        gold_probs = torch.gather(final_dist, 1, target_i).squeeze()
        # print(gold_probs)
        # if gold_probs <= 0:
        #     print('*******loss less than 0 ***********')
        #     gold_probs = 1e-2
        #     print('pro has been modified', gold_probs)
        #     print('\n')

        # prediction loss for a single word
        # abs() added as a safeguard
        step_loss = -torch.log(torch.abs(gold_probs) + 1e-8)
        # print('')
        if config.is_coverage:
            # sum the element-wise min of the step-t attention and the accumulated attention of the
            # previous t-1 steps; this extra coverage loss suppresses repeated generation.
            # It pushes the step-t attention attn_dist to stay below the accumulated attention
            # (a large accumulated value means the word has probably been generated before).
            step_coverage_loss = torch.sum(
                torch.min(torch.abs(attn_dist), torch.abs(coverage)), 1)
            # print('step_coverage_loss is ', step_coverage_loss)
            # the lambda weight controls how strongly this repetition-suppressing coverage loss is applied
            step_loss = step_loss + config.cov_loss_wt * torch.abs(
                step_coverage_loss)
            # the coverage vector is updated to the accumulated attention
            coverage = next_coverage

        # masked (padding) positions do not contribute to the loss
        step_mask = dec_padding_mask[:, di]
        step_loss = torch.abs(step_loss) * torch.abs(step_mask)
        step_losses.append(step_loss)

    sum_losses = torch.abs(torch.sum(torch.stack(step_losses, 1), 1))
    # print('sum_losses is ', sum_losses)
    # overall loss of each sequence
    # print('dec_lens_var is ', dec_lens_var)
    batch_avg_loss = sum_losses / (torch.abs(dec_lens_var) + 1)
    # overall loss of the whole batch
    loss = torch.mean(batch_avg_loss)
    # print('loss from one_batch is ', loss)

    loss.backward()

    # self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
    # clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
    # clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

    self.optimizer.step()

    return loss.item()
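The final reduction — stack the per-step losses, normalize each example by its summary length, then average over the batch — is easy to verify on made-up numbers. A minimal sketch (hypothetical values; the second example's second step is 0 because it was masked out):

import torch

step_losses = [torch.tensor([2.0, 1.0]),   # losses at decoding step 0 for a batch of 2
               torch.tensor([1.0, 0.0])]   # losses at decoding step 1 (second example already padded)
dec_lens_var = torch.tensor([2.0, 1.0])    # true (unpadded) summary lengths

sum_losses = torch.sum(torch.stack(step_losses, 1), 1)  # tensor([3., 1.])
batch_avg_loss = sum_losses / dec_lens_var              # per-token loss per example: tensor([1.5, 1.0])
loss = torch.mean(batch_avg_loss)                       # scalar used for backward(): tensor(1.2500)
print(loss)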
def beam_search(self, batch):
    # batch should have only one example
    enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
        get_input_from_batch(batch)

    encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
    s_t_0 = self.model.reduce_state(encoder_hidden)

    dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
    dec_h = dec_h.squeeze()
    dec_c = dec_c.squeeze()

    # decoder batch preparation: it holds beam_size hypotheses; initially every hypothesis is the same
    beams = [Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                  log_probs=[0.0],
                  state=(dec_h[0], dec_c[0]),
                  context=c_t_0[0],
                  coverage=(coverage_t_0[0] if config.is_coverage else None))
             for _ in range(config.beam_size)]
    results = []
    steps = 0
    while steps < config.max_dec_steps and len(results) < config.beam_size:
        latest_tokens = [h.latest_token for h in beams]
        latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN)
                         for t in latest_tokens]
        y_t_1 = Variable(torch.LongTensor(latest_tokens))
        if USE_CUDA:
            y_t_1 = y_t_1.to(DEVICE)

        all_state_h = []
        all_state_c = []
        all_context = []
        for h in beams:
            state_h, state_c = h.state
            all_state_h.append(state_h)
            all_state_c.append(state_c)
            all_context.append(h.context)

        s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0))
        c_t_1 = torch.stack(all_context, 0)

        coverage_t_1 = None
        if config.is_coverage:
            all_coverage = []
            for h in beams:
                all_coverage.append(h.coverage)
            coverage_t_1 = torch.stack(all_coverage, 0)

        final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
            y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
            c_t_1, extra_zeros, enc_batch_extend_vocab, coverage_t_1, steps)
        log_probs = torch.log(final_dist)
        topk_log_probs, topk_ids = torch.topk(log_probs, config.beam_size * 2)

        dec_h, dec_c = s_t
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        all_beams = []
        num_orig_beams = 1 if steps == 0 else len(beams)
        for i in range(num_orig_beams):
            h = beams[i]
            state_i = (dec_h[i], dec_c[i])
            context_i = c_t[i]
            coverage_i = (coverage_t[i] if config.is_coverage else None)

            for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                new_beam = h.extend(token=topk_ids[i, j].item(),
                                    log_prob=topk_log_probs[i, j].item(),
                                    state=state_i,
                                    context=context_i,
                                    coverage=coverage_i)
                all_beams.append(new_beam)

        beams = []
        for h in self.sort_beams(all_beams):
            if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                if steps >= config.min_dec_steps:
                    results.append(h)
            else:
                beams.append(h)
            if len(beams) == config.beam_size or len(results) == config.beam_size:
                break

        steps += 1

    if len(results) == 0:
        results = beams

    beams_sorted = self.sort_beams(results)

    return beams_sorted[0]
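The Beam class and sort_beams are referenced above but not shown. The sketch below is a plausible minimal version modeled on the original pointer-generator implementation; it is an assumption, not necessarily identical to the author's code (and in the trainer, sort_beams would be a method rather than a free function):

# Hypothetical minimal Beam hypothesis container (assumed, not repo code).
class Beam(object):
    def __init__(self, tokens, log_probs, state, context, coverage):
        self.tokens = tokens          # token ids generated so far
        self.log_probs = log_probs    # per-token log probabilities
        self.state = state            # decoder (h, c) for this hypothesis
        self.context = context        # last context vector c_t
        self.coverage = coverage      # accumulated attention (coverage mode only)

    def extend(self, token, log_prob, state, context, coverage):
        # Grow the hypothesis by one token; decoder state, context and coverage are carried forward.
        return Beam(tokens=self.tokens + [token],
                    log_probs=self.log_probs + [log_prob],
                    state=state, context=context, coverage=coverage)

    @property
    def latest_token(self):
        return self.tokens[-1]

    @property
    def avg_log_prob(self):
        # Length-normalised score so longer hypotheses are not unfairly penalised.
        return sum(self.log_probs) / len(self.tokens)

def sort_beams(beams):
    # Best (highest average log-probability) hypothesis first.
    return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)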
def train_one_batch(self, batch): """ 训练一个batch,返回该batch的loss。 enc_batch: torch.Size([16, 400]), 16篇文章的编码,不足400词的用pad的编码补足, oov词汇用0编码; enc_padding_mask: torch.Size([16, 400]), 对应pad的位置为0,其余为1; enc_lens: numpy.ndarray, 列表内每个元素表示每篇article的单词数; enc_batch_extend_vocab:torch.Size([16, 400]), 16篇文章的编码;oov词汇用超过词汇表的编码; extra_zeros: torch.Size([16, 文章oov词汇数量]) zero tensor; c_t_1: torch.Size([16, 512]) zero tensor; coverage: Variable(torch.zeros(batch_size, max_enc_seq_len)) if is_coverage==True else None;coverage模式时后续有值 ---------------------------------------- dec_batch: torch.Size([16, 100]) 摘要编码含有开始符号编码以及PAD; dec_padding_mask: torch.Size([16, 100]) 对应pad的位置为0,其余为1; max_dec_len: 标量,摘要词语数量,不包含pad dec_lens_var: torch.Size([16] 摘要词汇数量 target_batch: torch.Size([16, 100]) 目标摘要编码含有STOP符号编码以及PAD """ enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \ get_input_from_batch(batch) dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \ get_output_from_batch(batch) self.optimizer.zero_grad() """ # 记得修改Batch类添加vocab属性 print("模型输入文章编码:", "*"*100) print("enc_batch:", enc_batch, enc_batch.size()) print("enc_batch[-1]:", enc_batch[-1]) # print("batch._id_to_word:", batch.vocab._id_to_word) print("enc_batch[-1]原文:", [batch.vocab.id2word(idx) for idx in enc_batch[-1].cpu().numpy()]) print("-"*50) print("enc_padding_mask:", enc_padding_mask, enc_padding_mask.size()) print("-"*50) print("enc_lens:", enc_lens, enc_lens.shape) print("-"*50) print("enc_batch_extend_vocab", enc_batch_extend_vocab, enc_batch_extend_vocab.size()) print("enc_batch_extend_vocab[-1]:", enc_batch_extend_vocab[-1]) print("enc_batch_extend_vocab[-1]的原文:", [batch.vocab.id2word(idx) if idx<50000 else '[UNK]+{}'.format(idx-50000) for idx in enc_batch_extend_vocab[-1].cpu().numpy()]) print("-"*50) print("extra_zeros:", extra_zeros, extra_zeros.size()) print("-"*50) print("c_t_1:", c_t_1, c_t_1.size()) print("-"*50) print("coverage:", coverage) print("*"*100) print("模型输入摘要编码,包括源和目标:", "*"*100) print("dec_batch:", dec_batch, dec_batch.size()) print("dec_batch[0]:", dec_batch[0]) # print("batch._id_to_word:", batch.vocab._id_to_word) print("dec_batch[0]原文:", [batch.vocab.id2word(idx) for idx in dec_batch[0].cpu().numpy()]) print("-"*50) print("dec_padding_mask:", dec_padding_mask, dec_padding_mask.size()) print("-"*50) print("max_dec_len:", max_dec_len) print("-"*50) print("dec_lens_var", dec_lens_var, dec_lens_var.size()) print("-"*50) print("target_batch:", target_batch, target_batch.size()) print("-"*50) print("target_batch[0]:", target_batch[0], target_batch[0].size()) print("target_batch[0]的原文:", [batch.vocab.id2word(idx) if idx<50000 else '[UNK]+{}'.format(idx-50000) for idx in target_batch[0].cpu().numpy()]) print("*"*100) input("任意键继续>>>") """ # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim]) encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens) s_t_1 = self.model.reduce_state(encoder_hidden) # (h,c) = ([1, B, hid_dim], [1, B, hid_dim]) step_losses = [] for di in range(min(max_dec_len, config.max_dec_steps)): y_t_1 = dec_batch[:, di] # 摘要的一个单词,batch里的每个句子的同一位置的单词编码 # print("y_t_1:", y_t_1, y_t_1.size()) final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di) target = target_batch[:, di] # 摘要的下一个单词的编码 # print("target-iter:", target, target.size()) # 
print("final_dist:", final_dist, final_dist.size()) # input("go on>>") # final_dist 是词汇表每个单词的概率,词汇表是扩展之后的词汇表,也就是大于预设的50_000 gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze() # 取出目标单词的概率gold_probs step_loss = -torch.log(gold_probs + config.eps) # 最大化gold_probs,也就是最小化step_loss(添加负号) if config.is_coverage: step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1) step_loss = step_loss + config.cov_loss_wt * step_coverage_loss coverage = next_coverage step_mask = dec_padding_mask[:, di] step_loss = step_loss * step_mask step_losses.append(step_loss) sum_losses = torch.sum(torch.stack(step_losses, 1), 1) batch_avg_loss = sum_losses/dec_lens_var loss = torch.mean(batch_avg_loss) loss.backward() self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm) clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm) clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm) self.optimizer.step() return loss.item()