def train_batch(batch: Batch, model: Seq2Seq, criterion, optimizer, *, pack_seq=True,
                forcing_ratio=0.5, partial_forcing=True, sample=False, rl_ratio: float = 0,
                vocab=None, grad_norm: float = 0, show_cover_loss=False):
    if not pack_seq:
        input_lengths = None
    else:
        input_lengths = batch.input_lengths  # true lengths, so PAD positions can be masked out
    mask = create_mask(input_lengths)

    optimizer.zero_grad()
    input_tensor = batch.input_tensor.to(DEVICE)
    target_tensor = batch.target_tensor.to(DEVICE)
    mask = mask.to(DEVICE)
    ext_vocab_size = batch.ext_vocab_size

    # teacher-forced forward pass with the maximum-likelihood (cross-entropy) loss
    out = model(input_tensor, target_tensor, input_lengths, criterion,
                forcing_ratio=forcing_ratio, partial_forcing=partial_forcing, sample=sample,
                ext_vocab_size=ext_vocab_size, include_cover_loss=show_cover_loss, mask=mask)

    if rl_ratio > 0:
        assert vocab is not None
        # sampled decoding (exploration)
        sample_out = model(input_tensor, saved_out=out, criterion=criterion, sample=True,
                           ext_vocab_size=ext_vocab_size, mask=mask)
        # greedy decoding (self-critical baseline)
        baseline_out = model(input_tensor, saved_out=out, visualize=False,
                             ext_vocab_size=ext_vocab_size, mask=mask)
        scores = eval_batch_output([ex.tgt for ex in batch.examples], vocab, batch.oov_dict,
                                   sample_out.decoded_tokens, baseline_out.decoded_tokens)
        greedy_rouge = scores[1]['l_f']
        neg_reward = greedy_rouge - scores[0]['l_f']  # greedy ROUGE-L minus sample ROUGE-L
        # if the sample beats the baseline, the reward is positive (i.e. good exploration)
        # and rl_loss is negative
        # TODO: the RL loss computation may still have issues
        rl_loss = neg_reward * sample_out.loss
        rl_loss_value = neg_reward * sample_out.loss_value
        # mix the RL loss with the cross-entropy loss
        loss = (1 - rl_ratio) * out.loss + rl_ratio * rl_loss
        loss_value = (1 - rl_ratio) * out.loss_value + rl_ratio * rl_loss_value
    else:
        loss = out.loss
        loss_value = out.loss_value
        greedy_rouge = None

    loss.backward()  # backpropagation
    if grad_norm > 0:  # gradient clipping
        clip_grad_norm_(model.parameters(), grad_norm)
    optimizer.step()  # parameter update

    target_length = target_tensor.size(0)
    # total batch loss divided by the number of decoding steps, i.e. an average per step
    return loss_value / target_length, greedy_rouge
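

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original training script):
# shows how train_batch() might be driven from an epoch loop. The model, vocab
# and batch iterator are passed in as arguments because their construction
# depends on project code not shown here; the optimizer, criterion, and
# hyperparameter values below are assumptions, not the project's defaults.
# ---------------------------------------------------------------------------
def example_training_loop(model: Seq2Seq, vocab, train_batches, n_epochs: int = 1):
    import torch  # assumed available; the rest of this file already depends on PyTorch
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # a guess at the token-level loss; use whatever criterion Seq2Seq.forward expects
    # (index 0 is assumed to be the PAD id)
    criterion = torch.nn.NLLLoss(ignore_index=0, reduction='none')
    for epoch in range(n_epochs):
        epoch_loss, n_batches = 0.0, 0
        for b in train_batches:  # an iterable yielding Batch objects (rebuilt per epoch in real code)
            avg_loss, greedy_rouge = train_batch(
                b, model, criterion, optimizer,
                forcing_ratio=0.75,   # probability of teacher forcing per decoding step
                rl_ratio=0.5,         # weight of the self-critical RL loss
                vocab=vocab,          # required whenever rl_ratio > 0
                grad_norm=2.0)        # clip gradients to this L2 norm
            epoch_loss += avg_loss
            n_batches += 1
        print("epoch %d: average per-step loss %.4f" % (epoch, epoch_loss / max(n_batches, 1)))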