# One train/eval step for the vanilla VAE variant: forward pass, ELBO with a
# linearly annealed KL weight, running totals, and (train only) backward + grad clip.
# NOTE(review): this chunk was flattened onto one line; indentation below is the
# standard training-loop reconstruction — confirm against version control.
enc_inputs = enc_inputs.to(device)
dec_inputs = dec_inputs.to(device)
targets = targets.to(device)
lengths = lengths.to(device)

# forward
logp, z_0, z_T, mu, logvar = model(enc_inputs, dec_inputs, lengths)

# calculate loss
# lengths + 1 presumably accounts for the appended EOS token — verify against NLL().
NLL_loss = NLL(logp, targets, lengths + 1)

# KL loss, estimated by Monte Carlo: E_q[log p(z_T) - log q(z_0|x)] summed over the batch.
log_p_z = log_Normal_standard(z_T, dim=1)
log_q_z = log_Normal_diag(z_0, mu, logvar, dim=1)
KL_loss = torch.sum(-(log_p_z - log_q_z))

# KL weight annealed linearly from 0 to 1 over the first 10 epochs' worth of steps.
KL_weight = linear_anneal(step, len(dataloaders['train']) * 10)
loss = (NLL_loss + KL_weight * KL_loss) / bsize

# cumulate running totals for epoch-level reporting
totals['ELBO'] += loss.item() * bsize
totals['NLL'] += NLL_loss.item()
totals['KL'] += KL_loss.item()
totals['words'] += torch.sum(lengths).item()

# backward and optimize
if split == 'train':
    step += 1
    optimizer.zero_grad()
    loss.backward()
    # clip gradient norm at 5 to stabilise RNN training
    nn.utils.clip_grad_norm_(model.parameters(), 5)
    # NOTE(review): no optimizer.step() is visible in this chunk — it likely
    # follows immediately after; confirm it was not lost in extraction.
# One train/eval step for the auxiliary-loss VAE variant: the model additionally
# returns an analytic `kld` and an `aux_loss`, each weighted by an epoch-based schedule.
# NOTE(review): this chunk was flattened onto one line and is truncated at both ends
# (the enc/dec device moves before it and further accumulation after it are cut off);
# indentation below is a reconstruction — confirm against version control.
targets = targets.to(device)
lengths = lengths.to(device)

# forward
logp, mu, logvar, kld, aux_loss = model(enc_inputs, dec_inputs, lengths)

# calculate loss
# lengths + 1 presumably accounts for the appended EOS token — verify against NLL().
NLL_loss = NLL(logp, targets, lengths + 1)
KL_loss = KL_div(mu, logvar)

# Schedule: KL weight frozen at 0.1 (and aux term off) for the first 5 epochs,
# then annealed from 0.1 upward over the next 5 epochs with beta fixed at 0.5.
if ep < 5:
    KL_weight = 0.1
    beta = 0.
else:
    KL_weight = linear_anneal(step - len(dataloaders['train']) * 5,
                              len(dataloaders['train']) * 5, initial=0.1)
    beta = 0.5

# RNN/auxiliary-KL weight: 0.05 during epoch 0, then annealed over epochs 1-4.
if ep < 1:
    RNN_weight = 0.05
else:
    RNN_weight = linear_anneal(step - len(dataloaders['train']) * 1,
                               len(dataloaders['train']) * 4, initial=0.05)

loss = (NLL_loss + KL_weight * KL_loss + RNN_weight * kld + beta * aux_loss) / bsize

# cumulate running totals for epoch-level reporting
totals['ELBO'] += loss.item() * bsize
totals['NLL'] += NLL_loss.item()