#dep_loss_masked = torch.where(deps>0, dep_loss, zero)  # This zeros out all positions where deps == 0
#dep_loss_tot = dep_loss_masked.sum() / batch_size
dep_loss_tot = dep_loss.masked_fill_(deps == 0, 0.).sum()

factor_hints = "Factor hints (class_loss=%8.4f, deps_loss=%10.4f, fac=%.8f)" % (
    class_loss_tot.item() / batch_size * 100.,
    dep_loss_tot.item() / batch_size * 100.,
    class_loss_tot.item() / dep_loss_tot.item(),
)
#factor hints : (231.14927673339844, 225.23297119140625, 1.0262674932124587)

batch_loss = class_loss_tot + args.dep_fac * dep_loss_tot

batch_loss.backward()
model_opt.step()

loss_this = batch_loss.item()
loss_recent_tot += loss_this

if idx % 10 == 0:
    print('%.1f%% of epoch %d' % (idx / float(len(train_loader)) * 100, epoch,), end='\r')  # Python 3 FTW!

if idx % 100 == 0:
    print(epoch, idx, factor_hints)

sentences_since_last_check = (idx - idx_loss_check) * batch_size

#if sentences_since_last_check > 50000:    # Potentially save every 50000 sentences (~30mins on TitanX)
if sentences_since_last_check > 200000:    # Potentially save every 200000 sentences (~2hrs on TitanX)
    loss_recent = loss_recent_tot / float(sentences_since_last_check)  # loss per sentence
batch = [x_ids, x_perm_ids, s_ids, s_perm_ids, y_a, y_b]

if args.feature_mix:
    ce_loss_x, ce_loss_s, scl_loss = model.forward_feature_mix(batch)
else:
    ce_loss_x, ce_loss_s, scl_loss = model(batch)

# Average the two cross-entropy losses; fall back to ce_loss_x alone when summaries are disabled
ce_loss = (ce_loss_x + ce_loss_s) / 2
if not args.with_summary:
    ce_loss = ce_loss_x

# Interpolate between the cross-entropy and contrastive objectives with args.lambd
loss = args.lambd * ce_loss + (1 - args.lambd) * scl_loss
# print(ce_loss_x, ce_loss_s, scl_loss)

loss.backward()
count += 1

# Gradient accumulation: only step the optimizer every args.num_accum micro-batches
if count % args.num_accum == 0:
    optimizer.step()
    recoder.log_train(ce_loss_x, ce_loss_s, scl_loss, loss)
    step += 1
    optimizer.zero_grad()
    if step >= args.steps:
        break
    if step % args.log_step == 0:
        begin_eval = True
    if step % 10 == 0:
        bar.update(10)
# step += 1