# Gradient accumulation: update weights only every `optim_every` batches
if ib % args.optim_every == 0:
    optimizer.step()
    optimizer.zero_grad()

T7 = time.time()
Timer['optim'] = T7 - T6

# log_obj['summary_nwords'] = int(np.mean([summ.count(" ")+1 for summ in sampled_summaries]))
avg_total = total_sampled_scores.mean().item()
total_score_history.append(avg_total)

log_obj['summary_nwords'] = int(np.mean(sampled_end_idxs))
log_obj['loss'] = Loss.item()
log_obj['total_score'] = avg_total
log_obj['count'] = batch_size
logplot.cache(log_obj, prefix="T_")

Tfinal = time.time()
Timer['total'] = Tfinal - T1
# print(Timer)

# Periodically print one example document, its sampled summary, and the per-scorer breakdown
if time.time() - time_save > args.save_every:
    print("==========================================")
    print(bodies[0])
    print("-----------")
    print(sampled_summaries[0])
    print("-----------")
    print("Total score:", total_sampled_scores[0].item())
    for scorer in scorers:
        print(scorer['name'] + " score:", scores_track[scorer['name'] + "_scores"][0].item())
                              is_next)
loss.backward()

# Track next-sentence and masked-LM accuracy for logging
is_next_acc = is_next.eq(torch.argmax(is_next_logits, dim=1)).float().mean().item()
num_predicts = lm_label_ids.ne(-1).sum().item()  # number of masked positions
mlm_acc = (lm_label_ids.view(-1).eq(torch.argmax(mlm_logits, dim=2).view(-1)).float().sum() / num_predicts).item()

if ib % args.optim_every == 0:
    scheduler.step()  # Update learning rate schedule
    optimizer.step()
    optimizer.zero_grad()
torch.cuda.empty_cache()

summ.cache({"loss": loss.item(), "mlm_acc": mlm_acc, "is_next_acc": is_next_acc}, prefix="T_")
if time.time() - time_save > 60.0:
    summ.save(printing=True)
    time_save = time.time()
    torch.save(model.state_dict(), "/home/phillab/models/news_bert_bs" + str(args.optim_every * args.train_batch_size) + ".bin")
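# The logging helpers used above (`summ`, `logplot`) are not shown in this excerpt.
# Below is a minimal stand-in (an assumption, not the repo's actual class) that
# illustrates the cache-then-save-periodically pattern these loops rely on.
import json
from collections import defaultdict

class RunningLog:
    def __init__(self, path):
        self.path = path
        self.sums = defaultdict(float)
        self.counts = defaultdict(int)

    def cache(self, values, prefix=""):
        # Accumulate metrics; they are averaged and flushed on save()
        for key, val in values.items():
            self.sums[prefix + key] += float(val)
            self.counts[prefix + key] += 1

    def save(self, printing=False):
        averages = {k: self.sums[k] / max(self.counts[k], 1) for k in self.sums}
        with open(self.path, "a") as f:
            f.write(json.dumps(averages) + "\n")
        if printing:
            print(averages)
        self.sums.clear()
        self.counts.clear()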
if args.fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    kw_cov.model, optimizer = amp.initialize(kw_cov.model, optimizer, opt_level="O1")  # For now O1. See details at https://nvidia.github.io/apex/amp.html

time_save = time.time()
optim_every = 4

for ib, batch in enumerate(dataloader):
    contents, summaries = batch
    loss, acc = kw_cov.train_batch(contents, summaries)

    if args.fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()

    if ib % optim_every == 0:
        scheduler.step()  # Update learning rate schedule
        optimizer.step()
        optimizer.zero_grad()

    logplot.cache({"loss": loss.item(), "accuracy": acc, "count": len(batch)}, prefix="T_")
    if time.time() - time_save > 60.0:
        logplot.save(printing=True)
        time_save = time.time()
        kw_cov.save_model("/home/phillab/models/bert_coverage_" + args.experiment + ".bin")
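# For reference, a minimal sketch (not from this repo) of the same fp16 +
# gradient-accumulation loop using PyTorch's built-in torch.cuda.amp instead of
# apex. The model, optimizer, and dataloader are placeholders, the model is
# assumed to return a scalar loss, and `optim_every` mirrors the code above.
import torch

def train_fp16_sketch(model, optimizer, dataloader, optim_every=4, device="cuda"):
    scaler = torch.cuda.amp.GradScaler()
    optimizer.zero_grad()
    for ib, (inputs, labels) in enumerate(dataloader):
        inputs, labels = inputs.to(device), labels.to(device)
        with torch.cuda.amp.autocast():
            loss = model(inputs, labels)              # assumed to return a scalar loss
        scaler.scale(loss / optim_every).backward()   # scale for accumulation, then backprop
        if (ib + 1) % optim_every == 0:
            scaler.step(optimizer)                    # unscales gradients, then optimizer.step()
            scaler.update()
            optimizer.zero_grad()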
model.train()
sources, targets = map_batch(batch, args.task)
loss = model.train_batch(sources, targets, no_preinput=no_preinput)

if args.fp16:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
else:
    loss.backward()

if ib % args.optim_every == 0:
    optimizer.step()
    scheduler.step()  # Update learning rate schedule
    optimizer.zero_grad()

summ.cache({"loss": loss.item(), "count": len(batch)}, prefix="T_")

# Periodic evaluation pass on the dev set
if time.time() - time_save > 60.0:
    print("Starting the eval")
    model.eval()
    with torch.no_grad():
        for batch in tqdm.tqdm(dl_dev):
            sources, targets = map_batch(batch, args.task)
            loss = model.train_batch(sources, targets, no_preinput=no_preinput)
            summ.cache({
                "loss": loss.item(),
                "count": len(batch)
            },