# Record the mean per-sample training loss for this iteration.
train_loss_seq.append(loss.item()/batch_size)

# --- Evaluation pass over the held-out countries ---
# NOTE(review): runs without torch.no_grad(), so the summed criterion keeps
# autograd history until .item() is taken — confirm this is intended.
model.eval()
test_loss = 0.0
for country in test_countries:
    inp, target = get_data_tensor(data, country, measure_mode,
                                  output_mode=output_mode, cuda=cuda)
    out_nn, _ = get_net_output(inp, model_type, model, cuda)
    test_loss += criterion(out_nn, target)
# Average test loss across countries becomes this iteration's entry.
test_loss_seq.append(test_loss.item()/len(test_countries))

# Keep a deep copy of the weights whenever the test loss improves.
if test_loss_seq[-1] < min_test_loss:
    min_test_loss = test_loss_seq[-1]
    best_state_dict = copy.deepcopy(model.state_dict())
    # Best loss among already-archived nets (-1. when none archived yet),
    # shown purely for the progress report below.
    if nnet == 0:
        nets_min_test_loss = -1.
    else:
        nets_min_test_loss = min(hn.nets_min_test_loss)
    print('Best Model - best test loss:{:.4f} (present loss:{:.4f} - it:{}) - Nets found:{} ({:.4f})'.
          format(min_test_loss, test_loss_seq[-1], it, nnet, nets_min_test_loss))

# Reset the accumulated loss and pending gradients for the next iteration.
loss = 0.0
optimizer.zero_grad()

# Offer the current best model to the net archive; persist when accepted.
elig = hn.set_net(min_test_loss, best_state_dict, train_loss_seq, test_loss_seq)
if elig:
    hn.save_nets(save_file=file_net)
    nnet = len(hn.nets_min_test_loss)
# Spread the model across all visible GPUs when more than one is available.
if torch.cuda.device_count() > 1:
    logger.info("Using {} GPUs".format(torch.cuda.device_count()))
    model = nn.DataParallel(model)

# Loss criterion & optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

# --- main epoch loop ---
logger.info("Training Started".center(60, '#'))
for epoch in range(start_epoch, args.epochs):
    # One training pass over the training iterator.
    train(model, criterion, optimizer, train_iter, device, epoch, logger,
          args.log_interval, writer, TRG)
    # Validation pass; returns the epoch's BLEU score.
    bleu = test(model, criterion, val_iter, device, epoch, logger,
                args.log_interval, writer, TRG)

    # NOTE(review): the comment below says "wer" and the tracking uses '<'
    # and min(), i.e. lower-is-better — but BLEU is higher-is-better.
    # Looks like a copy-paste from a WER script; whether '>'/max() is the
    # correct fix depends on how best_bleu is initialised — confirm.
    # remember best wer and save checkpoint
    is_best = bleu < best_bleu
    best_bleu = min(bleu, best_bleu)
    save_checkpoint(
        {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_bleu': best_bleu,
        },
        is_best, args.model_path, args.store_name)
    logger.info("Epoch {} Model Saved".format(epoch + 1).center(60, '#'))

logger.info("Training Finished".center(60, '#'))
# NOTE(review): this fragment is the tail of an (unseen) epoch/validation
# loop; the original nesting was lost in the paste, so statements are kept
# in source order at one level — restore the loop indentation when merging.

# Show the first example of the batch: reference, model prediction, and
# the generated sequence, all mapped back from token ids to text.
print("True: {}".format(denumericalize(batch.out_text, OUT_TEXT.vocab)[0]))
print("Pred: {}".format(denumericalize(pred.argmax(dim=2), OUT_TEXT.vocab)[0]))
print(" Gen: {}".format(denumericalize(tgt, OUT_TEXT.vocab)[0]))

# Mean validation loss over the whole validation iterator.
valid_loss /= len(valid_iterator)
print('Epoch: {}, Training loss: {:.2f}, Valid loss: {:.2f}'.format(
    epoch, training_loss, valid_loss))

# log
log_data_list.append([epoch, training_loss, valid_loss])

# Per-epoch snapshot plus a "latest" copy under the plain model path.
# TODO(Yoshi): The last model is redundant
torch.save(model.state_dict(),
           model_filepath.replace(".pt", "_epoch-{}.pt".format(epoch)))
torch.save(model.state_dict(), model_filepath)

# Persist the torchtext fields needed to reload/decode the model later.
with open(model_filepath.replace(".pt", "_IN_TEXT.field"), "wb") as fout:
    dill.dump(IN_TEXT, fout)
with open(model_filepath.replace(".pt", "_OUT_TEXT.field"), "wb") as fout:
    dill.dump(OUT_TEXT, fout)
with open(model_filepath.replace(".pt", "_ID.field"), "wb") as fout:
    dill.dump(ID, fout)

# Write out the loss history next to the model file.
df = pd.DataFrame(log_data_list,
                  columns=["epoch", "training_loss", "valid_loss"])
df.to_csv(model_filepath.replace(".pt", "_loss.csv"))
# Validation forward pass and loss, with gradients disabled.
with torch.no_grad():
    val_out = model(x_val.cuda(), y_val.cuda(), src_mask=None, tgt_mask=None)
    val_loss = criterion(val_out.view(-1, ntoken), z_val.view(-1).cuda())

# val_out_top = torch.argmax(val_out, dim=-1)
# Debug dump: raw targets plus the first 100 predicted vs. true token ids.
print('z_val:', z_val)
print('Pred:', torch.argmax(val_out.view(-1, ntoken), dim=-1)[:100])
print('True:', z_val.view(-1)[:100])

# Progress banner: average seconds per iteration since the last report.
print('-' * 89)
print('| iteration {:6d} | time/iter: {:5.2f}s | validation loss {:5.4f} | '
      .format(episode,
              (time.time() - episode_start_time) / args.eval_interval,
              val_loss.item()))
print('-' * 89)

# Checkpoint model + optimizer state per episode.
# NOTE(review): 'model_sate_dict' is a typo ('state'), but it is a persisted
# dict key — any loader presumably expects the same typo, so it is kept as-is;
# fix it together with the loading code.
torch.save(
    {
        'model_sate_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'val_loss': val_loss.item(),
    },
    'model_' + str(episode) + '.tar')

# Restart the per-report timer.
episode_start_time = time.time()