def train_eval(model, train_dataloader, val_dataloader, data_types, optimizer, loss_fn,
               metrics, params, model_dir, restore_file=None):
    """Train over all epochs, evaluating on the validation set after each one.

    Args:
        model: network to train (state_dict is checkpointed each epoch).
        train_dataloader: iterable of training batches, handed to train().
        val_dataloader: iterable of validation batches, handed to evaluate().
        data_types: passed through to train()/evaluate() unchanged.
        optimizer: optimizer; updated in place and replaced by lr_decay() each epoch.
        loss_fn: loss function handed to train()/evaluate().
        metrics: metric functions; evaluate() must yield a dict with an 'RMSE' key.
        params: hyperparameter object; params.num_epochs is read here.
        model_dir: directory where checkpoints and metric JSON files are written.
        restore_file: optional checkpoint basename (without '.pth.tar') to resume from.
    """
    if restore_file is not None:
        # BUG FIX: the original built this path from the module-level `args`
        # (args.model_dir / args.restore_file), silently ignoring the
        # `model_dir` and `restore_file` parameters. Use the parameters.
        restore_path = os.path.join(model_dir, restore_file + ".pth.tar")
        logging.info("Restoring from {}".format(restore_path))
        train_util.load_checkpoint(restore_path, model, optimizer)

    best_val_RMSE = float("inf")

    for epoch in range(params.num_epochs):
        logging.info("Epoch {} / {}".format(epoch + 1, params.num_epochs))
        pause = False  # debugging hook (was toggled via `epoch == 1`)

        # Train for one epoch
        train(model, optimizer, loss_fn, train_dataloader, data_types, metrics, params, pause)

        # Evaluate for one epoch; lower RMSE is better
        val_metrics = evaluate(model, loss_fn, val_dataloader, data_types, metrics, params)
        is_best = val_metrics['RMSE'] <= best_val_RMSE

        # Save weights (train_util.save_checkpoint also marks the best copy)
        train_util.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            }, is_best, model_dir)

        if is_best:
            logging.info("- New best RMSE")
            best_val_RMSE = val_metrics["RMSE"]
            best_metrics_path = os.path.join(model_dir, "best_val_metrics.json")
            train_util.save_dict_to_json(val_metrics, best_metrics_path)

        # Always record the latest epoch's validation metrics
        last_metrics_path = os.path.join(model_dir, "last_val_metrics.json")
        train_util.save_dict_to_json(val_metrics, last_metrics_path)

        # Decay the learning rate after every epoch
        optimizer = lr_decay(optimizer, params)
criterion, # 사용할 Loss 함수 optimizer, # 사용할 Optimizer함수 dataloaders['train'], # train 데이터셋 dataloaders['val'], # validation 데이터셋 save_file_name=save_file_name, # 저장할 이름 max_epochs_stop=10, # 몇 epoch 동안 vaild loss의 감소가 없으면 학습을 중단할 것인지 n_epochs=training_epoch, # 최대 몇 epochs 학습할것인지 print_every=1, # 몇 epoch마다 출력할 것인지 early_stop=train_util.Early_stop) # Early_stop을 할것인지 # Loss, Acc 그래프 저장 함수 train_util.save_train_valid_loss(history, model_choice) # 모델 저장 함수 train_util.save_checkpoint(model, path=checkpoint_path, model_name=model_choice) # 랜덤하게 이미지를 한장 뽑아내는 함수 np.random.seed = 100 def random_test_image(): """Pick a random test image from the test directory""" c = np.random.choice(cat_df['category']) root = testdir + c + '/' img_path = root + np.random.choice(os.listdir(root)) return img_path avg_inference_time = 0
def run_models(train_iter, valid_iter, test_iter, num_epochs, device, save_dir, load_checkpoint,
               fine_tune, dataset, do_condition, use_transformer, early_stopping=True,
               print_every_x_epoch=1, validate_every_x_epoch=1):
    """Train the placement CLSTM and the step-selection model jointly, then evaluate on the test set.

    Trains both models batch-by-batch (the CLSTM's hidden states feed the selection model),
    logs to TensorBoard, checkpoints every batch, optionally resumes from a saved checkpoint,
    applies independent early stopping to each model, and finally writes test metrics,
    a summary JSON, and optimized placement thresholds under save_dir.

    Args:
        train_iter / valid_iter / test_iter: batch iterators for each split.
        num_epochs: maximum number of epochs to run.
        device: torch device the models are placed on.
        save_dir: output directory for models, checkpoints, runs/, summary and thresholds JSON.
        load_checkpoint: checkpoint path/flag; when truthy, resume state via load_save().
        fine_tune: forwarded to load_save() (semantics defined there).
        dataset: provides vocab_size and chart_type for model construction.
        do_condition: conditioning flag forwarded to models and batch runners.
        use_transformer: choose ArrowTransformer vs SelectionRNN for selection.
        early_stopping: stop each model when its validation loss stops improving.
        print_every_x_epoch / validate_every_x_epoch: reporting/validation cadence.
    """
    # setup or load models, optimizers
    placement_clstm = PlacementCLSTM(PLACEMENT_CHANNELS, PLACEMENT_FILTERS, PLACEMENT_KERNEL_SIZES,
                                     PLACEMENT_POOL_KERNEL, PLACEMENT_POOL_STRIDE,
                                     NUM_PLACEMENT_LSTM_LAYERS, PLACEMENT_INPUT_SIZE,
                                     HIDDEN_SIZE).to(device)
    placement_optim = optim.Adam(placement_clstm.parameters(), lr=PLACEMENT_LR)

    if use_transformer:
        # NOTE(review): unlike the SelectionRNN branch below, this branch does NOT call
        # .to(device) on the model — confirm the transformer is moved to `device` elsewhere.
        selection_model = ArrowTransformer(EMBED_DIM, dataset.vocab_size, NUM_TRANSFORMER_LAYERS,
                                           MAX_SEQ_LEN, PAD_IDX, TRANSFORMER_DROPOUT, do_condition)
    else:
        selection_model = SelectionRNN(NUM_SELECTION_LSTM_LAYERS,
                                       SELECTION_INPUT_SIZES[dataset.chart_type],
                                       dataset.vocab_size, HIDDEN_SIZE, do_condition).to(device)
    selection_optim = optim.Adam(selection_model.parameters(), lr=SELECTION_LR)

    # load model, optimizer states if resuming training
    best_placement_valid_loss = float('inf')
    best_placement_precision = 0
    best_selection_valid_loss = float('inf')
    start_epoch = 0        # first epoch to actually run (earlier epochs are skipped below)
    start_epoch_batch = 0  # within start_epoch, first batch to run (earlier batches skipped)
    train_clstm = True     # flipped to False by early stopping; gradients then disabled
    train_selection = True
    # pick save name / batch runner / criterion per selection-model flavor
    selection_save = TRANSFORMER_SAVE if use_transformer else SRNN_SAVE
    run_selection_batch = run_transformer_batch if use_transformer else run_srnn_batch
    selection_criterion = TransformerLoss(ignore_index=PAD_IDX) if use_transformer else SRNN_CRITERION
    sub_logdir = datetime.datetime.now().strftime('%m_%d_%y_%H_%M')

    if load_checkpoint:
        checkpoint = load_save(load_checkpoint, fine_tune, placement_clstm, selection_model,
                               use_transformer, device)
        if checkpoint:
            # resume all training-progress state, including the original run's log dir
            (start_epoch, start_epoch_batch, best_placement_valid_loss, best_placement_precision,
             best_selection_valid_loss, train_clstm, train_selection, sub_logdir) = checkpoint

    writer = SummaryWriter(log_dir=os.path.join(save_dir, 'runs', sub_logdir))

    print('Starting training..')
    for epoch in trange(num_epochs):
        # fast-forward past epochs already completed before the checkpoint
        if epoch < start_epoch:
            continue
        print('Epoch: {}'.format(epoch))
        epoch_p_loss = 0
        epoch_p_precision = 0
        epoch_s_loss = 0

        # report_memory(device=device, show_tensors=True)

        for i, batch in enumerate(tqdm(train_iter)):
            # if resuming from checkpoint, skip batches until starting batch for the epoch
            if start_epoch_batch > 0:
                if i + 1 == start_epoch_batch:
                    start_epoch_batch = 0  # caught up; resume normal processing next batch
                continue

            step = epoch * len(train_iter) + i

            # placement model: gradients only while its early stopping hasn't triggered
            with torch.set_grad_enabled(train_clstm):
                (placement_loss, placement_acc, placement_precision,
                 clstm_hiddens) = run_placement_batch(placement_clstm, placement_optim,
                                                      PLACEMENT_CRITERION, batch, device, writer,
                                                      do_condition, do_train=train_clstm,
                                                      curr_step=step)

            # selection model consumes the CLSTM hidden states from this same batch
            with torch.set_grad_enabled(train_selection):
                (selection_loss,
                 selection_acc) = run_selection_batch(selection_model, selection_optim,
                                                      selection_criterion, batch, device,
                                                      clstm_hiddens, do_train=train_selection)

            epoch_p_loss += placement_loss
            epoch_p_precision += placement_precision
            epoch_s_loss += selection_loss

            writer.add_scalar('loss/train_placement', placement_loss, step)
            writer.add_scalar('accuracy/train_placement', placement_acc, step)
            writer.add_scalar('loss/train_selection', selection_loss, step)
            writer.add_scalar('accuracy/train_selection', selection_acc, step)
            writer.add_scalar('precision/placement', placement_precision, step)

            # persist models + resume state every batch so training can restart mid-epoch
            if train_clstm:
                save_model(placement_clstm, save_dir, CLSTM_SAVE)
            if train_selection:
                save_model(selection_model, save_dir, selection_save)
            save_checkpoint(epoch, i, best_placement_valid_loss, best_placement_precision,
                            best_selection_valid_loss, train_clstm, train_selection, save_dir)

        epoch_p_loss = epoch_p_loss / len(train_iter)
        epoch_p_precision = epoch_p_precision / len(train_iter)
        epoch_s_loss = epoch_s_loss / len(train_iter)

        if epoch % print_every_x_epoch == 0:
            print(f'\tAvg. training placement loss per unrolling: {epoch_p_loss:.5f}')
            print(f'\tAvg. training placement precision: {epoch_p_precision:.5f}')
            print(f'\tAvg. training selection loss per frame: {epoch_s_loss:.5f}')

        if epoch % validate_every_x_epoch == 0:
            (placement_valid_loss, placement_valid_acc, selection_valid_loss, selection_valid_acc,
             placement_precision) = evaluate(placement_clstm, selection_model, valid_iter,
                                             PLACEMENT_CRITERION, selection_criterion, device,
                                             writer, epoch / validate_every_x_epoch, do_condition,
                                             use_transformer)
            print(f'\tAvg. validation placement loss per frame: {placement_valid_loss:.5f}')
            print(f'\tAvg. validation placement precision: {placement_precision:.5f}')
            print(f'\tAvg. validation selection loss per frame: {selection_valid_loss:.5f}')

            # track best performing model(s) or save every epoch
            if early_stopping:
                better_placement = placement_valid_loss < best_placement_valid_loss
                #better_placement = placement_precision > best_placement_precision
                better_selection = selection_valid_loss < best_selection_valid_loss

                # NOTE: stopping is immediate — a single epoch of non-improvement
                # permanently freezes that model (no patience window).
                if train_clstm:
                    if better_placement:
                        best_placement_precision = placement_precision
                        best_placement_valid_loss = placement_valid_loss
                        save_model(placement_clstm, save_dir, CLSTM_SAVE)
                    else:
                        print("Placement validation loss increased, stopping CLSTM training")
                        train_clstm = False

                if train_selection:
                    if better_selection:
                        best_selection_valid_loss = selection_valid_loss
                        save_model(selection_model, save_dir, selection_save)
                    else:
                        print("Selection validation loss increased, stopping selection model training")
                        train_selection = False

                if not train_clstm and not train_selection:
                    print("Both early stopping criterion met. Stopping early..")
                    break

            # record that this epoch fully completed (batch index reset to 0)
            save_checkpoint(epoch + 1, 0, best_placement_valid_loss, best_placement_precision,
                            best_selection_valid_loss, train_clstm, train_selection, save_dir)

    # evaluate on test set
    (placement_test_loss, placement_test_acc, selection_test_loss, selection_test_acc,
     placement_precision) = evaluate(placement_clstm, selection_model, test_iter,
                                     PLACEMENT_CRITERION, selection_criterion, device, writer, -1,
                                     do_condition, use_transformer)

    # save training summary stats to json file
    # load initial summary
    with open(os.path.join(save_dir, SUMMARY_SAVE), 'r') as f:
        summary_json = json.loads(f.read())

    summary_json = {
        **summary_json,
        'epochs_trained': num_epochs,
        'placement_test_loss': placement_test_loss,
        'placement_test_accuracy': placement_test_acc,
        'placement_test_precision': placement_precision,
        'selection_test_loss': selection_test_loss,
        'selection_test_accuracy': selection_test_acc,
    }
    summary_json = log_training_stats(writer, dataset, summary_json, do_condition, use_transformer)

    with open(os.path.join(save_dir, SUMMARY_SAVE), 'w') as f:
        f.write(json.dumps(summary_json, indent=2))

    # optimize placement thresholds per level (range) which give highest F2 scores on the valid. set
    thresholds = optimize_placement_thresholds(placement_clstm, valid_iter, device)
    with open(os.path.join(save_dir, THRESHOLDS_SAVE), 'w') as f:
        f.write(json.dumps(thresholds, indent=2))
action='store', default=150, type=int)  # continuation of an ap.add_argument(...) call opened above this chunk
ap.add_argument('--learning_rate', dest='learning_rate', action='store', default=0.001, type=float)
ap.add_argument('--epochs', dest='epochs', action='store', default=5, type=int)
# NOTE(review): '--gpu' stores a *string* ("gpu" by default) into args.mode —
# confirm train_network expects a mode string here rather than a boolean flag.
ap.add_argument('--gpu', dest='mode', action='store', default="gpu")
ap.add_argument('--save_path', dest='save_path', action='store', default='checkpoint.pth')
args = ap.parse_args()

# Load Data: datasets for each split plus train/val loaders
train_data, val_data, test_data, train_loader, val_loader = train_util.load_data(
    args.data_dir)

# Set the network: build the model from the chosen architecture and hidden size
model = train_util.set_network(args.model_name, args.hidden_input)

# Train the model
train_util.train_network(model, train_loader, val_loader, args.learning_rate, args.epochs,
                         args.mode)

# Save the model checkpoint (train_data is passed so class-to-index mapping can be stored)
# NOTE(review): the mapping claim above is an assumption — verify in train_util.save_checkpoint.
train_util.save_checkpoint(train_data, model, args)