def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir,
                       restore_file=None):
    """Train the model and evaluate it on the validation set after every epoch.

    After each epoch a checkpoint is written to `model_dir`, and the validation
    metrics are dumped to `metrics_val_last_weights.json`. Whenever the
    validation accuracy is at least as good as the best seen so far, the
    metrics are also written to `metrics_val_best_weights.json`.

    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data' and 'labels'
        val_data: (dict) validation data with keys 'data' and 'labels'
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for
            the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and
            labels of each batch
        params: (Params) hyperparameters; this function reads `num_epochs`, `train_size`,
            `val_size` and `batch_size`
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of the file to restore from (without its
            extension .pth.tar); it is looked up inside `model_dir`

    Returns:
        None
    """
    # Reload weights from restore_file if specified. The path is built from
    # this function's own arguments so that the function does not depend on a
    # module-level `args` object being defined by the calling script.
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # Compute number of batches in one epoch (one full pass over the training set).
        # NOTE(review): this is floor((size + 1) / batch_size), not a ceiling division;
        # presumably it mirrors the batch count produced by data_iterator - confirm.
        num_steps = (params.train_size + 1) // params.batch_size
        train_data_iterator = data_loader.data_iterator(train_data, params, shuffle=True)
        train(model, optimizer, loss_fn, train_data_iterator, metrics, params, num_steps)

        # Evaluate for one epoch on validation set
        num_steps = (params.val_size + 1) // params.batch_size
        val_data_iterator = data_loader.data_iterator(val_data, params, shuffle=False)
        val_metrics = evaluate(model, loss_fn, val_data_iterator, metrics, params, num_steps)

        val_acc = val_metrics['accuracy']
        # `>=` means a tie with the previous best also counts as a new best.
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=model_dir)

        # If this is the best validation accuracy so far, record it
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
def __call__(self, val_acc, epoch_idx, model, manager=None):
    """Save a checkpoint if the configured save policy fires for this epoch.

    Two policies are recognised through `self.save_opt`:
      * 'best' - save when `val_acc` is strictly greater than `self.best_val_acc`
      * 'last' - save when `epoch_idx` equals `self.max_epoch`
    Any other value of `self.save_opt` never saves.

    When a save happens, `self.best_val_acc` and `self.best_epoch` are
    overwritten with `val_acc` and `epoch_idx` (under either policy).

    Args:
        val_acc: validation accuracy for the current epoch
        epoch_idx: index of the current epoch
        model: wrapper object exposing the underlying network as `.module`
            (presumably a DataParallel-style wrapper - confirm against caller)
        manager: optional object; when given, its
            `save_task_exclusive_params` is called before the checkpoint is written

    Returns:
        None
    """
    policy = self.save_opt
    if policy == 'best':
        should_save = val_acc > self.best_val_acc
    elif policy == 'last':
        should_save = epoch_idx == self.max_epoch
    else:
        should_save = False

    if not should_save:
        return

    print('Save the checkpoint!')
    self.best_val_acc = val_acc
    self.best_epoch = epoch_idx

    if manager is not None:
        manager.save_task_exclusive_params(model.module, self.task_idx)

    # NOTE(review): nesting was reconstructed from whitespace-collapsed source;
    # the checkpoint write is assumed to run whether or not a manager is given
    # (it receives `manager` as-is) - confirm against the original file.
    save_checkpoint(model=model.module, manager=manager, chkpt_dir=self.chkpt_dir)
    return
best_loss = valid_loss best_psnr = valid_psnr print('* learning rate: {}'.format(lr)) print('* PSNR: {:.4f}'.format(valid_psnr)) print('* best PSNR: {:.4f} @ epoch: {}\n'.format( best_psnr, best_epoch + 1)) ###################### # Save checkpoint ###################### save_checkpoint( { 'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss, 'valid_psnr': valid_psnr, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, os.path.join(args.output_dir, checkpoint_name), is_best) ###################### # TensorBoard ###################### summary_writer.add_scalar('learning_rate', lr, epoch + 1) summary_writer.add_scalars('loss', { 'train': train_loss, 'valid': valid_loss }, epoch + 1) summary_writer.add_scalar('psnr', valid_psnr, epoch + 1)
checkpoint = torch.load(args.ia_resume) ia_model.load_state_dict(checkpoint['state_dict']) columns = [ 'ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'te_top5_acc', 'time', 'mem_usage' ] if args.ia: columns = columns[:-2] + [ 'IA_tr_loss', 'IA_tr_acc', 'IA_te_loss', 'IA_te_acc', 'IA_te_top5_acc' ] + columns[-2:] ia_res = {'loss': None, 'accuracy': None, 'top5_accuracy': None} utils.save_checkpoint(args.dir, start_epoch, epoch=start_epoch, state_dict=model.state_dict(), optimizer=optimizer.state_dict()) for epoch in range(start_epoch, args.epochs): time_ep = time.time() if not args.no_schedule: e = epoch - start_epoch if args.reset_resume else epoch total_e = args.epochs - start_epoch if args.reset_resume else args.epochs if args.step_schedule: lr = schedule_piecewise_const(e) elif args.linear_annealing: lr = schedule_variant(e) else: lr = schedule(e, total_e)