Example #1
def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data' and 'labels'
        val_data: (dict) validation data with keys 'data' and 'labels'
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional; name of file to restore from (without its .pth.tar extension)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        
    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        num_steps = (params.train_size + 1) // params.batch_size
        train_data_iterator = data_loader.data_iterator(train_data, params, shuffle=True)
        train(model, optimizer, loss_fn, train_data_iterator, metrics, params, num_steps)
            
        # Evaluate for one epoch on validation set
        num_steps = (params.val_size + 1) // params.batch_size
        val_data_iterator = data_loader.data_iterator(val_data, params, shuffle=False)
        val_metrics = evaluate(model, loss_fn, val_data_iterator, metrics, params, num_steps)
        
        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()}, 
                               is_best=is_best,
                               checkpoint=model_dir)
            
        # If this is the best accuracy so far, record it and save the best-weights metrics
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc
            
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
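
The utils.save_checkpoint and utils.load_checkpoint helpers are not shown in this excerpt. A minimal sketch consistent with the call sites above (the 'state_dict'/'optim_dict' keys come from the snippet; the 'last.pth.tar'/'best.pth.tar' file names are assumptions):

import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint):
    """Serialize `state` into the `checkpoint` directory and, when this
    epoch is the best so far, mirror it to a separate best file."""
    os.makedirs(checkpoint, exist_ok=True)
    filepath = os.path.join(checkpoint, 'last.pth.tar')  # file name is an assumption
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))

def load_checkpoint(restore_path, model, optimizer=None):
    """Load model (and optionally optimizer) state saved by save_checkpoint."""
    checkpoint = torch.load(restore_path)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    return checkpoint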
Example #2
    def __call__(self, val_acc, epoch_idx, model, manager=None):
        """Save a checkpoint when validation accuracy improves ('best')
        or when the final epoch is reached ('last')."""

        time_to_save_best = (self.save_opt == 'best'
                             and val_acc > self.best_val_acc)
        time_to_save_last = (self.save_opt == 'last'
                             and epoch_idx == self.max_epoch)

        if time_to_save_best or time_to_save_last:

            print('Save the checkpoint!')
            self.best_val_acc = val_acc
            self.best_epoch = epoch_idx

            if manager is not None:
                manager.save_task_exclusive_params(model.module, self.task_idx)

            save_checkpoint(model=model.module,
                            manager=manager,
                            chkpt_dir=self.chkpt_dir)
        return
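
The surrounding class is not included in the excerpt. A minimal sketch of how such a callable saver could be constructed and driven from a training loop (the CheckpointSaver name and constructor, and the train_one_epoch/validate helpers, are hypothetical; the attribute names mirror the snippet):

class CheckpointSaver:
    """Hypothetical wrapper class; __call__ is the method shown above."""
    def __init__(self, save_opt, max_epoch, chkpt_dir, task_idx=0):
        self.save_opt = save_opt            # 'best' or 'last'
        self.max_epoch = max_epoch
        self.chkpt_dir = chkpt_dir
        self.task_idx = task_idx
        self.best_val_acc = float('-inf')
        self.best_epoch = -1

saver = CheckpointSaver(save_opt='best', max_epoch=100, chkpt_dir='checkpoints/')
for epoch_idx in range(1, saver.max_epoch + 1):
    train_one_epoch(model)                  # hypothetical training step
    val_acc = validate(model)               # hypothetical validation step
    saver(val_acc, epoch_idx, model)        # saves whenever val_acc improves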
Example #3
            # (inside the new-best branch of this excerpt) record the best metrics so far
            best_loss = valid_loss
            best_psnr = valid_psnr

        print('* learning rate: {}'.format(lr))
        print('* PSNR: {:.4f}'.format(valid_psnr))
        print('* best PSNR: {:.4f} @ epoch: {}\n'.format(
            best_psnr, best_epoch + 1))

        ######################
        # Save checkpoint
        ######################
        save_checkpoint(
            {
                'epoch': epoch,
                'train_loss': train_loss,
                'valid_loss': valid_loss,
                'valid_psnr': valid_psnr,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, os.path.join(args.output_dir, checkpoint_name), is_best)

        ######################
        # TensorBoard
        ######################

        summary_writer.add_scalar('learning_rate', lr, epoch + 1)
        summary_writer.add_scalars('loss', {
            'train': train_loss,
            'valid': valid_loss
        }, epoch + 1)
        summary_writer.add_scalar('psnr', valid_psnr, epoch + 1)
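
Here save_checkpoint receives the state dict, a target path, and the is_best flag. A minimal sketch matching that call signature (the naming of the best-model copy is an assumption):

import shutil
import torch

def save_checkpoint(state, path, is_best):
    """Write the checkpoint dict to `path`; keep a separate copy of the
    best-PSNR checkpoint so later epochs don't overwrite it."""
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, path + '.best')  # best-file naming is an assumption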
Example #4
    # restore the IA model's weights from the checkpoint path in args.ia_resume
    checkpoint = torch.load(args.ia_resume)
    ia_model.load_state_dict(checkpoint['state_dict'])

columns = [
    'ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'te_top5_acc',
    'time', 'mem_usage'
]
if args.ia:
    columns = columns[:-2] + [
        'IA_tr_loss', 'IA_tr_acc', 'IA_te_loss', 'IA_te_acc', 'IA_te_top5_acc'
    ] + columns[-2:]
    ia_res = {'loss': None, 'accuracy': None, 'top5_accuracy': None}

utils.save_checkpoint(args.dir,
                      start_epoch,
                      epoch=start_epoch,
                      state_dict=model.state_dict(),
                      optimizer=optimizer.state_dict())

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()

    if not args.no_schedule:
        e = epoch - start_epoch if args.reset_resume else epoch
        total_e = args.epochs - start_epoch if args.reset_resume else args.epochs
        if args.step_schedule:
            lr = schedule_piecewise_const(e)
        elif args.linear_annealing:
            lr = schedule_variant(e)
        else:
            lr = schedule(e, total_e)
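
The schedule helpers referenced above are not part of the excerpt. A plausible sketch of the default schedule(e, total_e) in the SWA-style linearly-annealed form (lr_init and the 50%/90% breakpoints are illustrative assumptions, not the repository's actual values):

def schedule(epoch, total_epochs, lr_init=0.1):
    # Hold lr_init for the first half of training, decay linearly to 1% of it
    # by the 90% mark, then stay constant; breakpoints are illustrative.
    t = epoch / total_epochs
    if t <= 0.5:
        factor = 1.0
    elif t <= 0.9:
        factor = 1.0 - 0.99 * (t - 0.5) / 0.4
    else:
        factor = 0.01
    return lr_init * factor

Whichever branch produces lr, the value would then typically be applied to the optimizer:

for param_group in optimizer.param_groups:
    param_group['lr'] = lr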