def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       loss_fn, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) - name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    # learning rate schedulers for different models:
    if params.model_version == "resnet18":
        scheduler = StepLR(optimizer, step_size=150, gamma=0.1)
    elif params.model_version == "cnn":
        # for cnn models, num_epochs is always < 100, so step_size=100 means the
        # learning rate is effectively never decayed
        scheduler = StepLR(optimizer, step_size=100, gamma=0.2)

    for epoch in range(params.num_epochs):

        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params)

        # step the LR scheduler once per epoch, after the optimizer updates
        scheduler.step()

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                               is_best=is_best,
                               checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
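
Note: all of these examples lean on a project-local utils module whose implementation is not shown. The sketch below is a minimal, assumption-laden version of the two checkpoint helpers, written only to match how they are called above (a dict with 'state_dict'/'optim_dict', a checkpoint directory, and an is_best flag); the file names and copy-on-best behaviour are guesses, not the original code.

# Hedged sketch of the utils checkpoint helpers assumed by the examples here.
# The 'last.pth.tar' / 'best.pth.tar' layout and the copy-on-best behaviour
# are assumptions inferred from the call sites.
import os
import shutil
import torch


def save_checkpoint(state, is_best, checkpoint):
    """Save state to checkpoint/last.pth.tar and copy it to best.pth.tar if is_best."""
    if not os.path.exists(checkpoint):
        os.makedirs(checkpoint)
    last_path = os.path.join(checkpoint, 'last.pth.tar')
    torch.save(state, last_path)
    if is_best:
        shutil.copyfile(last_path, os.path.join(checkpoint, 'best.pth.tar'))


def load_checkpoint(checkpoint, model, optimizer=None):
    """Load model (and optionally optimizer) state from a saved .pth.tar file."""
    if not os.path.isfile(checkpoint):
        raise FileNotFoundError("Checkpoint file doesn't exist: {}".format(checkpoint))
    state = torch.load(checkpoint, map_location='cpu')
    model.load_state_dict(state['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(state['optim_dict'])
    return state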
Example #2
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       loss_fn,
                       metrics,
                       params,
                       model_dir,
                       restore_file=None,
                       scheduler=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir,
                                    restore_file + '.pth.tar')
        logging.info("Restoring parameters from %s", restore_path)
        utils.load_checkpoint(restore_path, model,
                              optimizer if params.optim_restore else None)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))
        if scheduler is not None:
            scheduler.step()
        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params,
              epoch)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        val_acc = val_metrics['macro_f1']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best macro_f1")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
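
The save_dict_to_json helper used by every example is likewise not shown; a minimal version consistent with how it is called (a flat dict of metric names to scalar values) could look like the sketch below. The float cast is an assumption so numpy scalars and 0-d tensors serialize cleanly.

# Hedged sketch of utils.save_dict_to_json as used throughout these examples.
import json


def save_dict_to_json(d, json_path):
    """Write a dict of metric name -> scalar value to json_path."""
    with open(json_path, 'w') as f:
        # metric values may be numpy floats or 0-d tensors; cast to plain float
        d = {k: float(v) for k, v in d.items()}
        json.dump(d, f, indent=4)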
Example #3
def train_and_evaluate(model, train_loader, test_loader, optimizer, criterion,
                       accuracy, model_dir, args):

    start_epoch = 0
    best_acc = 0.0
    # learning rate schedulers for different models:
    scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=0.1)

    # TensorboardX setup
    writer = SummaryWriter(log_dir=model_dir)
    # Save best accTop1
    choose_accTop1 = True

    # Pre-allocate one slot per epoch for the exported train/test metrics
    result_train_metrics = list(range(args.num_epochs))
    result_test_metrics = list(range(args.num_epochs))

    # If the training was interrupted, resume from checkpoint
    if args.resume:
        # Load checkpoint.
        logging.info('Resuming from checkpoint..')
        resumePath = os.path.join(args.resume, 'last.pth')
        assert os.path.isfile(
            resumePath), 'Error: no checkpoint found at {}'.format(resumePath)

        checkpoint = torch.load(resumePath)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optim_dict'])
        # resume from the last epoch
        start_epoch = checkpoint['epoch']
        scheduler.step(start_epoch - 1)
        if choose_accTop1:
            best_acc = checkpoint['test_accTop1']
        else:
            best_acc = checkpoint['test_accTop5']
        result_train_metrics = torch.load(
            os.path.join(args.resume, 'train_metrics'))
        result_test_metrics = torch.load(
            os.path.join(args.resume, 'test_metrics'))

    for epoch in range(start_epoch, args.num_epochs):

        scheduler.step()

        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, args.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train_metrics = train(train_loader, model, optimizer, criterion,
                              accuracy, args)

        writer.add_scalar('Train/Loss', train_metrics['train_loss'], epoch + 1)
        writer.add_scalar('Train/AccTop1', train_metrics['train_accTop1'],
                          epoch + 1)
        writer.add_scalar('Train/AccTop5', train_metrics['train_accTop5'],
                          epoch + 1)

        # Evaluate for one epoch on validation set
        test_metrics = evaluate(test_loader, model, criterion, accuracy, args)

        # Find the best accTop1 model.
        if choose_accTop1:
            test_acc = test_metrics['test_accTop1']
        else:
            test_acc = test_metrics['test_accTop5']

        writer.add_scalar('Test/Loss', test_metrics['test_loss'], epoch + 1)
        writer.add_scalar('Test/AccTop1', test_metrics['test_accTop1'],
                          epoch + 1)
        writer.add_scalar('Test/AccTop5', test_metrics['test_accTop5'],
                          epoch + 1)

        result_train_metrics[epoch] = train_metrics
        result_test_metrics[epoch] = test_metrics

        # Save latest train/test metrics
        torch.save(result_train_metrics,
                   os.path.join(model_dir, 'train_metrics'))
        torch.save(result_test_metrics, os.path.join(model_dir,
                                                     'test_metrics'))

        last_path = os.path.join(model_dir, 'last.pth')
        # Save latest model weights, optimizer and accuracy
        torch.save(
            {
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict(),
                'epoch': epoch + 1,
                'test_accTop1': test_metrics['test_accTop1'],
                'test_accTop5': test_metrics['test_accTop5']
            }, last_path)

        # If best_eval, best_save_path
        is_best = test_acc >= best_acc
        if is_best:
            logging.info("- Found better accuracy")
            best_acc = test_acc
            # Save best metrics in a json file in the model directory
            test_metrics['epoch'] = epoch + 1
            utils.save_dict_to_json(
                test_metrics, os.path.join(model_dir,
                                           "test_best_metrics.json"))

            # Save model and optimizer
            shutil.copyfile(last_path, os.path.join(model_dir, 'best.pth'))
    writer.close()
Example #4
        model = getattr(model_cfg, args.model)(num_classes=num_classes)

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model, device_ids=[0, 1, 2, 3]).to(device)
    else:
        model = model.to(device)

    num_params = (sum(p.numel() for p in model.parameters()) / 1000000.0)
    logging.info('Total params: %.2fM' % num_params)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    accuracy = utils.accuracy
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          nesterov=True,
                          weight_decay=args.wd)

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(args.num_epochs))
    train_and_evaluate(model, train_loader, test_loader, optimizer, criterion,
                       accuracy, model_dir, args)

    logging.info('Total time: {:.2f} minutes'.format(
        (time.time() - begin_time) / 60.0))
    state['Total params'] = num_params
    params_json_path = os.path.join(model_dir,
                                    "parameters.json")  # save parameters
    utils.save_dict_to_json(state, params_json_path)
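
Examples #3 and #4 pass utils.accuracy as the accuracy callable and log top-1/top-5 numbers. The standard top-k routine below is only a plausible stand-in; its exact signature and return format in the original utils module are assumptions.

# Plausible stand-in for utils.accuracy as used in Examples #3 and #4; the
# (output, target, topk) signature and percentage return values are assumptions.
import torch


def accuracy(output, target, topk=(1, 5)):
    """Return the precision@k of a batch for each k in topk, as percentages."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # indices of the maxk highest-scoring classes per sample
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append((correct_k * (100.0 / batch_size)).item())
        return res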
Example #5
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_Dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkdir=args.exp_dir)

# Write performance and args to json
prfs_name = os.path.basename(args.exp_dir) + '_prfs.json'
prfs_path = os.path.join(args.exp_dir, prfs_name)
with open(prfs_path, 'w') as fout:
    json.dump(output_dict, fout, indent=4)

#%% Test
if args.save_model:
    pth_dir = 'bioqa/exps/psci/list/test'
    utils.load_checkpoint(os.path.join(args.exp_dir, 'best.pth.tar'), model)
    test_scores = valid_fn_list(model, test_loader, tokenizer, device,
                                args.num_answer, args.ans_thres)

    save_path = os.path.join(args.exp_dir, "test_scores.json")
    utils.save_dict_to_json(test_scores, save_path)
    print(
        '[Test] loss: {0:.3f} | f1: {1:.2f}% | prec: {2:.2f}% | rec: {3:.2f}%\n'
        .format(test_scores['loss'], test_scores['f1'] * 100,
                test_scores['prec'] * 100, test_scores['rec'] * 100))

#%% plot
# utils.plot_prfs(prfs_path)
Example #6
def main_train_and_evaluate(model,
                            train_data,
                            val_data,
                            optimizer,
                            loss_fn,
                            params,
                            model_dir,
                            restore_file=None,
                            tb_writer=None,
                            device='cpu',
                            save_each_epoch=False,
                            evol_val=True):
    """
    Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data' and 'labels'
        val_data: (dict) validation data with keys 'data' and 'labels'
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        params: (Params) hyperparameters/arguments
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
        tb_writer: (SummaryWriter) tensorboard writer
        device: (string) cpu or cuda device
        save_each_epoch: (bool) save model parameters after each epoch if it's set True
        evol_val: (bool) if True, record the validation metrics of every epoch
    """
    if save_each_epoch:
        utils.save_checkpoint(
            {
                'epoch': 0,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=False,
            checkpoint=model_dir,
            save_last=False,
            save_each=save_each_epoch)
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        checkpoint_dict = utils.load_checkpoint(restore_path, model, optimizer)
        epoch_start_ind = checkpoint_dict['epoch']
    else:
        epoch_start_ind = 0
    if params.score_to_select == 'loss':
        best_val_score = np.inf
    else:
        # if accuracy (or another higher-is-better score) is used, start at 0.0
        # and compare with >= to find the best value
        best_val_score = 0.0
    if evol_val:
        prog_val = []
    for epoch in range(epoch_start_ind, params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))
        # compute number of batches in one epoch (one full pass over the training set)
        train_metrics_mean = train_model(model,
                                         optimizer,
                                         loss_fn,
                                         train_data,
                                         params,
                                         epoch=epoch,
                                         device=device)
        # Evaluate for one epoch on validation set
        print('starting to evaluate')
        val_metrics = evaluate(model, loss_fn, val_data, params, device=device)
        #
        val_score = val_metrics[params.score_to_select]
        if params.score_to_select == 'loss':
            is_best = val_score <= best_val_score
        else:
            is_best = val_score >= best_val_score
        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir,
            save_last=True,
            save_each=save_each_epoch)
        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_score = val_score
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)
        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
        if evol_val:
            prog_val.append(val_metrics)
        if tb_writer:
            tb_writer.add_scalars('Loss', {
                'train': train_metrics_mean['loss'],
                'val': val_metrics['loss']
            }, epoch)
            tb_writer.add_scalars('Val/nDCG', {
                key: val_metrics[key]
                for key in ['nDCG1', 'nDCG5', 'nDCG10']
            }, epoch)
            tb_writer.add_scalars(
                'Val/P',
                {key: val_metrics[key]
                 for key in ['P1', 'P5', 'P10']}, epoch)
            print('Epoch: {} | Validation loss: {}'.format(
                epoch, val_metrics['loss']),
                  flush=True)
        print('Validation loss: {}'.format(val_metrics['loss']), flush=True)
    if evol_val:
        pickle.dump(prog_val,
                    open(os.path.join(model_dir, 'val_metrics_s.pkl'), 'wb'))
    logging.info('done training and validation.')
def train_evaluate_model(word_attn, sent_attn, data_train, data_val,
                         word_optmizer, sent_optimizer, params, model_dir,
                         restore_file, vocab_to_index):
    # reload weights from restore_file if specified

    if restore_file is not None:
        restore_path = os.path.join(model_dir,
                                    restore_file + '.pth.tar')
        utils.log("Restoring parameters from {}".format(restore_path), logger)
        utils.load_checkpoint(restore_path,
                              word_attn,
                              word_optmizer,
                              spinn=True)
        utils.load_checkpoint(restore_path,
                              sent_attn,
                              sent_optimizer,
                              spinn=False)

    best_val_acc = 0.0
    i = 0

    for epoch in range(params.num_epochs):
        # Run one epoch
        utils.log("Epoch {}/{}".format(epoch + 1, params.num_epochs), logger)
        # compute number of batches in one epoch (one full pass over the training set)
        ### Run the model over one epoch

        num_steps = (params.train_size + 1) // params.batch_size
        print(num_steps)
        train_model(data_train, word_attn, sent_attn, word_optmizer,
                    sent_optimizer, params, num_steps, vocab_to_index)

        utils.log("-----Starting Evaluation-----", logger)
        num_steps = (params.val_size + 1) // params.batch_size
        val_metrics = evaluate(data_val, word_attn, sent_attn, params,
                               num_steps, vocab_to_index)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'Word_State_dict': word_attn.state_dict(),
                'Word_Optim_dict': word_optmizer.state_dict(),
                'Sent_State_dict': sent_attn.state_dict(),
                'Sent_Optim_dict': sent_optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        if is_best:
            utils.log("- Found new best accuracy", logger)
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
Example #8
            '[Valid] loss: {0:.3f} | f1: {1:.2f}% | prec: {2:.2f}% | rec: {3:.2f}%\n'
            .format(valid_scores['loss'], valid_scores['f1'] * 100,
                    valid_scores['prec'] * 100, valid_scores['rec'] * 100))

    # Update output dictionary
    output_dict['prfs']['train_' + str(epoch + 1)] = train_scores
    output_dict['prfs']['valid_' + str(epoch + 1)] = valid_scores

    # Save scores
    # if valid_scores['loss'] < min_valid_loss:
    #     min_valid_loss = valid_scores['loss']
    is_best = (valid_scores['f1'] > max_valid_f1)

    if is_best:
        max_valid_f1 = valid_scores['f1']
        utils.save_dict_to_json(
            valid_scores, os.path.join(args.exp_dir, 'best_val_scores.json'))

    # Save model
    if args.save_model:
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_Dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkdir=args.exp_dir)

    # Early stopping
    # if valid_scores['loss']-min_valid_loss > 0: # args.stop_c1) and (max_valid_f1-valid_scores['f1'] > args.stop_c2):
    #     n_worse += 1
Example #9
def train_and_evaluate(model,
                       trainloader,
                       validloader,
                       optimizer,
                       criterion,
                       metrics,
                       params,
                       model_dir,
                       restore_file=None):
    """Train the model and evaluate every epoch."""
    # Reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_valid_acc = 0

    if params.model_version == "resnet18":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=150,
                                                    gamma=0.1)
    elif params.model_version == "cnn":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=100,
                                                    gamma=0.2)

    for epoch in range(params.num_epochs):

        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        train(model, optimizer, criterion, trainloader, metrics, params)

        scheduler.step()

        valid_metrics = evaluate(model, criterion, validloader, metrics,
                                 params)

        valid_acc = valid_metrics['accuracy']
        is_best = valid_acc >= best_valid_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        if is_best:
            logging.info("- Found new best accuracy")
            best_valid_acc = valid_acc

            # Save best validation metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_valid_best_weights.json")
            utils.save_dict_to_json(valid_metrics, best_json_path)

        # Save latest valid metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_valid_last_weights.json")
        utils.save_dict_to_json(valid_metrics, last_json_path)


# if __name__ == '__main__':
#     # Load the parameters from json file
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--model_dir', default='experiments/baseline_standalone', help='Directory containing params.json')
#     parser.add_argument('--restore_file', default=None,
#                         help='Optional, name of the file in --model_dir containing weights to reload before training')  ## 'best' or 'train'
#     args = parser.parse_args()
#     json_path = os.path.join(args.model_dir, 'params.json')
#     assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
#     params = utils.Params(json_path)
#
#     # use GPU if available
#     params.cuda = torch.cuda.is_available()
#
#     # Set the random seed for reproducible experiments
#     random.seed(230)
#     torch.manual_seed(230)
#     if params.cuda: torch.cuda.manual_seed(230)
#
#     # Set the logger
#     utils.set_logger(os.path.join(args.model_dir, 'train.log'))
#
#     # Create the input data pipeline
#     logging.info("Loading the datasets...")
#
#     # fetch dataloaders, considering full-set vs. sub-set scenarios
#     if params.subset_percent < 1.0:
#         trainloader = datautils.fetch_subset_dataloader('train', params)
#     else:
#         trainloader = datautils.fetch_dataloader('train', params)
#
#     testloader = datautils.fetch_dataloader('test', params)
#
#     logging.info("- done.")
#
#     model = resnet.ResNet18().cuda() if params.cuda else resnet.ResNet18()
#     optimizer = optim.SGD(model.parameters(), lr=params.learning_rate, momentum=0.9, weight_decay=5e-4)
#     # fetch loss function and metrics
#     loss_fn = utils.loss_function
#     metrics = utils.metrics
#
#     # Train the model
#     logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
#     train_and_evaluate(model, trainloader, testloader, optimizer, loss_fn, metrics, params, args.model_dir, args.restore_file)
Example #10
        data = json.load(f)
        best_val_acc = data['accuracy']
        f.close()

for epoch in range(args.max_epochs):
    train(train_set, train_set2, model, args, 'train')
    val_acc = val(val_set, val_set2, model, args, 'val')
    val_metrics = {'accuracy': val_acc}
    is_best = val_acc >= best_val_acc

    utils.save_checkpoint({'epoch': epoch + 1,
                           'state_dict': model.state_dict(),
                           'optim_dict': optimizer.state_dict()}, is_best=is_best, checkpoint=args.model_dir)

    if is_best:
        logging.info('- Found new best accuracy')
        counter = 0  # reset counter
        best_val_acc = val_acc

        best_json_path = os.path.join(
            args.model_dir, 'val_best_weights.json')
        utils.save_dict_to_json(val_metrics, best_json_path)
    else:
        counter += 1

    if counter > patience:
        logging.info('- No improvement in a while, stopping training...')
        break
    last_json_path = os.path.join(
        args.model_dir, 'val_last_weights.json')
    utils.save_dict_to_json(val_metrics, last_json_path)
Example #11
def train_and_evaluate(model: nn.Module,
                       train_loader: DataLoader,
                       test_loader: DataLoader,
                       optimizer: optim, loss_fn,
                       params: utils.Params,
                       restore_file: str = None) -> None:
    '''Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    '''
    # reload weights from restore_file if specified
    restore_epoch = 0
    if restore_file is not None:
        restore_path = os.path.join(params.model_dir, restore_file + '.pth.tar')
        logger.info('Restoring parameters from {}'.format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        restore_epoch = int(restore_file[-2:].replace('_',''))+1
    logger.info('Restoring epoch: {}'.format(restore_epoch))
    logger.info('Begin training and evaluation')
    
    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=25, verbose=True, delta=0.0001, folder=params.model_dir)
    
    if os.path.exists(os.path.join(params.model_dir, 'metrics_test_best_weights.json')):
        with open(os.path.join(params.model_dir, 'metrics_test_best_weights.json')) as json_file:
            best_test_ND = json.load(json_file)['ND']
            early_stopping.best_score = best_test_ND
    else:
        best_test_ND = float('inf')
        early_stopping.best_score = best_test_ND
    
    train_len = len(train_loader)
    ND_summary = np.zeros(params.num_epochs)
    loss_summary = np.zeros((train_len * params.num_epochs))
    
    for epoch in range(restore_epoch, params.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(model, optimizer, loss_fn, train_loader,
                                                                        test_loader, params, epoch)
        test_metrics = evaluate(model, loss_fn, test_loader, params, epoch, sample=args.sampling)
#         if test_metrics['ND'] == float('nan'):
#             test_metrics['ND'] = 1000
#             print('NAN ')

#         elif test_metrics['ND'] == np.nan:
#             print('NAN ')
#             test_metrics['ND'] = 1000
        
        ND_summary[epoch] = test_metrics['ND']
        is_best = ND_summary[epoch] <= best_test_ND

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              epoch=epoch,
                              is_best=is_best,
                              checkpoint=params.model_dir)

        if is_best:
            logger.info('- Found new best ND')
            best_test_ND = ND_summary[epoch]
            best_json_path = os.path.join(params.model_dir, 'metrics_test_best_weights.json')
            utils.save_dict_to_json(test_metrics, best_json_path)

        logger.info('Current Best ND is: %.5f' % best_test_ND)

        utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND', params.plot_dir)
        utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', params.plot_dir)

        last_json_path = os.path.join(params.model_dir, 'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)
        
        
        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        logger.info('ND : %.5f ' % test_metrics['ND'])
        early_stopping(test_metrics['ND'], model)
        
        if early_stopping.early_stop:
            logger.info('Early stopping')
            break
        
#     # load the last checkpoint with the best model
#     model.load_state_dict(torch.load('checkpoint.pt'))

    if args.save_best:
        f = open('./param_search.txt', 'w')
        f.write('-----------\n')
        list_of_params = args.search_params.split(',')
        print_params = ''
        for param in list_of_params:
            param_value = getattr(params, param)
            print_params += f'{param}: {param_value:.2f} '
        print_params = print_params[:-1]  # drop the trailing space
        f.write(print_params + '\n')
        f.write('Best ND: ' + str(best_test_ND) + '\n')
        logger.info(print_params)
        logger.info(f'Best ND: {best_test_ND}')
        f.close()
        utils.plot_all_epoch(ND_summary, print_params + '_ND', location=params.plot_dir)
        utils.plot_all_epoch(loss_summary, print_params + '_loss', location=params.plot_dir)
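
Examples #11 and #12 construct EarlyStopping(patience=..., verbose=True, delta=..., folder=...) and call it with a lower-is-better metric, but the class itself is not shown. The sketch below matches only the visible interface (the constructor arguments, the __call__(metric, model) form, and the best_score / early_stop attributes); everything else is an assumption.

# Hedged sketch of the EarlyStopping helper used in Examples #11 and #12,
# treating the monitored metric (ND / rou50) as lower-is-better.
import os
import torch


class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0.0, folder='.'):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.folder = folder
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, metric, model):
        if self.best_score is None or metric < self.best_score - self.delta:
            # improvement: remember the score and checkpoint the model
            self.best_score = metric
            self.counter = 0
            torch.save(model.state_dict(), os.path.join(self.folder, 'checkpoint.pt'))
            if self.verbose:
                print(f'Validation metric improved to {metric:.6f}')
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True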
Example #12
def train_and_evaluate2(model: nn.Module, train_loader: DataLoader,
                        test_loader: DataLoader, optimizer: optim,
                        params: utils.Params, loss_fn=None,
                        restore_file=None, args=None, idx=None):
    '''Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    '''
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(params.model_dir,
                                    restore_file + '.pth.tar')
        logger.info('Restoring parameters from {}'.format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
    logger.info('begin training and evaluation')
    best_test_ND = float('inf')

    # File to save first results
    out_file = os.path.join(os.path.join('experiments', args.model_name),
                            'train_results.csv')
    if not os.path.isfile(out_file):
        of_connection = open(out_file, 'w')
        writer = csv.writer(of_connection)
        # Write the headers to the file
        writer.writerow(['iteration', 'epoch', 'test_metric', 'train_loss'])
        of_connection.close()

    train_len = len(train_loader)
    ND_summary = np.zeros(params.num_epochs)
    loss_summary = np.zeros((train_len * params.num_epochs))

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=5,
                                   verbose=True,
                                   delta=0.0001,
                                   folder=params.model_dir)

    for epoch in range(params.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, loss_fn, train_loader, test_loader, params,
            args.sampling, epoch)
        test_metrics = evaluate(model,
                                loss_fn,
                                test_loader,
                                params,
                                epoch,
                                sample=args.sampling)
        # NaN never compares equal to itself, so guard with np.isnan instead
        if np.isnan(test_metrics['rou50']):
            test_metrics['rou50'] = 100

        ND_summary[epoch] = test_metrics['rou50']
        is_best = ND_summary[epoch] <= best_test_ND

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': 0,  #epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            epoch=0,  # to prevent extra model savings
            is_best=is_best,
            checkpoint=params.model_dir)

        if is_best:
            logger.info('- Found new best ND')
            best_test_ND = ND_summary[epoch]
            best_json_path = os.path.join(params.model_dir,
                                          'metrics_test_best_weights.json')
            utils.save_dict_to_json(test_metrics, best_json_path)

        logger.info('Current Best loss is: %.5f' % best_test_ND)

        #if args.plot_figure:
        #    utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND', params.plot_dir)
        #    utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', params.plot_dir)

        last_json_path = os.path.join(params.model_dir,
                                      'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)
        # Write to the csv file ('a' means append); columns match the header
        # written above: iteration, epoch, test_metric, train_loss
        of_connection = open(out_file, 'a')
        writer = csv.writer(of_connection)
        writer.writerow([idx, epoch + 1, test_metrics['rou50'],
                         loss_summary[(epoch + 1) * train_len - 1]])
        of_connection.close()
        logger.info('Loss_summary: %s' %
                    loss_summary[epoch * train_len:(epoch + 1) * train_len])

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        logger.info('test_metrics[rou50]: %.5f ' % test_metrics['rou50'])
        early_stopping(test_metrics['rou50'], model)

        if early_stopping.early_stop:
            logger.info('Early stopping')
            break

    with open(best_json_path) as json_file:
        best_metrics = json.load(json_file)
    return best_metrics, test_metrics
Example #13
def train_and_evaluate(netG,
                       netD,
                       train_dataloader,
                       val_dataloader,
                       optimG,
                       optimD,
                       loss_fn,
                       metrics,
                       params,
                       model_dir,
                       restore_file=None,
                       cuda_id=0):
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path_g = os.path.join(model_dir, 'best_g' + '.pth.tar')
        restore_path_d = os.path.join(model_dir, 'best_d' + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path_g))
        utils.load_checkpoint(restore_path_g, netG, optimG)
        utils.load_checkpoint(restore_path_d, netD, optimD)

    best_val_acc = 0.0
    # note: a variant of train() also took logger and epoch as extra parameters (unused here)
    #     logger = Logger('./logs')

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(netG, netD, optimG, optimD, loss_fn, train_dataloader, metrics,
              params, cuda_id)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(netG, netD, loss_fn, val_dataloader, metrics,
                               params, cuda_id)
        #print ('after val --------')

        val_acc = val_metrics['PSNR']
        is_best = val_acc >= best_val_acc

        #Save weights
        # save G
        flag = 'G'
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': netG.state_dict(),
                'optim_dict': optimG.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir,
            flag=flag)
        flag = 'D'
        # save D
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': netD.state_dict(),
                'optim_dict': optimD.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir,
            flag=flag)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        if epoch % 100 == 0 and epoch > 99:
            plt.plot(global_loss_g)
            plt.savefig(str(epoch) + " epoch_g.jpg")
            plt.plot(global_loss_d)
            plt.savefig(str(epoch) + " epoch_d.jpg")

    plt.plot(global_loss_g)
    plt.savefig("final loss_g.jpg")
    plt.plot(global_loss_d)
    plt.savefig("final loss_d.jpg")
Example #14
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       loss_fn,
                       metrics,
                       params,
                       model_dir,
                       restore_file=None,
                       cuda_id=0):

    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir,
                                    restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0
    # train() takes two extra parameters here: a logger and the epoch index
    logger = Logger('./logs')
    # note: gamma=1 means this StepLR never changes the learning rate, and its
    # step() call in the loop below is commented out anyway
    scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=1)

    for epoch in range(params.num_epochs):
        # Run one epoch
        #         scheduler.step()
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params,
              logger, epoch, cuda_id)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params,
                               cuda_id)

        val_acc = val_metrics['PSNR']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

    plt.plot(global_loss)
    plt.savefig("final loss.jpg")
Example #15
def train_evaluate(model,
                   train_iterator,
                   valid_iterator,
                   criterion,
                   optimizer,
                   metrics,
                   args,
                   restore_file=None):
    """
    
    """
    if not os.path.exists(args.exp_dir):
        os.makedirs(args.exp_dir)

    if restore_file is not None:
        restore_path = os.path.join(args.exp_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}...".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    # For early stopping
    n_worse = 0
    min_valid_loss = float('inf')
    max_valid_f1 = -float('inf')

    # Create args and output dictionary (for json output)
    output_dict = {'args': vars(args), 'prfs': {}}

    for epoch in range(args.num_epochs):

        train_scores = train(model, train_iterator, criterion, optimizer,
                             metrics, args.threshold)
        valid_scores = evaluate(model, valid_iterator, criterion, metrics,
                                args.threshold)

        # Update output dictionary
        output_dict['prfs']['train_' + str(epoch + 1)] = train_scores
        output_dict['prfs']['valid_' + str(epoch + 1)] = valid_scores

        # Save scores
        if valid_scores['loss'] < min_valid_loss:
            min_valid_loss = valid_scores['loss']
        if valid_scores['f1'] > max_valid_f1:
            max_valid_f1 = valid_scores['f1']

        is_best = (valid_scores['loss'] - min_valid_loss <= args.stop_c1) and (
            max_valid_f1 - valid_scores['f1'] <= args.stop_c2)
        if is_best:
            utils.save_dict_to_json(
                valid_scores, os.path.join(args.exp_dir,
                                           'best_val_scores.json'))

        # Save model
        if args.save_model:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optim_Dict': optimizer.state_dict()
                },
                is_best=is_best,
                checkdir=args.exp_dir)

        # Save the latest valid scores in exp_dir
        # utils.save_dict_to_json(valid_scores, os.path.join(exp_dir, 'last_val_scores.json'))

        print("\n\nEpoch {}/{}...".format(epoch + 1, args.num_epochs))
        print(
            '\n[Train] loss: {0:.3f} | acc: {1:.2f}% | f1: {2:.2f}% | recall: {3:.2f}% | precision: {4:.2f}% | specificity: {5:.2f}%'
            .format(train_scores['loss'], train_scores['accuracy'] * 100,
                    train_scores['f1'] * 100, train_scores['recall'] * 100,
                    train_scores['precision'] * 100,
                    train_scores['specificity'] * 100))
        print(
            '[Val] loss: {0:.3f} | acc: {1:.2f}% | f1: {2:.2f}% | recall: {3:.2f}% | precision: {4:.2f}% | specificity: {5:.2f}%\n'
            .format(valid_scores['loss'], valid_scores['accuracy'] * 100,
                    valid_scores['f1'] * 100, valid_scores['recall'] * 100,
                    valid_scores['precision'] * 100,
                    valid_scores['specificity'] * 100))

        # Early stopping
        if (valid_scores['loss'] - min_valid_loss > args.stop_c1) and (
                max_valid_f1 - valid_scores['f1'] > args.stop_c2):
            n_worse += 1
        if n_worse == args.stop_p:
            print("Early stopping")
            break

    # Write performance and args to json
    prfs_name = os.path.basename(args.exp_dir) + '_prfs.json'
    prfs_path = os.path.join(args.exp_dir, prfs_name)
    with open(prfs_path, 'w') as fout:
        json.dump(output_dict, fout, indent=4)
Example #16
def train_and_evaluate_kd(model, teacher_model, trainloader, validloader, optimizer, criterion_kd, metrics, params, model_dir, restore_file=None):
    """
    Train the model and evaluate every epoch
    :param model: (torch.nn.Module) the neural network
    :param teacher_model: (Params) hyperparameters
    :param model_dir: (string) directory containing config, weights and log
    :param restore_file: (string) - file to restore (without its extension .ptr.tar)
    """
    # Reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_valid_acc = 0.0

    # TensorBoard logger setup
    # board_logger = utils.Board_logger(os.path.join(model_dir, 'board_logs'))

    # Learning rate schedulers for different models:
    if params.model_version == "resnet18_distill":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.1)
    elif params.model_version == "cnn_distill":
        # For cnn models, num_epochs is always < 100, so step_size=100 means the
        # learning rate is effectively never decayed
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

    for epoch in range(params.num_epochs):
        scheduler.step()

        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch+1, params.num_epochs))

        # Compute number of batches in one epoch (one full pass over the training set)
        train_kd(model, teacher_model, optimizer, criterion_kd, trainloader, metrics, params)

        # Evaluate for one epoch on validation set
        valid_metrics = evaluate_kd(model, validloader, metrics, params)

        valid_acc = valid_metrics['accuracy']
        is_best = valid_acc >= best_valid_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                         'state_dict': model.state_dict(),
                         'optim_dict': optimizer.state_dict()}, is_best=is_best, checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_valid_acc = valid_acc

            # Save best valid metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_valid_best_weights.json")
            utils.save_dict_to_json(valid_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_valid_last_weights.json")
        utils.save_dict_to_json(valid_metrics, last_json_path)

        #============ TensorBoard logging: uncomment below to turn in on ============#
        # # (1) Log the scalar values
        # info = {
        #     'valid accuracy': valid_acc
        # }

        # for tag, value in info.items():
        #     board_logger.scalar_summary(tag, value, epoch+1)

        # # (2) Log values and gradients of the parameters (histogram)
        # for tag, value in model.named_parameters():
        #     tag = tag.replace(',', '/')
        #     board_logger.histo_summary(tag, value.data.cpu().numpy(), epoch+1)
        #     # board_logger.histo_summary(tag+'/grad', value.grad.data.cpu().numpy(), epoch+1)
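
Example #16 trains a student against a teacher via a criterion_kd whose definition is not shown. A common choice for such a criterion, given here only as an assumption, is the Hinton-style distillation loss that mixes a temperature-softened KL term with the ordinary cross-entropy; params.alpha and params.temperature are assumed hyperparameters, not taken from the original code.

# One plausible criterion_kd for Example #16: Hinton-style knowledge distillation.
import torch.nn as nn
import torch.nn.functional as F


def loss_fn_kd(outputs, labels, teacher_outputs, params):
    """Mix KL divergence on temperature-softened logits with cross-entropy on hard labels."""
    alpha = params.alpha
    T = params.temperature
    soft_loss = nn.KLDivLoss(reduction='batchmean')(
        F.log_softmax(outputs / T, dim=1),
        F.softmax(teacher_outputs / T, dim=1)) * (alpha * T * T)
    hard_loss = F.cross_entropy(outputs, labels) * (1.0 - alpha)
    return soft_loss + hard_loss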
Example #17
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       critierion,
                       metrics,
                       params,
                       model_dir,
                       restore_file=None):
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir,
                                    restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0
    best_val_metrics = []
    learning_rate_0 = params.learning_rate
    train_acc_series = []
    val_acc_series = []
    train_loss_series = []

    for epoch in range(params.num_epochs):
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # train model
        train_metrics = train(model, train_dataloader, optimizer, critierion,
                              metrics, params)

        # learning rate exponential decay (train() is assumed to re-read
        # params.learning_rate each epoch; see the sketch after this example
        # for applying the decayed rate to the optimizer directly)
        params.learning_rate = learning_rate_0 * np.exp(
            -params.exp_decay_k * epoch)

        # evaluate
        val_metrics = evaluate(model, critierion, val_dataloader, metrics,
                               params)

        # find accuracy from validation dataset
        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # save accuracy / loss to array for plot
        train_acc_series.append(train_metrics['accuracy'])
        val_acc_series.append(val_metrics['accuracy'])
        train_loss_series.append(train_metrics['loss'])

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc
            best_val_metrics = val_metrics

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
        print('******************************************')

    # plot visualized performance
    visualize.plot_train_val_accuracy(train_acc_series, val_acc_series)
    visualize.plot_loss(train_loss_series)
    # save best validation F1 score plot
    visualize.plot_individual_label_f1score(best_val_metrics)
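
Example #17 decays params.learning_rate every epoch but never writes the new value into the optimizer, and whether its train() re-reads the parameter is not shown here. If it does not, the decayed rate has to be pushed into the optimizer's parameter groups explicitly, as in this assumed helper:

# Assumed follow-up to the exponential decay in Example #17: write the decayed
# rate into the optimizer's parameter groups so it actually takes effect.
import numpy as np


def apply_exp_decay(optimizer, lr0, exp_decay_k, epoch):
    """Set every param group's lr to lr0 * exp(-exp_decay_k * epoch) and return it."""
    new_lr = lr0 * np.exp(-exp_decay_k * epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return new_lr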