Example #1
def main():
    # Set the random seed for reproducible experiments
    torch.manual_seed(230)

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help="Directory containing the dataset")
    parser.add_argument('--model_dir', help="Directory containing params.json")
    parser.add_argument('--params', help="Path to the params.json file with hyperparameters")
    parser.add_argument('--restore_file',
                        default='best',
                        help="name of the file in --model_dir \
                         containing weights to load")
    args = parser.parse_args()

    params = utils.Params(args.params)

    # Get the logger
    utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Creating the dataset...")

    test_dataset = dataset(file_path=params.metadata_file,
                           split="Test",
                           classes=params.classes)

    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=params.batch_size,
                             shuffle=True,
                             num_workers=8)

    logging.info("- done.")

    # Define the model and optimizer
    if model != "Inception":
        net = importlib.import_module("features.models.{}".format(
            params.model))
        model = net.Net()
        inception = False
    else:
        model = models.inception_v3(pretrained=False)
        model.fc = nn.Linear(2048, num_classes)
        model.AuxLogits.fc = nn.Linear(768, 1)
        inception = True

    model.cuda()

    metrics_save = metrics_code.metrics_save

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(
        os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model)

    # Evaluate
    test_metrics = evaluate(model, test_loader, metrics_save, experiment,
                            inception)
    save_path = os.path.join(args.model_dir,
                             "metrics_test_{}.json".format(args.restore_file))
    utils.save_dict_to_json(test_metrics, save_path)
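
Example #1 reads its hyperparameters through utils.Params(args.params) and then refers to them as params.batch_size, params.model, and so on. A minimal sketch of such a Params class, assuming it simply loads a json file and exposes the keys as attributes (the project's real helper may add save/update methods):

import json

class Params:
    """Load hyperparameters from a json file and expose them as attributes."""

    def __init__(self, json_path):
        with open(json_path) as f:
            # every top-level key becomes an attribute, e.g. params.batch_size
            self.__dict__.update(json.load(f))
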
Example #2
    def test(self, restore_from):
        """Test the model
        Args:
            restore_from: (string) directory or file containing weights to restore the graph
        """
        hp = self.hp
        experiment_dir = hp.experiment_dir

        split = DatasetSplit.TEST
        model_spec = self._get_model_spec(split)

        # Initialize tf.Saver
        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Initialize the model variables
            sess.run(model_spec['variable_init_op'])

            # Reload weights from the weights subdirectory
            save_path = os.path.join(experiment_dir, restore_from)
            if os.path.isdir(save_path):
                save_path = tf.train.latest_checkpoint(save_path)
            saver.restore(sess, save_path)

            # Evaluate
            num_steps = (hp.test_size + hp.batch_size - 1) // hp.batch_size
            metrics = self.evaluate_epoch(sess, model_spec, num_steps)

            loss_string, acc_string = self.metrics_string(metrics)
            tf.logging.info("- Test metrics: " + acc_string)
            tf.logging.info("- Test metrics: " + loss_string)

            metrics_name = '_'.join(restore_from.split('/'))
            save_path = os.path.join(
                experiment_dir, "metrics_test_{}.json".format(metrics_name))
            utils.save_dict_to_json(metrics, save_path)
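
Example #2 formats its metrics dict with self.metrics_string(metrics) before logging (Example #13 below does the same). A stand-alone sketch of what such a helper might do, assuming it splits the dict into one string for losses and one for the remaining metrics (names and formatting are assumptions):

def metrics_string(metrics):
    """Return (loss_string, acc_string) built from a flat metrics dict."""
    loss_string = " ; ".join("{}: {:05.3f}".format(k, v)
                             for k, v in metrics.items() if 'loss' in k)
    acc_string = " ; ".join("{}: {:05.3f}".format(k, v)
                            for k, v in metrics.items() if 'loss' not in k)
    return loss_string, acc_string
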
Example #3
    # Get the logger
    utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Creating the dataset...")

    # fetch dataloaders
    dataloaders = data_loader.fetch_data_loader(['val'], args.data_dir, params)
    test_dl = dataloaders['val']

    logging.info("getting the test dataloader - done.")

    # Define the model
    model = PhdGifNet().cuda() if params.cuda else PhdGifNet()

    # loss_fn and metrics are assumed to be defined at module level alongside
    # the model code imported above (e.g. in the same net/model module)

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(
        os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model)

    # Evaluate
    test_metrics = evaluate(model, loss_fn, test_dl, metrics, params)
    save_path = os.path.join(args.model_dir,
                             "metrics_test_{}.json".format(args.restore_file))
    utils.save_dict_to_json(test_metrics, save_path)
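
The PyTorch examples restore weights through utils.load_checkpoint(path, model, optimizer). A minimal sketch, assuming the checkpoint is a dict with 'state_dict' and 'optim_dict' keys (matching the utils.save_checkpoint calls further below) and that the raw dict is returned so callers can read extra fields such as 'epoch'; some examples use a different signature, so this is an illustration rather than the exact helper:

import os
import torch

def load_checkpoint(checkpoint_path, model, optimizer=None):
    """Load model (and optionally optimizer) state from checkpoint_path."""
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError("File doesn't exist {}".format(checkpoint_path))
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    return checkpoint
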
Example #4
def train_and_evaluate(model,
                       ad_net,
                       grl,
                       ad_net_m,
                       grl_m,
                       Myacc,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       loss_fn,
                       metrics,
                       params,
                       model_dir,
                       logger,
                       restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) - name of file to restore from (without its extension .pth.tar)
    """

    best_val_acc = 0.0
    # reload weights from restore_file if specified
    #if args.finetune:
    #    num_ftrs = model.fc8.in_features
    #    model.fc8 = nn.Linear(num_ftrs, 60)
    #    model = model.cuda()
    #logger.info(model)
    if restore_file is not None:
        logging.info("Restoring parameters from {}".format(restore_file))
        checkpoint = utils.load_checkpoint(restore_file, model, optimizer)
        params.start_epoch = checkpoint['epoch']

        best_val_acc = checkpoint['best_val_acc']
        print('best_val_acc=', best_val_acc)
        print(optimizer.state_dict()['param_groups'][0]['lr'],
              checkpoint['epoch'])

    # learning rate schedulers for different models:
    if params.lr_decay_type is None:
        logging.info("no lr decay")
    else:
        assert params.lr_decay_type in ['multistep', 'exp', 'plateau']
        logging.info("lr decay:{}".format(params.lr_decay_type))
    if params.lr_decay_type == 'multistep':
        scheduler = MultiStepLR(optimizer,
                                milestones=params.lr_step,
                                gamma=params.scheduler_gamma,
                                last_epoch=params.start_epoch - 1)

    elif params.lr_decay_type == 'exp':
        scheduler = ExponentialLR(optimizer,
                                  gamma=params.scheduler_gamma2,
                                  last_epoch=params.start_epoch - 1)
    elif params.lr_decay_type == 'plateau':
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='min',
                                      factor=params.scheduler_gamma3,
                                      patience=params.patience,
                                      verbose=False,
                                      threshold=0.0001,
                                      threshold_mode='rel',
                                      cooldown=0,
                                      min_lr=0,
                                      eps=1e-08)
    if args.finetune:
        num_ftrs = model.fc8.in_features
        model.fc8 = nn.Linear(num_ftrs, 30)
        model = model.cuda()
    logger.info(model)
    for epoch in range(params.start_epoch, params.num_epochs):
        params.current_epoch = epoch
        if params.lr_decay_type != 'plateau':
            scheduler.step()

        # Run one epoch
        logger.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train_metrics, train_confusion_meter = train(model, ad_net, grl,
                                                     ad_net_m, grl_m,
                                                     optimizer, loss_fn,
                                                     train_dataloader, metrics,
                                                     params, logger)

        # Evaluate for one epoch on validation set
        val_metrics, val_confusion_meter = evaluate(model, loss_fn,
                                                    val_dataloader, metrics,
                                                    params, logger)

        # visdom logger: top-1/top-5 error rates (in %) for train and val
        accs = [
            100. * (1 - train_metrics['accuracytop1']),
            100. * (1 - train_metrics['accuracytop5']),
            100. * (1 - val_metrics['accuracytop1']),
            100. * (1 - val_metrics['accuracytop5']),
        ]
        error_logger15.log([epoch] * 4, accs)
        Myacc.append(100. * (1 - val_metrics['accuracytop1']))

        losses = [train_metrics['loss'], val_metrics['loss']]
        loss_logger.log([epoch] * 2, losses)
        train_confusion_logger.log(train_confusion_meter.value())
        test_confusion_logger.log(val_confusion_meter.value())

        # log split loss
        if epoch == params.start_epoch:
            # collect the per-component loss keys (those containing 'ls') once
            loss_key_train = [k for k in train_metrics if 'ls' in k]
            loss_key_val = [k for k in val_metrics if 'ls' in k]
            loss_split_key = ['train_' + k for k in loss_key_train
                              ] + ['val_' + k for k in loss_key_val]
            loss_logger_split.opts['legend'] = loss_split_key

        loss_split = [train_metrics[k] for k in loss_key_train
                      ] + [val_metrics[k] for k in loss_key_val]
        loss_logger_split.log([epoch] * len(loss_split_key), loss_split)

        if params.lr_decay_type == 'plateau':
            scheduler.step(val_metrics['ls_all'])

        val_acc = val_metrics['accuracytop1']
        is_best = val_acc >= best_val_acc
        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict(),
                'best_val_acc': best_val_acc
            },
            epoch=epoch + 1,
            is_best=is_best,
            save_best_ever_n_epoch=params.save_best_ever_n_epoch,
            checkpointpath=params.experiment_path + '/checkpoint',
            start_epoch=params.start_epoch)

        val_metrics['best_epoch'] = epoch + 1
        # If best_eval, best_save_path, metric
        if is_best:
            logger.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(params.experiment_path,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(params.experiment_path,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
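
Example #4 (and most of the PyTorch examples) saves weights with utils.save_checkpoint(state, is_best=..., ...). A sketch of the common variant that takes a checkpoint directory, assuming it writes 'last.pth.tar' and copies it to 'best.pth.tar' when the epoch is the best so far; the variants above that take extra arguments such as epoch or save_best_ever_n_epoch are not covered:

import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint):
    """Save `state` to <checkpoint>/last.pth.tar and copy it to best.pth.tar if is_best."""
    os.makedirs(checkpoint, exist_ok=True)
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))
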
Example #5
def train_and_evaluate(model_source, model_target, transfer, train_dl, val_dl_source, val_dl_target, opt, loss_fn, metrics, params,
                       lr_scheduler, checkpoint_dir, ckpt_filename, log_dir, writer):

    ckpt_file_path = os.path.join(checkpoint_dir, ckpt_filename)
    best_value = -float('inf')
    early_stopping = utils.EarlyStopping(patience=10, verbose=True)
    start_epoch = 0

    batch_sample_source, batch_gt_source = next(iter(val_dl_source))
    batch_sample_target, batch_gt_target = next(iter(val_dl_target))

    if os.path.exists(ckpt_file_path):
        transfer, opt, lr_scheduler, start_epoch, best_value = utils.load_checkpoint(transfer, opt, lr_scheduler,
                                                                start_epoch, False, best_value, checkpoint_dir, ckpt_filename)
        print("=> loaded transfer checkpoint from {} (epoch {})".format(
            ckpt_file_path, start_epoch))
    else:
        print("=> Initializing from scratch")

    source_encoder = model_source.backbone
    target_encoder = model_target.backbone
    target_decoder = model_target.classifier

    adpative_model = get_adaptive_network(source_encoder, transfer, target_decoder)

    for epoch in range(start_epoch, params.num_epochs):
        # Run one epoch
        current_lr = get_lr(opt)
        logging.info('Epoch {}/{}, current lr={}'.format(epoch, params.num_epochs-1, current_lr))
        writer.add_scalar('Learning_rate', current_lr, epoch)

        transfer.train()
        train_loss, train_metrics = train_epoch(
            source_encoder, target_encoder, transfer, loss_fn, train_dl, opt, lr_scheduler, params=params)
    
        transfer.eval()
        val_loss_source, _ = train_epoch(
            source_encoder, target_encoder, transfer, loss_fn, val_dl_source, params=params)

        # Evaluate for one epoch on validation set
        _, val_metrics_source = evaluate(
            adpative_model, None, val_dl_source, metrics=metrics, params=params)

        _, val_metrics_target = evaluate(
            adpative_model, None, val_dl_target, metrics=metrics, params=params)

        writer.add_scalars('Loss', {
            'Training': train_loss,
            'Validation': val_loss_source,
        }, epoch)

        for (val_metric_name_s, val_metric_results_s), (val_metric_name_t, val_metric_results_t) in zip(val_metrics_source.items(), val_metrics_target.items()):
            writer.add_scalars(val_metric_name_s, {
                'Validation_source': val_metric_results_s[0],
                'Validation_target': val_metric_results_t[0],
            }, epoch)

        if epoch % 5 == 0 or epoch==params.num_epochs-1:
            predictions = inference(adpative_model, batch_sample_source)
            plot = train_dl.dataset.get_predictions_plot(
                batch_sample_source, predictions.cpu(), batch_gt_source)
            writer.add_image('Predictions_source', plot, epoch, dataformats='HWC')

            predictions = inference(adpative_model, batch_sample_target)
            plot = train_dl.dataset.get_predictions_plot(
                batch_sample_target, predictions.cpu(), batch_gt_target)
            writer.add_image('Predictions_target', plot, epoch, dataformats='HWC')

        current_value = list(val_metrics_source.values())[0][0]
        is_best = current_value >= best_value

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_value = current_value
            # Save best source and target val metrics in separate json files so the
            # second save does not overwrite the first (file names chosen here; the
            # original wrote both dicts to the same path)
            utils.save_dict_to_json(val_metrics_source,
                                    os.path.join(log_dir, "metrics_val_source_best_weights.json"))
            utils.save_dict_to_json(val_metrics_target,
                                    os.path.join(log_dir, "metrics_val_target_best_weights.json"))

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': transfer.state_dict(),
                               'optim_dict': opt.state_dict(),
                               'scheduler_dict': lr_scheduler.state_dict(),
                               'best_value': best_value},
                              is_best=is_best,
                              ckpt_dir=checkpoint_dir,
                              filename=ckpt_filename)

        logging.info("\ntrain loss: %.3f, val loss: %.3f" %
                     (train_loss, val_loss_source))
        
        for (val_metric_name_s, val_metric_results_s), (val_metric_name_t, val_metric_results_t) in zip(val_metrics_source.items(), val_metrics_target.items()):
            logging.info("source %s: %.3f, target %s: %.3f" % (val_metric_name_s, val_metric_results_s[0], val_metric_name_t, val_metric_results_t[0]))
        logging.info("-"*20)

        early_stopping(val_loss_source)
        if early_stopping.early_stop:
            logging.info("Early stopping")
            break
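
The transfer-learning example above stops training once the source validation loss stalls, using utils.EarlyStopping(patience=10, verbose=True). A sketch of such a helper, assuming the instance is called once per epoch with the validation loss and raises its early_stop flag after `patience` epochs without improvement (the real implementation may also checkpoint the best model):

class EarlyStopping:
    """Flag early_stop after `patience` epochs without a lower validation loss."""

    def __init__(self, patience=10, verbose=False, delta=0.0):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None or val_loss < self.best_loss - self.delta:
            # validation loss improved: remember it and reset the counter
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print("EarlyStopping counter: {}/{}".format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
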
Example #6
        torch.cuda.manual_seed(seed)

    # fetch dataloaders
    val_dl = dataloader.fetch_dataloader(args.data_dir, args.txt_val, "val",
                                         params)
    # Define the model
    model = get_network(params).to(params.device)

    #num_classes+1 for background.
    metrics = OrderedDict({})
    for metric in params.metrics:
        metrics[metric] = get_metrics(metric, params)

    # Reload weights from the saved file
    model = utils.load_checkpoint(model,
                                  is_best=True,
                                  checkpoint_dir=args.checkpoint_dir)[0]

    # Evaluate
    eval_loss, val_metrics = evaluate(model,
                                      val_dl,
                                      loss_fn=None,
                                      metrics=metrics,
                                      params=params)

    best_json_path = os.path.join(args.model_dir, "logs/evaluation.json")
    for val_metric_name, val_metric_results in val_metrics.items():
        print("{}: {}".format(val_metric_name, val_metric_results))
    utils.save_dict_to_json(val_metrics, best_json_path)
Example #7
def train_and_evaluate(model_source, model_target, transfer, train_dl_all,
                       val_dl_all, val_dl_target, opt1, opt2, opt3, loss_fn1,
                       loss_fn2, metrics_depth, metrics_segmentation, params,
                       lr_scheduler1, lr_scheduler2, lr_scheduler3,
                       checkpoint_dir_source, checkpoint_dir_target,
                       checkpoint_dir_transfer, ckpt_filename, log_dir,
                       writer):

    ckpt_file_path = os.path.join(checkpoint_dir_transfer, ckpt_filename)
    best_value = -float('inf')
    start_epoch = 0

    batch_sample_carla, batch_gt_carla_sem, batch_gt_carla_depth, _, _ = next(
        iter(val_dl_all))
    batch_sample_cs, batch_gt_cs = next(iter(val_dl_target))

    if os.path.exists(ckpt_file_path):
        transfer, opt3, lr_scheduler3, start_epoch, best_value = utils.load_checkpoint(
            transfer, opt3, lr_scheduler3, start_epoch, False, best_value,
            checkpoint_dir_transfer, ckpt_filename)
        print("=> loaded transfer checkpoint form {} (epoch {})".format(
            ckpt_file_path, start_epoch))
    else:
        print("=> Initializing transfer from scratch")

    source_encoder = model_source.backbone
    target_decoder = model_target.classifier
    adpative_model = get_adaptive_network(source_encoder, transfer,
                                          target_decoder)

    for epoch in range(start_epoch, params.num_epochs):
        # Run one epoch
        current_lr = get_lr(opt3)
        logging.info('Epoch {}/{}, current lr={}'.format(
            epoch, params.num_epochs - 1, current_lr))
        writer.add_scalar('Learning_rate', current_lr, epoch)

        transfer.train()
        train_loss_depth, train_loss_segmentation = train_epoch(
            model_source, model_target, transfer, train_dl_all, opt1, opt2,
            opt3, loss_fn1, loss_fn2, params, lr_scheduler1, lr_scheduler2,
            lr_scheduler3)

        writer.add_scalars(
            'Losses', {
                'Training_depth': train_loss_depth,
                'Training_segmentation': train_loss_segmentation,
            }, epoch)

        # if epoch % 5 == 0 or epoch==params.num_epochs-1:
        predictions_sem = inference(model_target, batch_sample_carla)
        predictions_depth = inference(model_source, batch_sample_carla)

        plot = train_dl_all.dataset.get_predictions_plot(
            batch_sample_carla, predictions_sem.cpu(),
            batch_gt_carla_sem.cpu(), predictions_depth.cpu(),
            batch_gt_carla_depth.cpu())
        writer.add_image('Predictions_carla', plot, epoch, dataformats='HWC')

        predictions = inference(adpative_model, batch_sample_cs)
        plot = val_dl_target.dataset.dataset.get_predictions_plot(
            batch_sample_cs, predictions.cpu(), batch_gt_cs)
        writer.add_image('Predictions_target', plot, epoch, dataformats='HWC')

        val_metrics_depth, val_metrics_segmentation = evaluate_source(
            model_source, model_target, val_dl_all, metrics_depth,
            metrics_segmentation, params)

        _, val_metrics_transfer = evaluate(adpative_model,
                                           None,
                                           val_dl_target,
                                           metrics=metrics_segmentation,
                                           params=params)

        for (val_metric_name, val_metric_results) in val_metrics_depth.items():
            writer.add_scalar(val_metric_name, val_metric_results[0], epoch)

        for (val_metric_name,
             val_metric_results) in val_metrics_segmentation.items():
            writer.add_scalar(val_metric_name + '_target',
                              val_metric_results[0], epoch)

        for (val_metric_name,
             val_metric_results) in val_metrics_transfer.items():
            writer.add_scalar(val_metric_name + '_transfer',
                              val_metric_results[0], epoch)

        current_value = list(val_metrics_transfer.values())[0][0]
        is_best = current_value >= best_value

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best value")
            best_value = current_value
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(log_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics_transfer, best_json_path)

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model_source.state_dict(),
                'optim_dict': opt1.state_dict(),
                'scheduler_dict': lr_scheduler1.state_dict(),
                'best_value': best_value
            },
            is_best=is_best,
            ckpt_dir=checkpoint_dir_source,
            filename=ckpt_filename)

        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model_target.state_dict(),
                'optim_dict': opt2.state_dict(),
                'scheduler_dict': lr_scheduler2.state_dict(),
                'best_value': best_value
            },
            is_best=is_best,
            ckpt_dir=checkpoint_dir_target,
            filename=ckpt_filename)

        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': transfer.state_dict(),
                'optim_dict': opt3.state_dict(),
                'scheduler_dict': lr_scheduler3.state_dict(),
                'best_value': best_value
            },
            is_best=is_best,
            ckpt_dir=checkpoint_dir_transfer,
            filename=ckpt_filename)

        logging.info(
            "\ntrain loss depth: %.3f, train loss segmentation: %.3f" %
            (train_loss_depth, train_loss_segmentation))

        for (val_metric_name, val_metric_results) in val_metrics_depth.items():
            logging.info("val depth %s: %.3f" %
                         (val_metric_name, val_metric_results[0]))

        for (val_metric_name,
             val_metric_results) in val_metrics_segmentation.items():
            logging.info("val segmentation target %s: %.3f" %
                         (val_metric_name, val_metric_results[0]))

        for (val_metric_name,
             val_metric_results) in val_metrics_transfer.items():
            logging.info("val segmentation transfer %s: %.3f" %
                         (val_metric_name, val_metric_results[0]))

        logging.info("-" * 20)
Example #8
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       metrics_save,
                       model_dir,
                       num_epochs,
                       loss_func,
                       optimizer,
                       learning_rate,
                       decay,
                       save_summary_steps,
                       experiment=None,
                       inception=False,
                       restore_file=None):
    """
    Train the model and evaluate every epoch.

    model: (torch.nn.Module) the neural network
    train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that
                        fetches training data
    val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that
                        fetches validation data
    metrics_save: (dict) a dictionary of functions that compute a metric using
                        the output and labels of each batch
    model_dir: (string) directory containing config, weights and log
    restore_file: (string) optional- name of file to restore from (without its
                        extension .pth.tar)
    """

    # the loss function and optimizer arrive as config strings; build the real
    # objects first, since the optimizer is needed if a checkpoint is restored
    loss_func = eval(loss_func)
    optimizer = eval(optimizer)

    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0
    best_val_auc = 0.0

    for epoch in range(num_epochs):

        logging.info("Epoch {}/{}".format(epoch + 1, num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, train_dataloader, metrics_save, loss_func, optimizer,
              save_summary_steps, experiment, inception)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, val_dataloader, metrics_save, loss_func,
                               experiment, inception)
        val_acc = val_metrics['test_accuracy']
        val_auc = val_metrics['test_AUC']

        is_best = val_acc >= best_val_acc and val_auc >= best_val_auc

        # If best_eval and auc, best_save_path
        if is_best:
            logging.info("- Found new best accuracy or auc")
            best_val_acc = val_acc
            best_val_auc = val_auc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
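
Example #8 builds its loss function and optimizer by eval()-ing configuration strings inside the training function, where names such as model, learning_rate and decay are in scope. Hypothetical values for those strings (illustrative only, not taken from the original project):

loss_func = "nn.BCEWithLogitsLoss()"
optimizer = "optim.Adam(model.parameters(), lr=learning_rate, weight_decay=decay)"
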
Example #9
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       loss_fns,
                       scheduler,
                       evaluator,
                       writer,
                       params,
                       model_dir,
                       name,
                       restore_file=None):
    if restore_file is not None:
        # `name` is assumed to match the prefix that save_checkpoint(..., name=name)
        # uses below when writing checkpoints
        restore_path = os.path.join(model_dir,
                                    name + '_' + restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, train_dataloader, optimizer, loss_fns, scheduler,
              evaluator, writer, epoch, params)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, val_dataloader, loss_fns, evaluator,
                               writer, epoch, params)

        val_acc = val_metrics['mIOU']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir,
            name=name)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
Example #10
    def train_and_eval(self, params, restore_from=None):
        """Train the model and evaluate every epoch.
        Args:
            train_model_spec: (dict) contains the graph operations or nodes needed for training
            params: (Params) contains hyperparameters of the model.
                    Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps
            train_ds: training dataset
            eval_ds: evaluation dataset
            log_dir: directory for log
            restore_from: (string) directory or file containing weights to restore the graph
        """
        # set up the train summary writer
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = os.path.join(self.log_dir, current_time,
                                     'train_summaries')
        eval_log_dir = os.path.join(self.log_dir, current_time,
                                    'eval_summaries')

        checkpoint_dir = os.path.join(self.log_dir, current_time,
                                      "training_checkpoints", 'ckpt')
        model_dir = os.path.join(self.log_dir, current_time)

        train_summary_writer = tf.summary.create_file_writer(train_log_dir)
        eval_summary_writer = tf.summary.create_file_writer(eval_log_dir)

        begin_at_epoch = 0
        best_eval_acc = 100.0  # tracks the best (lowest) eval MSE, hence the high initial value

        # TRAINING MAIN LOOP
        # ----------------------------------------------------------------------
        print("[INFO] training started ...")
        # loop over the number of epochs
        epochStart = time.time()
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):

            step = 0
            # sys.stdout.flush()
            # Compute number of batches in one epoch (one full pass over the training set)
            num_steps_train = int(
                np.ceil(params.train_size / params.batch_size))
            num_steps_eval = int(np.ceil(params.eval_size / params.batch_size))
            # Use tqdm for progress bar
            with tqdm(total=num_steps_train,
                      desc="[INFO] Epoch {0:d}".format(epoch + 1)) as pbar:
                # loop over the data in batch size increments
                # ----------------------------------------------------------------------
                # TRAIN SESSION
                for x_train, y_train in self.train_ds.take(num_steps_train):
                    train_loss, logits = self.train_step(x_train, y_train)
                    # Log the loss in the tqdm progress bar
                    sleep(0.1)
                    # Display metrics at the end of each epoch.
                    metrics = {
                        "Train_MSE":
                        '{:04.2f}'.format(
                            self.train_accuracy_mse.result().numpy()),
                        "Train_Loss":
                        '{:04.2f}'.format(self.train_loss.result().numpy())
                    }
                    pbar.set_postfix(metrics)
                    pbar.update()

                    # record train summary for tensor board
                    # if 0 < step < 30:
                    with train_summary_writer.as_default():
                        tf.summary.image('training images',
                                         x_train,
                                         step=epoch + step + 1,
                                         max_outputs=5)
                        tf.summary.image('logit images',
                                         logits,
                                         step=epoch + step + 1,
                                         max_outputs=5)
                        tf.summary.image('label images',
                                         y_train,
                                         step=epoch + step + 1,
                                         max_outputs=5)
                    step = step + 1

                with train_summary_writer.as_default():
                    tf.summary.scalar('loss',
                                      self.train_loss.result(),
                                      step=epoch + 1)
                    tf.summary.scalar('mse',
                                      self.train_accuracy_mse.result(),
                                      step=epoch + 1)
                # ----------------------------------------------------------------------
                # EVALUATION SESSION
                # loop over the eval data in batch size increments
                for x_eval, y_eval in self.eval_ds.take(num_steps_eval):
                    eval_loss = self.test_step(x_eval, y_eval)
                    # Display metrics at the end of each epoch.
                    metrics["Eval_MSE"] = '{:04.2f}'.format(
                        self.test_accuracy_mse.result().numpy())
                    pbar.set_postfix(metrics)
                pbar.close()
                # record train summary for tensor board
                with eval_summary_writer.as_default():
                    tf.summary.scalar('mse',
                                      self.test_accuracy_mse.result(),
                                      step=epoch + 1)
            # ----------------------------------------------------------------------
            metrics["Epoch"] = '{0:d}'.format(epoch + 1)
            # If best_eval, save the model at best_save_path
            eval_acc = self.test_accuracy_mse.result().numpy()
            if params.save_model:
                if eval_acc <= best_eval_acc:
                    # Store new best accuracy
                    best_eval_acc = eval_acc
                    # Save weights
                    best_save_path = os.path.join(
                        model_dir, "model_{0:d}".format(epoch + 1))
                    tf.keras.models.save_model(self.model,
                                               best_save_path,
                                               save_format="h5")
                    print(
                        "[INFO] Found new best accuracy, saving in {}".format(
                            best_save_path))
                    # Save best eval metrics in a json file in the model directory
                    best_json_path = os.path.join(
                        model_dir, "metrics_eval_best_weights.json")
                    save_dict_to_json(metrics, best_json_path)

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir,
                                          "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
            # ----------------------------------------------------------------------
            # Reset training metrics at the end of each epoch
            self.train_loss.reset_states()
            self.train_accuracy_mse.reset_states()
            self.train_accuracy_kld.reset_states()

            self.test_accuracy_mse.reset_states()
            self.test_accuracy_kld.reset_states()
        # end of train and eval
        # show timing information for the whole training run
        epochEnd = time.time()
        elapsed = (epochEnd - epochStart) / 60.0
        print("[INFO] Took {:.4} minutes".format(elapsed))
        # ----------------------------------------------------------------------
        if params.save_model:
            reconstructed_best_model = tf.keras.models.load_model(
                best_save_path)
            reconstructed_best_model.compile(optimizer=self.opt,
                                             loss=self.loss_object)
            best_final_path = os.path.join(model_dir, "best_full_model_path")
            tf.saved_model.save(reconstructed_best_model, best_final_path)
            print("[INFO] Final model save in {}".format(best_final_path))

        print("[INFO] Training done and log saved in {} ".format(model_dir))
Example #11
    config = get_config_from_json(config_path)
    datadir = '../data_samples/ner/'
    words_path = datadir + 'corpus_words.txt'
    tags_path = datadir + 'corpus_tags.txt'
    X, Y = [], []
    with Path(words_path).open('r', encoding='utf-8') as f:
        for l in f:
            X.append(l.strip().split())
    word_vocab = build_vocab(X)
    with Path(tags_path).open('r', encoding='utf-8') as f:
        for l in f:
            Y.append(l.strip().split())
    tag_vocab = build_vocab(Y)
    # add padding token
    if PAD_WORD not in word_vocab: word_vocab.add(PAD_WORD)
    if PAD_TAG not in tag_vocab: tag_vocab.add(PAD_TAG)
    # save to disk
    word_vocab_path = datadir + 'word_vocab.txt'
    tag_vocab_path = datadir + 'tag_vocab.txt'
    with Path(word_vocab_path).open('w') as f:
        f.write('\n'.join(word for word in word_vocab))
    with Path(tag_vocab_path).open('w') as f:
        f.write('\n'.join(tag for tag in tag_vocab))
    # save json config
    word_vocab_size = len(word_vocab)
    tag_vocab_size = len(tag_vocab)
    config = update_config_by_vocab(config, word_vocab_size, tag_vocab_size)
    save_dict_to_json(config, config_path)
    print("updated config file by updating vocabulary")

Example #12
def train_and_evaluate(model, train_dl, val_dl, opt, loss_fn, metrics, params,
                       lr_scheduler, checkpoint_dir, ckpt_filename, log_dir,
                       writer):

    # todo restore best checkpoint
    ckpt_file_path = os.path.join(checkpoint_dir, ckpt_filename)
    early_stopping = utils.EarlyStopping(patience=10, verbose=True)
    best_value = -float('inf')
    start_epoch = 0

    batch_sample_train, batch_gt_train = next(iter(train_dl))
    batch_sample_val, batch_gt_val = next(iter(val_dl))

    if os.path.exists(ckpt_file_path):
        model, opt, lr_scheduler, start_epoch, best_value = utils.load_checkpoint(
            model, opt, lr_scheduler, start_epoch, False, best_value,
            checkpoint_dir, ckpt_filename)
        print("=> loaded checkpoint form {} (epoch {})".format(
            ckpt_file_path, start_epoch))
    else:
        print("=> Initializing from scratch")

    for epoch in range(start_epoch, params.num_epochs):
        # Run one epoch
        current_lr = get_lr(opt)
        logging.info('Epoch {}/{}, current lr={}'.format(
            epoch, params.num_epochs - 1, current_lr))

        writer.add_scalar('Learning_rate', current_lr, epoch)

        if epoch % 5 == 0:
            predictions = inference(model, batch_sample_train)
            plot = train_dl.dataset.get_predictions_plot(
                batch_sample_train, predictions.cpu(), batch_gt_train)
            writer.add_image('Predictions_train',
                             plot,
                             epoch,
                             dataformats='HWC')

            predictions = inference(model, batch_sample_val)
            plot = train_dl.dataset.get_predictions_plot(
                batch_sample_val, predictions.cpu(), batch_gt_val)
            writer.add_image('Predictions_val', plot, epoch, dataformats='HWC')

        model.train()
        train_loss, train_metrics = train_epoch(model, loss_fn, train_dl, opt,
                                                lr_scheduler, metrics, params)

        # Evaluate for one epoch on validation set
        val_loss, val_metrics = evaluate(model,
                                         val_dl,
                                         loss_fn=loss_fn,
                                         metrics=metrics,
                                         params=params)

        writer.add_scalars('Loss', {
            'Training': train_loss,
            'Validation': val_loss,
        }, epoch)

        for (train_metric_name,
             train_metric_results), (val_metric_name,
                                     val_metric_results) in zip(
                                         train_metrics.items(),
                                         val_metrics.items()):
            writer.add_scalars(
                train_metric_name, {
                    'Training': train_metric_results[0],
                    'Validation': val_metric_results[0],
                }, epoch)

        # get value for first metric
        current_value = list(val_metrics.values())[0][0]
        is_best = current_value >= best_value

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_value = current_value
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(log_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': opt.state_dict(),
                'scheduler_dict': lr_scheduler.state_dict(),
                'best_value': best_value
            },
            is_best=is_best,
            checkpoint_dir=checkpoint_dir,
            filename=ckpt_filename)

        logging.info("\ntrain loss: %.3f, val loss: %.3f" %
                     (train_loss, val_loss))
        for (train_metric_name,
             train_metric_results), (val_metric_name,
                                     val_metric_results) in zip(
                                         train_metrics.items(),
                                         val_metrics.items()):
            logging.info("train %s: %.3f, val %s: %.3f" %
                         (train_metric_name, train_metric_results[0],
                          val_metric_name, val_metric_results[0]))

        logging.info("-" * 20)
Example #13
    def train_and_evaluate(self, restore_from=None):
        """Train the model and evaluate every epoch.
        Args:
            restore_from: (string) directory or file containing weights to restore the graph
        """
        hp = self.hp
        experiment_dir = hp.experiment_dir

        tf.logging.info("Starting training for {} epoch(s)".format(
            hp.num_epochs))

        split = DatasetSplit.TRAIN
        train_model_spec = self._get_model_spec(split)

        split = DatasetSplit.EVAL
        eval_model_spec = self._get_model_spec(split)

        # Initialize tf.Saver instances to save weights during training
        last_saver = tf.train.Saver()  # will keep last 5 epochs
        best_saver = tf.train.Saver(
            max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
        begin_at_epoch = 0

        with tf.Session() as sess:
            # Initialize model variables
            sess.run(train_model_spec['variable_init_op'])

            # Reload weights from directory if specified
            if restore_from is not None:
                tf.logging.info(
                    "Restoring parameters from {}".format(restore_from))
                if os.path.isdir(restore_from):
                    restore_from = tf.train.latest_checkpoint(restore_from)
                    begin_at_epoch = int(restore_from.split('-')[-1])
                last_saver.restore(sess, restore_from)

            # For tensorboard (takes care of writing summaries to files)
            train_writer = tf.summary.FileWriter(
                os.path.join(experiment_dir, 'train_summaries'), sess.graph)
            eval_writer = tf.summary.FileWriter(
                os.path.join(experiment_dir, 'eval_summaries'), sess.graph)

            tf.gfile.MakeDirs(os.path.join(experiment_dir, 'last_weights'))
            tf.gfile.MakeDirs(os.path.join(experiment_dir, 'best_weights'))

            best_eval_p_acc = 0.0
            for epoch in range(begin_at_epoch, begin_at_epoch + hp.num_epochs):
                # Run one epoch
                # Compute number of batches in one epoch (one full pass over the training set)
                total_train_steps = (hp.train_size + hp.batch_size -
                                     1) // hp.batch_size
                total_train_steps_list = list(range(total_train_steps))

                length = int(np.ceil(total_train_steps / hp.eval_every))

                split_train_steps = np.array_split(total_train_steps_list,
                                                   length)
                split_train_steps = [len(l) for l in split_train_steps]

                total_eval_steps = (hp.dev_size + hp.batch_size -
                                    1) // hp.batch_size
                total_eval_steps_list = list(range(total_eval_steps))

                split_eval_steps = np.array_split(total_eval_steps_list,
                                                  length)
                split_eval_steps = [len(l) for l in split_eval_steps]

                for i, (t_steps, e_steps) in enumerate(
                        zip(split_train_steps, split_eval_steps)):
                    tf.logging.info(
                        "Epoch {} - {}/{} with {} train steps and {} eval steps"
                        .format(epoch + 1, i + 1, len(split_train_steps),
                                t_steps, e_steps))
                    reset = False
                    if i == 0:
                        reset = True

                    train_metrics = self.train_epoch(sess, train_model_spec,
                                                     t_steps, train_writer,
                                                     reset)
                    train_loss_string, train_acc_string = self.metrics_string(
                        train_metrics)
                    tf.logging.info("- Train metrics: " + train_acc_string)
                    tf.logging.info("- Train metrics: " + train_loss_string)

                    # Save weights
                    last_save_path = os.path.join(
                        experiment_dir, 'last_weights',
                        'after-epoch-{}'.format(epoch + 1))
                    last_saver.save(sess, last_save_path, global_step=i + 1)

                    # Evaluate for one sub epoch on validation set
                    eval_metrics = self.evaluate_epoch(sess, eval_model_spec,
                                                       e_steps, eval_writer,
                                                       reset)
                    test_loss_string, test_acc_string = self.metrics_string(
                        eval_metrics)
                    tf.logging.info("- Eval metrics: " + test_acc_string)
                    tf.logging.info("- Eval metrics: " + test_loss_string)

                    # If best_eval, best_save_path
                    eval_p_acc = eval_metrics['policy_accuracy']
                    if eval_p_acc >= best_eval_p_acc:
                        # Store new best accuracy
                        best_eval_p_acc = eval_p_acc
                        # Save weights
                        best_save_path = os.path.join(
                            experiment_dir, 'best_weights',
                            'after-epoch-{}'.format(epoch + 1))
                        best_save_path = best_saver.save(sess,
                                                         best_save_path,
                                                         global_step=i + 1)
                        tf.logging.info(
                            "- Found new best policy accuracy, saving in {}".
                            format(best_save_path))
                        # Save best train metrics in a json file in the model directory
                        best_json_path = os.path.join(
                            experiment_dir, "metrics_train_best_weights.json")
                        utils.save_dict_to_json(train_metrics, best_json_path)
                        # Save best eval metrics in a json file in the model directory
                        best_json_path = os.path.join(
                            experiment_dir, "metrics_eval_best_weights.json")
                        utils.save_dict_to_json(eval_metrics, best_json_path)

                    # Save latest train metrics in a json file in the model directory
                    last_json_path = os.path.join(
                        experiment_dir, "metrics_train_last_weights.json")
                    utils.save_dict_to_json(train_metrics, last_json_path)

                    # Save latest eval metrics in a json file in the model directory
                    last_json_path = os.path.join(
                        experiment_dir, "metrics_eval_last_weights.json")
                    utils.save_dict_to_json(eval_metrics, last_json_path)
Example #14
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       loss_fn,
                       metrics,
                       params,
                       model_dir,
                       restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    # if restore_file is not None:
    #     restore_path = os.path.join('experiments', params.exp_name, args.restore_file + '.pth.tar')
    #     logging.info("Restoring parameters from {}".format(restore_path))
    #     utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
Example #15
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       loss_fn,
                       metrics,
                       params,
                       model_dir,
                       logger,
                       restore_file=None,
                       add_noise=False,
                       noise_sigma=0.1):

    best_val_acc = 0.0
    # reload weights from restore_file if specified
    if restore_file is not None:
        logging.info("Restoring parameters from {}".format(restore_file))
        checkpoint = utils.load_checkpoint(restore_file, model, optimizer)
        params.start_epoch = checkpoint['epoch']

        best_val_acc = checkpoint['best_val_acc']
        print('best_val_acc=', best_val_acc, flush=True)
        print(optimizer.state_dict()['param_groups'][0]['lr'],
              checkpoint['epoch'],
              flush=True)

    # learning rate schedulers for different models:
    if params.lr_decay_type is None:
        logging.info("no lr decay")
    else:
        assert params.lr_decay_type in ['multistep', 'exp', 'plateau']
        logging.info("lr decay:{}".format(params.lr_decay_type))
    if params.lr_decay_type == 'multistep':
        scheduler = MultiStepLR(optimizer,
                                milestones=params.lr_step,
                                gamma=params.scheduler_gamma,
                                last_epoch=params.start_epoch - 1)

    elif params.lr_decay_type == 'exp':
        scheduler = ExponentialLR(optimizer,
                                  gamma=params.scheduler_gamma2,
                                  last_epoch=params.start_epoch - 1)
    elif params.lr_decay_type == 'plateau':
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='min',
                                      factor=params.scheduler_gamma3,
                                      patience=params.patience,
                                      verbose=False,
                                      threshold=0.0001,
                                      threshold_mode='rel',
                                      cooldown=0,
                                      min_lr=0,
                                      eps=1e-08)

    for epoch in range(params.start_epoch, params.num_epochs):
        params.current_epoch = epoch
        if params.lr_decay_type != 'plateau':
            scheduler.step()

        # Run one epoch
        logger.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train_metrics = train(model,
                              optimizer,
                              loss_fn,
                              train_dataloader,
                              metrics,
                              params,
                              logger,
                              add_noise=add_noise,
                              noise_sigma=noise_sigma)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model,
                               loss_fn,
                               val_dataloader,
                               metrics,
                               params,
                               logger,
                               add_noise=add_noise,
                               noise_sigma=noise_sigma)

        # error rates and losses (computed for optional visualization; not logged in this variant)
        accs = [
            100. * (1 - train_metrics['accuracytop1']),
            100. * (1 - train_metrics['accuracytop5']),
            100. * (1 - val_metrics['accuracytop1']),
            100. * (1 - val_metrics['accuracytop5']),
        ]

        losses = [train_metrics['loss'], val_metrics['loss']]

        if params.lr_decay_type == 'plateau':
            scheduler.step(val_metrics['ls_all'])

        val_acc = val_metrics['accuracytop1']
        is_best = val_acc >= best_val_acc
        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict(),
                'best_val_acc': best_val_acc
            },
            epoch=epoch + 1,
            is_best=is_best,
            save_best_ever_n_epoch=params.save_best_ever_n_epoch,
            checkpointpath=params.experiment_path + '/checkpoint',
            start_epoch=params.start_epoch)

        val_metrics['best_epoch'] = epoch + 1
        # If best_eval, best_save_path, metric
        if is_best:
            logger.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(params.experiment_path,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(params.experiment_path,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
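
Nearly every example above finishes by writing its metrics dict with save_dict_to_json (or utils.save_dict_to_json). A minimal sketch of that helper, assuming every value is float-castable (json cannot serialize numpy or torch scalars directly) and the result is pretty-printed:

import json

def save_dict_to_json(d, json_path):
    """Save a dict of float-castable values to json_path."""
    with open(json_path, 'w') as f:
        d = {k: float(v) for k, v in d.items()}
        json.dump(d, f, indent=4)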