Example 1
def main(args):
    root_dir = dirname(abspath(__file__))
    # Load the parameters from json file
    imagenet_dir = args.data_dir
    exp_dir = join(root_dir, 'training', 'experiments', args.exp_name)
    json_path = join(exp_dir, 'parameters.json')
    assert isfile(json_path), (
        "No json configuration file found at {}".format(json_path))
    params = train_utils.Params(json_path)
    # Add the timer option to the parameters
    params.update_with_dict({'timer': args.timer})
    params.update_with_dict({'num_workers': args.num_workers})

    train_utils.set_logger(join(exp_dir, '{}.log'.format(args.mode)))
    logging.info("----Starting train script in mode: {}----".format(args.mode))

    setup_timer = Timer(convert=True)
    setup_timer.reset()
    logging.info("Loading datasets...")

    # Get the correct model
    if params.model == 'BaselineEmbeddingNet':
        model = mdl.SiameseNet(mdl.BaselineEmbeddingNet(),
                               upscale=params.upscale,
                               corr_map_size=33,
                               stride=4)
    elif params.model == 'VGG11EmbeddingNet_5c':
        model = mdl.SiameseNet(mdl.VGG11EmbeddingNet_5c(),
                               upscale=params.upscale,
                               corr_map_size=33,
                               stride=4)
    elif params.model == 'VGG16EmbeddingNet_8c':
        model = mdl.SiameseNet(mdl.VGG16EmbeddingNet_8c(),
                               upscale=params.upscale,
                               corr_map_size=33,
                               stride=4)
    else:
        raise ValueError("Unsupported model: {}".format(params.model))

    # Freeze all the indicated parameters
    for i, (name, parameter) in enumerate(model.named_parameters()):
        if i in params.parameter_freeze:
            logging.info("Freezing parameter {}".format(name))
            parameter.requires_grad = False

    model = model.to(device)
    # Set the tensorboard summary maker
    summ_maker = SummaryMaker(join(exp_dir, 'tensorboard'), params,
                              model.upscale_factor)

    label_function = create_BCELogit_loss_label
    img_read_fcn = imutils.get_decode_jpeg_fcn(flag=args.imutils_flag)
    img_resize_fcn = imutils.get_resize_fcn(flag=args.imutils_flag)

    logging.info("Validation dataset...")

    metadata_val_file = join(exp_dir, "metadata.val")
    val_set = ImageNetVID_val(imagenet_dir,
                              label_fcn=label_function,
                              pos_thr=params.pos_thr,
                              neg_thr=params.neg_thr,
                              upscale_factor=model.upscale_factor,
                              cxt_margin=params.context_margin,
                              reference_size=params.reference_sz,
                              search_size=params.search_sz,
                              img_read_fcn=img_read_fcn,
                              resize_fcn=img_resize_fcn,
                              metadata_file=metadata_val_file,
                              save_metadata=metadata_val_file,
                              max_frame_sep=params.max_frame_sep)
    val_loader = DataLoader(val_set,
                            batch_size=params.batch_size,
                            shuffle=False,
                            num_workers=params.num_workers,
                            pin_memory=True)
    if params.eval_epoch_size > len(val_loader):
        logging.info('The user-set eval_epoch_size ({}) is bigger than the '
                     'size of the eval set ({}).\nSetting '
                     'eval_epoch_size to the eval set size.'.format(
                         params.eval_epoch_size, len(val_loader)))
        params.eval_epoch_size = len(val_loader)

    # Define the model and optimizer

    # fetch loss function and metrics
    loss_fn = losses.BCELogit_Loss
    metrics = met.METRICS
    # Set the optional keyword arguments for the functions that need it
    metrics['center_error']['kwargs']['upscale_factor'] = model.upscale_factor

    try:
        if args.mode == 'train':

            logging.info("Training dataset...")

            metadata_train_file = join(exp_dir, "metadata.train")
            train_set = ImageNetVID(imagenet_dir,
                                    label_fcn=label_function,
                                    pos_thr=params.pos_thr,
                                    neg_thr=params.neg_thr,
                                    upscale_factor=model.upscale_factor,
                                    cxt_margin=params.context_margin,
                                    reference_size=params.reference_sz,
                                    search_size=params.search_sz,
                                    img_read_fcn=img_read_fcn,
                                    resize_fcn=img_resize_fcn,
                                    metadata_file=metadata_train_file,
                                    save_metadata=metadata_train_file,
                                    max_frame_sep=params.max_frame_sep)
            train_loader = DataLoader(train_set,
                                      batch_size=params.batch_size,
                                      shuffle=True,
                                      num_workers=params.num_workers,
                                      pin_memory=True)

            # Though I'm not a big fan of changing the value of a parameter
            # variable after it has been read, at least I let the user know I'm
            # changing it.
            if params.train_epoch_size > len(train_loader):
                logging.info(
                    'The user-set train_epoch_size ({}) is bigger than the '
                    'size of the train set ({}).\nSetting '
                    'train_epoch_size to the train set size.'.format(
                        params.train_epoch_size, len(train_loader)))
                params.train_epoch_size = len(train_loader)

            logging.info("Done")
            logging.info("Setup time: {}".format(setup_timer.elapsed))
            parameters = filter(lambda p: p.requires_grad, model.parameters())
            optimizer = optimz.OPTIMIZERS[params.optim](parameters,
                                                        **params.optim_kwargs)
            # Set the scheduler, which updates the learning rate using an
            # exponential decay. If you don't want lr decay, set it to 1.
            logging.info("Using Exponential Learning Rate Decay of {}".format(
                params.lr_decay))
            scheduler = torch.optim.lr_scheduler.ExponentialLR(
                optimizer, params.lr_decay)

            logging.info("Epoch sizes: {} in train and {} in eval".format(
                params.train_epoch_size, params.eval_epoch_size))

            logging.info("Starting training for {} epoch(s)".format(
                params.num_epochs))
            with Timer(convert=True) as t:
                train_and_evaluate(model,
                                   train_loader,
                                   val_loader,
                                   optimizer,
                                   scheduler,
                                   loss_fn,
                                   metrics,
                                   params,
                                   exp_dir,
                                   args,
                                   summ_maker=summ_maker)
            if params.timer:
                logging.info(
                    "[profiling] Total time to train {} epochs, with {}"
                    " elements on training dataset and {} "
                    "on val dataset: {}".format(params.num_epochs,
                                                len(train_loader),
                                                len(val_loader), t.elapsed))

        elif args.mode == 'eval':
            logging.info("Done")
            with Timer(convert=True) as total:
                logging.info("Starting evaluation")
                # TODO write a decent Exception
                if args.restore_file is None:
                    raise IncompleteArgument("In eval mode you have to specify"
                                             " a model checkpoint to be loaded"
                                             " and evaluated."
                                             " E.g: --restore_file best")
                checkpoint_path = join(exp_dir, args.restore_file + '.pth.tar')
                train_utils.load_checkpoint(checkpoint_path, model)
                # Evaluate
                summ_maker.epoch = 0
                test_metrics = evaluate(model,
                                        loss_fn,
                                        val_loader,
                                        metrics,
                                        params,
                                        args,
                                        summ_maker=summ_maker)
                save_path = join(
                    exp_dir, "metrics_test_{}.json".format(args.restore_file))
                train_utils.save_dict_to_json(test_metrics, save_path)
            if params.timer:
                logging.info("[profiling] Total evaluation time: {}".format(
                    total.elapsed))

    except KeyboardInterrupt:
        logging.info("=== User interrupted execution ===")
        raise
    except Exception:
        logging.exception("Fatal error in main loop")
        logging.info("=== Execution Terminated with error ===")
    else:
        logging.info("=== Execution exited normally ===")
Example 2
def main(args):
    root_dir = dirname(abspath(__file__))
    # Load the parameters from json file
    imagenet_dir = args.data_dir
    exp_dir = join(root_dir, 'training', 'experiments', args.exp_name)
    json_path = join(exp_dir, 'parameters.json')
    assert isfile(json_path), ("No json configuration file found at {}"
                               .format(json_path))
    params = train_utils.Params(json_path)
    # Add the timer option to the parameters
    params.update_with_dict({'timer': args.timer})
    params.update_with_dict({'num_workers': args.num_workers})

    train_utils.set_logger(join(exp_dir, '{}.log'.format(args.mode)))
    logging.info("----Starting train script in mode: {}----".format(args.mode))

    setup_timer = Timer(convert=True)
    setup_timer.reset()
    logging.info("Loading datasets...")

    # Get the correct model
    if params.model == 'BaselineEmbeddingNet':
        model = mdl.SiameseNet(mdl.BaselineEmbeddingNet(), upscale=params.upscale,
                               corr_map_size=33, stride=4)
    elif params.model == 'VGG11EmbeddingNet_5c':
        model = mdl.SiameseNet(mdl.VGG11EmbeddingNet_5c(), upscale=params.upscale,
                               corr_map_size=33, stride=4)
    elif params.model == 'VGG16EmbeddingNet_8c':
        model = mdl.SiameseNet(mdl.VGG16EmbeddingNet_8c(), upscale=params.upscale,
                               corr_map_size=33, stride=4)
    else:
        raise ValueError("Unsupported model: {}".format(params.model))

    # Freeze all the indicated parameters
    for i, (name, parameter) in enumerate(model.named_parameters()):
        if i in params.parameter_freeze:
            logging.info("Freezing parameter {}".format(name))
            parameter.requires_grad = False

    model = model.to(device)
    # Set the tensorboard summary maker
    summ_maker = SummaryMaker(join(exp_dir, 'tensorboard'),
                              params,
                              model.upscale_factor)

    label_function = create_BCELogit_loss_label
    img_read_fcn = imutils.get_decode_jpeg_fcn(flag=args.imutils_flag)
    img_resize_fcn = imutils.get_resize_fcn(flag=args.imutils_flag)

    logging.info("Validation dataset...")

    metadata_val_file = join(exp_dir, "metadata.val")
    val_set = ImageNetVID_val(imagenet_dir,
                              label_fcn=label_function,
                              pos_thr=params.pos_thr,
                              neg_thr=params.neg_thr,
                              upscale_factor=model.upscale_factor,
                              cxt_margin=params.context_margin,
                              reference_size=params.reference_sz,
                              search_size=params.search_sz,
                              img_read_fcn=img_read_fcn,
                              resize_fcn=img_resize_fcn,
                              metadata_file=metadata_val_file,
                              save_metadata=metadata_val_file,
                              max_frame_sep=params.max_frame_sep)
    val_loader = DataLoader(val_set, batch_size=params.batch_size,
                            shuffle=False, num_workers=params.num_workers,
                            pin_memory=True)
    if params.eval_epoch_size > len(val_loader):
        logging.info('The user-set eval_epoch_size ({}) is bigger than the '
                     'size of the eval set ({}).\nSetting '
                     'eval_epoch_size to the eval set size.'
                     .format(params.eval_epoch_size, len(val_loader)))
        params.eval_epoch_size = len(val_loader)

    # Define the model and optimizer

    # fetch loss function and metrics
    loss_fn = losses.BCELogit_Loss
    metrics = met.METRICS
    # Set the optional keyword arguments for the functions that need it
    metrics['center_error']['kwargs']['upscale_factor'] = model.upscale_factor

    try:
        if args.mode == 'train':

            logging.info("Training dataset...")

            metadata_train_file = join(exp_dir, "metadata.train")
            train_set = ImageNetVID(imagenet_dir,
                                    label_fcn=label_function,
                                    pos_thr=params.pos_thr,
                                    neg_thr=params.neg_thr,
                                    upscale_factor=model.upscale_factor,
                                    cxt_margin=params.context_margin,
                                    reference_size=params.reference_sz,
                                    search_size=params.search_sz,
                                    img_read_fcn=img_read_fcn,
                                    resize_fcn=img_resize_fcn,
                                    metadata_file=metadata_train_file,
                                    save_metadata=metadata_train_file,
                                    max_frame_sep=params.max_frame_sep)
            train_loader = DataLoader(train_set, batch_size=params.batch_size,
                                      shuffle=True, num_workers=params.num_workers,
                                      pin_memory=True)

            # Though I'm not a big fan of changing the value of a parameter
            # variable after it has been read, at least I let the user know I'm
            # changing it.
            if params.train_epoch_size > len(train_loader):
                logging.info('The user-set train_epoch_size ({}) is bigger '
                             'than the size of the train set ({}).\nSetting '
                             'train_epoch_size to the train set size.'
                             .format(params.train_epoch_size, len(train_loader)))
                params.train_epoch_size = len(train_loader)

            logging.info("Done")
            logging.info("Setup time: {}".format(setup_timer.elapsed))
            parameters = filter(lambda p: p.requires_grad, model.parameters())
            optimizer = optimz.OPTIMIZERS[params.optim](parameters, **params.optim_kwargs)
            # Set the scheduler, which updates the learning rate using an
            # exponential decay. If you don't want lr decay, set it to 1.
            logging.info("Using Exponential Learning Rate Decay of {}".format(params.lr_decay))
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, params.lr_decay)

            logging.info("Epoch sizes: {} in train and {} in eval"
                         .format(params.train_epoch_size, params.eval_epoch_size))

            logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
            with Timer(convert=True) as t:
                train_and_evaluate(model, train_loader, val_loader, optimizer,
                                   scheduler, loss_fn, metrics, params, exp_dir,
                                   args, summ_maker=summ_maker)
            if params.timer:
                logging.info("[profiling] Total time to train {} epochs, with {}"
                             " elements on training dataset and {} "
                             "on val dataset: {}"
                             .format(params.num_epochs, len(train_loader),
                                     len(val_loader), t.elapsed))

        elif args.mode == 'eval':
            logging.info("Done")
            with Timer(convert=True) as total:
                logging.info("Starting evaluation")
                # TODO write a decent Exception
                if args.restore_file is None:
                    raise IncompleteArgument("In eval mode you have to specify"
                                             " a model checkpoint to be loaded"
                                             " and evaluated."
                                             " E.g: --restore_file best")
                checkpoint_path = join(exp_dir, args.restore_file + '.pth.tar')
                train_utils.load_checkpoint(checkpoint_path, model)
                # Evaluate
                summ_maker.epoch = 0
                test_metrics = evaluate(model, loss_fn, val_loader, metrics,
                                        params, args, summ_maker=summ_maker)
                save_path = join(exp_dir,
                                 "metrics_test_{}.json".format(args.restore_file))
                train_utils.save_dict_to_json(test_metrics, save_path)
            if params.timer:
                logging.info("[profiling] Total evaluation time: {}"
                             .format(total.elapsed))

    except KeyboardInterrupt:
        logging.info("=== User interrupted execution ===")
        raise
    except Exception:
        logging.exception("Fatal error in main loop")
        logging.info("=== Execution Terminated with error ===")
    else:
        logging.info("=== Execution exited normally ===")
Example 3
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       scheduler,
                       loss_fn,
                       metrics,
                       params,
                       exp_dir,
                       args,
                       summ_maker=None):
    """Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object
            that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that
            fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        scheduler: (torch.optim.lr_scheduler.ExponentialLR) The exponential
            learning rate scheduler.
        loss_fn: a function that takes batch_output and batch_labels and
            computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using
            the output and labels of each batch
        params: (Params) hyperparameters
        exp_dir: (string) directory containing the parameters, weights and
            logs for the current experiment. The full path.
        args: The namespace object containing the parsed user arguments
        summ_maker: The SummaryMaker object that writes the training
            information to a tensorboard-readable file.
    """
    # reload weights from restore_file if specified
    # TODO load and set best validation error
    if args.restore_file is not None:
        restore_path = join(exp_dir, (args.restore_file + '.pth.tar'))
        logging.info("Restoring parameters from {}".format(restore_path))
        train_utils.load_checkpoint(restore_path, model)

    # best_val_c_error = float("inf")
    best_val_auc = 0
    # Before starting the first epoch do the eval
    logging.info('Pretraining evaluation...')
    # Epoch 0 is the validation epoch before the learning starts.
    summ_maker.epoch = 0
    val_metrics = evaluate(model,
                           loss_fn,
                           val_dataloader,
                           metrics,
                           params,
                           args,
                           summ_maker=summ_maker)

    for epoch in range(params.num_epochs):
        # Training epochs are numbered from 1; epoch 0 is the pretraining eval
        summ_maker.epoch = epoch + 1
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # Train for one epoch (one full pass over the training set)
        train(model,
              optimizer,
              loss_fn,
              train_dataloader,
              metrics,
              params,
              summ_maker=summ_maker)

        # Update the Learning rate
        scheduler.step()

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model,
                               loss_fn,
                               val_dataloader,
                               metrics,
                               params,
                               args,
                               summ_maker=summ_maker)

        val_auc = val_metrics['AUC']
        is_best = val_auc >= best_val_auc

        # Save weights
        train_utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=exp_dir)

        # If this epoch gave the best validation AUC so far, save its metrics
        if is_best:
            logging.info("- Found new best auc")
            best_val_auc = val_auc

            # Save best val metrics in a json file in the model directory
            best_json_path = join(exp_dir, "metrics_val_best_weights.json")
            train_utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = join(exp_dir, "metrics_val_last_weights.json")
        train_utils.save_dict_to_json(val_metrics, last_json_path)
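
The train_utils.save_checkpoint and load_checkpoint helpers are not shown in these listings. A plausible minimal sketch, following the common last/best checkpoint pattern that the '.pth.tar' paths and the is_best flag above suggest (the file names and the optional optimizer handling are assumptions, not the repository's actual code):

import os
import shutil
from os.path import isdir, join

import torch

def save_checkpoint(state, is_best, checkpoint):
    # Always overwrite the rolling 'last' checkpoint; copy it to
    # 'best' whenever the current epoch improved the tracked metric.
    if not isdir(checkpoint):
        os.makedirs(checkpoint)
    last_path = join(checkpoint, 'last.pth.tar')
    torch.save(state, last_path)
    if is_best:
        shutil.copyfile(last_path, join(checkpoint, 'best.pth.tar'))

def load_checkpoint(checkpoint_path, model, optimizer=None):
    # Restore the model weights (and optionally the optimizer state).
    state = torch.load(checkpoint_path)
    model.load_state_dict(state['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(state['optim_dict'])
    return state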
Example 4
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, scheduler,
                       loss_fn, metrics, params, exp_dir, args, summ_maker=None):
    """Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object
            that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that
            fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        scheduler: (torch.optim.lr_scheduler.ExponentialLR) The exponential
            learning rate scheduler.
        loss_fn: a function that takes batch_output and batch_labels and
            computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using
            the output and labels of each batch
        params: (Params) hyperparameters
        exp_dir: (string) directory containing the parameters, weights and
            logs for the current experiment. The full path.
        args: The namespace object containing the parsed user arguments
        summ_maker: The SummaryMaker object that writes the training
            information to a tensorboard-readable file.
    """
    # reload weights from restore_file if specified
    # TODO load and set best validation error
    if args.restore_file is not None:
        restore_path = join(exp_dir, (args.restore_file + '.pth.tar'))
        logging.info("Restoring parameters from {}".format(restore_path))
        train_utils.load_checkpoint(restore_path, model)

    # best_val_c_error = float("inf")
    best_val_auc = 0
    # Before starting the first epoch do the eval
    logging.info('Pretraining evaluation...')
    # Epoch 0 is the validation epoch before the learning starts.
    summ_maker.epoch = 0
    val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params, args,
                           summ_maker=summ_maker)

    for epoch in range(params.num_epochs):
        # Training epochs are numbered from 1; epoch 0 is the pretraining eval
        summ_maker.epoch = epoch + 1
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # Train for one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params,
              summ_maker=summ_maker)

        # Update the Learning rate
        scheduler.step()

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params,
                               args, summ_maker=summ_maker)

        val_auc = val_metrics['AUC']
        is_best = val_auc >= best_val_auc

        # Save weights
        train_utils.save_checkpoint({'epoch': epoch + 1,
                                     'state_dict': model.state_dict(),
                                     'optim_dict': optimizer.state_dict()},
                                    is_best=is_best,
                                    checkpoint=exp_dir)

        # If this epoch gave the best validation AUC so far, save its metrics
        if is_best:
            logging.info("- Found new best auc")
            best_val_auc = val_auc

            # Save best val metrics in a json file in the model directory
            best_json_path = join(exp_dir, "metrics_val_best_weights.json")
            train_utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = join(exp_dir, "metrics_val_last_weights.json")
        train_utils.save_dict_to_json(val_metrics, last_json_path)
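
train_utils.save_dict_to_json is likewise external to these listings; a minimal sketch consistent with how it is called above (coercing values to float is an assumption, made so that numpy or torch scalar metrics serialize cleanly):

import json

def save_dict_to_json(d, json_path):
    # Serialize a flat metrics dict, casting values to float so that
    # numpy / torch scalar types do not break json.dump.
    with open(json_path, 'w') as f:
        json.dump({k: float(v) for k, v in d.items()}, f, indent=4)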