Esempio n. 1
0
def create_model(args, device, logger, storage, storage_test):

    start_time = time.time()
    model = Model(args, logger, storage, storage_test, model_type=args.model_type)
    logger.info(model)
    logger.info('Trainable parameters:')

    for n, p in model.named_parameters():
        logger.info('{} - {}'.format(n, p.shape))

    logger.info("Number of trainable parameters: {}".format(utils.count_parameters(model)))
    logger.info("Estimated size (under fp32): {:.3f} MB".format(utils.count_parameters(model) * 4. / 10**6))
    logger.info('Model init {:.3f}s'.format(time.time() - start_time))

    return model
Esempio n. 2
0
def test(args, recon_model):
    """
    Performs evaluation of a pre-trained policy model.

    :param args: Argument object containing evaluation parameters.
    :param recon_model: reconstruction model.
    """
    model, policy_args = load_policy_model(
        pathlib.Path(args.policy_model_checkpoint))

    # Overwrite number of trajectories to test on
    policy_args.num_test_trajectories = args.num_test_trajectories
    if args.data_path is not None:  # Overwrite data path if provided
        policy_args.data_path = args.data_path

    # Logging of policy model
    logging.info(args)
    logging.info(recon_model)
    logging.info(model)
    if args.wandb:
        wandb.config.update(args)
        wandb.watch(model, log='all')
    # Initialise summary writer
    writer = SummaryWriter(log_dir=policy_args.run_dir / 'summary')

    # Parameter counting
    logging.info(
        'Reconstruction model parameters: total {}, of which {} trainable and {} untrainable'
        .format(count_parameters(recon_model),
                count_trainable_parameters(recon_model),
                count_untrainable_parameters(recon_model)))
    logging.info(
        'Policy model parameters: total {}, of which {} trainable and {} untrainable'
        .format(count_parameters(model), count_trainable_parameters(model),
                count_untrainable_parameters(model)))

    # Create data loader
    test_loader = create_data_loader(policy_args, 'test', shuffle=False)
    test_data_range_dict = create_data_range_dict(policy_args, test_loader)

    do_and_log_evaluation(policy_args, -1, recon_model, model, test_loader,
                          writer, 'Test', test_data_range_dict)

    writer.close()
        [am.named_parameters() for am in model.amortization_models])
    for n, p in amortization_named_parameters:
        logger.info(f'{n} - {p.shape}')

    logger.info('AMORTIZATION PARAMETERS')
    for n, p in zip(transform_param_names, transform_params):
        logger.info(f'{n} - {p.shape}')

    logger.info('HYPERPRIOR PARAMETERS')
    for n, p in model.Hyperprior.hyperlatent_likelihood.named_parameters():
        logger.info(f'{n} - {p.shape}')

    logger.info('DISCRIMINATOR PARAMETERS')
    for n, p in model.Discriminator.named_parameters():
        logger.info(f'{n} - {p.shape}')

    logger.info("Number of trainable parameters: {}".format(
        utils.count_parameters(model)))
    logger.info("Estimated size: {} MB".format(
        utils.count_parameters(model) * 4. / 10**6))

    shape = [10, 3, 256, 256]
    logger.info('Starting forward pass with input shape {}'.format(shape))

    start_time = time.time()
    x = torch.randn(shape).to(device)
    losses = model(x)
    compression_loss, disc_loss = losses['compression'], losses['disc']

    logger.info('Delta t {:.3f}s'.format(time.time() - start_time))
Esempio n. 4
0
def train_and_eval(args, recon_args, recon_model):
    if args.resume:
        # Check that this works
        resumed = True
        new_run_dir = args.policy_model_checkpoint.parent
        data_path = args.data_path
        # In case models have been moved to a different machine, make sure the path to the recon model is the
        # path provided.
        recon_model_checkpoint = args.recon_model_checkpoint

        model, args, start_epoch, optimiser = load_policy_model(pathlib.Path(
            args.policy_model_checkpoint),
                                                                optim=True)

        args.old_run_dir = args.run_dir
        args.old_recon_model_checkpoint = args.recon_model_checkpoint
        args.old_data_path = args.data_path

        args.recon_model_checkpoint = recon_model_checkpoint
        args.run_dir = new_run_dir
        args.data_path = data_path
        args.resume = True
    else:
        resumed = False
        # Improvement model to train
        model = build_policy_model(args)
        # Add mask parameters for training
        args = add_mask_params(args)
        if args.data_parallel:
            model = torch.nn.DataParallel(model)
        optimiser = build_optim(args, model.parameters())
        start_epoch = 0
        # Create directory to store results in
        savestr = '{}_res{}_al{}_accel{}_k{}_{}_{}'.format(
            args.dataset, args.resolution, args.acquisition_steps,
            args.accelerations, args.num_trajectories,
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
            ''.join(choice(ascii_uppercase) for _ in range(5)))
        args.run_dir = args.exp_dir / savestr
        args.run_dir.mkdir(parents=True, exist_ok=False)

    args.resumed = resumed

    if args.wandb:
        allow_val_change = args.resumed  # only allow changes if resumed: otherwise something is wrong.
        wandb.config.update(args, allow_val_change=allow_val_change)
        wandb.watch(model, log='all')

    # Logging
    logging.info(recon_model)
    logging.info(model)
    # Save arguments for bookkeeping
    args_dict = {
        key: str(value)
        for key, value in args.__dict__.items()
        if not key.startswith('__') and not callable(key)
    }
    save_json(args.run_dir / 'args.json', args_dict)

    # Initialise summary writer
    writer = SummaryWriter(log_dir=args.run_dir / 'summary')

    # Parameter counting
    logging.info(
        'Reconstruction model parameters: total {}, of which {} trainable and {} untrainable'
        .format(count_parameters(recon_model),
                count_trainable_parameters(recon_model),
                count_untrainable_parameters(recon_model)))
    logging.info(
        'Policy model parameters: total {}, of which {} trainable and {} untrainable'
        .format(count_parameters(model), count_trainable_parameters(model),
                count_untrainable_parameters(model)))

    if args.scheduler_type == 'step':
        scheduler = torch.optim.lr_scheduler.StepLR(optimiser,
                                                    args.lr_step_size,
                                                    args.lr_gamma)
    elif args.scheduler_type == 'multistep':
        if not isinstance(args.lr_multi_step_size, list):
            args.lr_multi_step_size = [args.lr_multi_step_size]
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimiser, args.lr_multi_step_size, args.lr_gamma)
    else:
        raise ValueError(
            "{} is not a valid scheduler choice ('step', 'multistep')".format(
                args.scheduler_type))

    # Create data loaders
    train_loader = create_data_loader(args, 'train', shuffle=True)
    dev_loader = create_data_loader(args, 'val', shuffle=False)

    train_data_range_dict = create_data_range_dict(args, train_loader)
    dev_data_range_dict = create_data_range_dict(args, dev_loader)

    if not args.resume:
        if args.do_train_ssim:
            do_and_log_evaluation(args, -1, recon_model, model, train_loader,
                                  writer, 'Train', train_data_range_dict)
        do_and_log_evaluation(args, -1, recon_model, model, dev_loader, writer,
                              'Val', dev_data_range_dict)

    for epoch in range(start_epoch, args.num_epochs):
        train_loss, train_time = train_epoch(args, epoch, recon_model, model,
                                             train_loader, optimiser, writer,
                                             train_data_range_dict)
        logging.info(
            f'Epoch = [{epoch+1:3d}/{args.num_epochs:3d}] TrainLoss = {train_loss:.3g} TrainTime = {train_time:.2f}s '
        )

        if args.do_train_ssim:
            do_and_log_evaluation(args, epoch, recon_model, model,
                                  train_loader, writer, 'Train',
                                  train_data_range_dict)
        do_and_log_evaluation(args, epoch, recon_model, model, dev_loader,
                              writer, 'Val', dev_data_range_dict)

        scheduler.step()
        save_policy_model(args, args.run_dir, epoch, model, optimiser)
    writer.close()