def create_model(args, device, logger, storage, storage_test):

    start_time = time.time()
    model = Model(args, logger, storage, storage_test, model_type=args.model_type)
    logger.info(model)
    logger.info('Trainable parameters:')

    for n, p in model.named_parameters():
        logger.info('{} - {}'.format(n, p.shape))

    num_params = utils.count_parameters(model)
    logger.info("Number of trainable parameters: {}".format(num_params))
    logger.info("Estimated size (under fp32): {:.3f} MB".format(num_params * 4. / 10**6))
    logger.info('Model init {:.3f}s'.format(time.time() - start_time))

    return model
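# The size estimate above assumes 4 bytes per parameter (fp32). Below is a
# minimal sketch of what utils.count_parameters is assumed to compute here;
# the actual helper lives in utils and may differ.
def _count_parameters_sketch(model):
    # Sum of elements over all parameters that require gradients.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)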
def test(args, recon_model):
    """
    Performs evaluation of a pre-trained policy model.

    :param args: Argument object containing evaluation parameters.
    :param recon_model: reconstruction model.
    """
    model, policy_args = load_policy_model(pathlib.Path(args.policy_model_checkpoint))

    # Overwrite number of trajectories to test on
    policy_args.num_test_trajectories = args.num_test_trajectories
    if args.data_path is not None:  # Overwrite data path if provided
        policy_args.data_path = args.data_path

    # Logging of policy model
    logging.info(args)
    logging.info(recon_model)
    logging.info(model)
    if args.wandb:
        wandb.config.update(args)
        wandb.watch(model, log='all')

    # Initialise summary writer
    writer = SummaryWriter(log_dir=policy_args.run_dir / 'summary')

    # Parameter counting
    logging.info('Reconstruction model parameters: total {}, of which {} trainable and {} untrainable'.format(
        count_parameters(recon_model), count_trainable_parameters(recon_model),
        count_untrainable_parameters(recon_model)))
    logging.info('Policy model parameters: total {}, of which {} trainable and {} untrainable'.format(
        count_parameters(model), count_trainable_parameters(model),
        count_untrainable_parameters(model)))

    # Create data loader
    test_loader = create_data_loader(policy_args, 'test', shuffle=False)
    test_data_range_dict = create_data_range_dict(policy_args, test_loader)

    do_and_log_evaluation(policy_args, -1, recon_model, model, test_loader,
                          writer, 'Test', test_data_range_dict)

    writer.close()
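# Hypothetical entry-point sketch showing the fields `test` reads from args.
# The repo's real argument parser is defined elsewhere; these flag names and
# defaults are illustrative assumptions, not the actual CLI.
def _build_test_args_sketch():
    import argparse
    import pathlib
    parser = argparse.ArgumentParser()
    parser.add_argument('--policy_model_checkpoint', type=pathlib.Path, required=True,
                        help='Checkpoint of the trained policy model to evaluate.')
    parser.add_argument('--num_test_trajectories', type=int, default=8,
                        help='Overrides the number of test trajectories stored in the checkpoint.')
    parser.add_argument('--data_path', type=pathlib.Path, default=None,
                        help='Optional override of the dataset location.')
    parser.add_argument('--wandb', action='store_true',
                        help='Enable Weights & Biases logging.')
    return parser.parse_args()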
logger.info('AMORTIZATION PARAMETERS')
amortization_named_parameters = itertools.chain.from_iterable(
    [am.named_parameters() for am in model.amortization_models])
for n, p in amortization_named_parameters:
    logger.info(f'{n} - {p.shape}')

logger.info('TRANSFORM PARAMETERS')
for n, p in zip(transform_param_names, transform_params):
    logger.info(f'{n} - {p.shape}')

logger.info('HYPERPRIOR PARAMETERS')
for n, p in model.Hyperprior.hyperlatent_likelihood.named_parameters():
    logger.info(f'{n} - {p.shape}')

logger.info('DISCRIMINATOR PARAMETERS')
for n, p in model.Discriminator.named_parameters():
    logger.info(f'{n} - {p.shape}')

num_params = utils.count_parameters(model)
logger.info("Number of trainable parameters: {}".format(num_params))
logger.info("Estimated size: {:.3f} MB".format(num_params * 4. / 10**6))

# Timed debug forward pass on a random batch of shape (N, C, H, W)
shape = [10, 3, 256, 256]
logger.info('Starting forward pass with input shape {}'.format(shape))

start_time = time.time()
x = torch.randn(shape).to(device)
losses = model(x)
compression_loss, disc_loss = losses['compression'], losses['disc']
logger.info('Delta t {:.3f}s'.format(time.time() - start_time))
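# A sketch of how the parameter groups enumerated above are typically consumed
# (an assumption about the training script, which is not shown here): one
# optimiser for the amortization + hyperprior parameters and a separate one
# for the Discriminator, as is usual for adversarial training. The function
# name and learning rate are illustrative only.
def _build_optimizers_sketch(model, lr=1e-4):
    import itertools
    import torch
    amortization_params = itertools.chain.from_iterable(
        am.parameters() for am in model.amortization_models)
    hyperlatent_params = model.Hyperprior.hyperlatent_likelihood.parameters()
    amortization_opt = torch.optim.Adam(
        itertools.chain(amortization_params, hyperlatent_params), lr=lr)
    disc_opt = torch.optim.Adam(model.Discriminator.parameters(), lr=lr)
    return amortization_opt, disc_opt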
def train_and_eval(args, recon_args, recon_model):
    if args.resume:  # TODO: check that this works
        resumed = True
        new_run_dir = args.policy_model_checkpoint.parent
        data_path = args.data_path
        # In case models have been moved to a different machine, make sure the
        # path to the recon model is the path provided.
        recon_model_checkpoint = args.recon_model_checkpoint

        model, args, start_epoch, optimiser = load_policy_model(
            pathlib.Path(args.policy_model_checkpoint), optim=True)

        args.old_run_dir = args.run_dir
        args.old_recon_model_checkpoint = args.recon_model_checkpoint
        args.old_data_path = args.data_path

        args.recon_model_checkpoint = recon_model_checkpoint
        args.run_dir = new_run_dir
        args.data_path = data_path
        args.resume = True
    else:
        resumed = False

        # Improvement model to train
        model = build_policy_model(args)
        # Add mask parameters for training
        args = add_mask_params(args)

        if args.data_parallel:
            model = torch.nn.DataParallel(model)
        optimiser = build_optim(args, model.parameters())
        start_epoch = 0

        # Create directory to store results in
        savestr = '{}_res{}_al{}_accel{}_k{}_{}_{}'.format(
            args.dataset, args.resolution, args.acquisition_steps, args.accelerations,
            args.num_trajectories, datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
            ''.join(choice(ascii_uppercase) for _ in range(5)))
        args.run_dir = args.exp_dir / savestr
        args.run_dir.mkdir(parents=True, exist_ok=False)

    args.resumed = resumed

    if args.wandb:
        # Only allow config changes if resumed: otherwise something is wrong.
        allow_val_change = args.resumed
        wandb.config.update(args, allow_val_change=allow_val_change)
        wandb.watch(model, log='all')

    # Logging
    logging.info(recon_model)
    logging.info(model)

    # Save arguments for bookkeeping
    args_dict = {key: str(value) for key, value in args.__dict__.items()
                 if not key.startswith('__') and not callable(value)}
    save_json(args.run_dir / 'args.json', args_dict)

    # Initialise summary writer
    writer = SummaryWriter(log_dir=args.run_dir / 'summary')

    # Parameter counting
    logging.info('Reconstruction model parameters: total {}, of which {} trainable and {} untrainable'.format(
        count_parameters(recon_model), count_trainable_parameters(recon_model),
        count_untrainable_parameters(recon_model)))
    logging.info('Policy model parameters: total {}, of which {} trainable and {} untrainable'.format(
        count_parameters(model), count_trainable_parameters(model),
        count_untrainable_parameters(model)))

    if args.scheduler_type == 'step':
        scheduler = torch.optim.lr_scheduler.StepLR(optimiser, args.lr_step_size, args.lr_gamma)
    elif args.scheduler_type == 'multistep':
        if not isinstance(args.lr_multi_step_size, list):
            args.lr_multi_step_size = [args.lr_multi_step_size]
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimiser, args.lr_multi_step_size, args.lr_gamma)
    else:
        raise ValueError("{} is not a valid scheduler choice ('step', 'multistep')".format(args.scheduler_type))

    # Create data loaders
    train_loader = create_data_loader(args, 'train', shuffle=True)
    dev_loader = create_data_loader(args, 'val', shuffle=False)
    train_data_range_dict = create_data_range_dict(args, train_loader)
    dev_data_range_dict = create_data_range_dict(args, dev_loader)

    if not args.resume:
        if args.do_train_ssim:
            do_and_log_evaluation(args, -1, recon_model, model, train_loader,
                                  writer, 'Train', train_data_range_dict)
        do_and_log_evaluation(args, -1, recon_model, model, dev_loader,
                              writer, 'Val', dev_data_range_dict)

    for epoch in range(start_epoch, args.num_epochs):
        train_loss, train_time = train_epoch(args, epoch, recon_model, model, train_loader,
                                             optimiser, writer, train_data_range_dict)
        logging.info(f'Epoch = [{epoch + 1:3d}/{args.num_epochs:3d}] '
                     f'TrainLoss = {train_loss:.3g} TrainTime = {train_time:.2f}s')

        if args.do_train_ssim:
            do_and_log_evaluation(args, epoch, recon_model, model, train_loader,
                                  writer, 'Train', train_data_range_dict)
        do_and_log_evaluation(args, epoch, recon_model, model, dev_loader,
                              writer, 'Val', dev_data_range_dict)

        scheduler.step()
        save_policy_model(args, args.run_dir, epoch, model, optimiser)
    writer.close()
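# Minimal sketch of the save_json helper used above for bookkeeping. The real
# helper lives elsewhere in this repo and may differ; this is only the assumed
# behaviour (serialise the stringified args dict to disk).
def _save_json_sketch(path, data):
    import json
    # Write with readable indentation so the run's args.json is easy to inspect.
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)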