def _setup(self, config):
    """Build the model, optimizer and trainer when the experiment starts.

    Args:
        config: dictionary of parameters used in the hyperparameter
            optimization. The keys read here are: lr, momentum, n_latent,
            n_fmaps, train_loader and validation_loader.
    """
    num_epoch_per_train = 1
    trainer_run_root = '/u/alik/tmpscratch/runlogs/AE_2d/hpOptim_test/'
    # makedirs(exist_ok=True) creates missing parent directories and does not
    # raise when the directory already exists (e.g. created by another trial
    # between an existence check and the mkdir call).
    os.makedirs(trainer_run_root, exist_ok=True)
    trainer_log_int = 128
    trainer_save = True
    device = 'cuda'

    # Model parameters: fixed geometry plus the two tuned sizes from config
    input_size = 302
    output_size = input_size
    kernel_size = 3
    stride = 1
    n_fmaps = config.get("n_fmaps")
    n_latent = config.get("n_latent")

    model = AE(
        Encoder_4_sampling_bn(input_size, kernel_size, stride, n_fmaps, n_latent),
        Decoder_4_sampling_bn(output_size, kernel_size, stride, n_fmaps, n_latent),
    )
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.get("lr"),
        momentum=config.get("momentum"),
    )

    # The trainer is kept on the instance for later use
    self.trainer = Trainer(
        run_root=trainer_run_root,
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        train_loader=config.get("train_loader"),
        validation_loader=config.get("validation_loader"),
        num_epoch=num_epoch_per_train,
        log_int=trainer_log_int,
        device=device,
        save=trainer_save,
    )
# Run settings handed to the Trainer below
num_epoch = 1
log_int = 100
device = 'cpu'
save = True
resume = False

# Loss, autoencoder (encoder/decoder pair) and an SGD optimizer over its parameters
criterion = torch.nn.MSELoss()
model = AE(
    Encoder_4_sampling_bn_1px_deep(input_size, kernel_size, stride, n_fmaps, n_latent),
    Decoder_4_sampling_bn_1px_deep(output_size, kernel_size, stride, n_fmaps, n_latent),
)
optimizer = torch.optim.SGD(model.parameters(), lr=0.025, momentum=0.8)

trainer = Trainer(
    run_root=run_root,
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    validation_loader=validation_loader,
    num_epoch=num_epoch,
    log_int=log_int,
    device=device,
    save=save,
    resume=resume,
)

# Wall-clock duration of the whole training call
beforeT = time.time()
trainer.train()
timeSpent = time.time() - beforeT
print(f'Total time spent training: {timeSpent:.2f}')
norm_mean=norm_mean, norm_std=norm_std, cache_RAM=True, cache_HDD=True, cache_HDD_root=cache_root, ) dataloader = DataLoader(dataset, batch_size=24, shuffle=False, num_workers=0) input_size = 302 output_size = input_size valid_size = 17 kernel_size = 3 stride = 1 n_fmaps = 8 n_latent = 5000 model = AE( Encoder_4_sampling_bn(input_size, kernel_size, stride, n_fmaps, n_latent), Decoder_4_sampling_bn(output_size, kernel_size, stride, n_fmaps, n_latent)) criterion = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.025, momentum=0.8) num_epoch = 10 log_int = 10 device = 'cpu' trainer = Trainer(run_root, dataloader, model, optimizer, criterion, num_epoch, log_int, device) trainer.train()
class trainable(tune.Trainable):
    """Implementation of tune's hyper parameter optimization subclass"""

    def _setup(self, config):
        """Build the model, optimizer and trainer when the experiment starts.

        Args:
            config: dictionary of parameters used in the hyperparameter
                optimization. The keys read here are: lr, momentum,
                n_latent, n_fmaps, train_loader and validation_loader.
        """
        num_epoch_per_train = 1
        trainer_run_root = '/u/alik/tmpscratch/runlogs/AE_2d/hpOptim_test/'
        # makedirs(exist_ok=True) creates missing parent directories and does
        # not raise when the directory already exists (e.g. created by another
        # trial between an existence check and the mkdir call).
        os.makedirs(trainer_run_root, exist_ok=True)
        trainer_log_int = 128
        trainer_save = True
        device = 'cuda'

        # Model parameters: fixed geometry plus the two tuned sizes from config
        input_size = 302
        output_size = input_size
        kernel_size = 3
        stride = 1
        n_fmaps = config.get("n_fmaps")
        n_latent = config.get("n_latent")

        model = AE(
            Encoder_4_sampling_bn(input_size, kernel_size, stride, n_fmaps, n_latent),
            Decoder_4_sampling_bn(output_size, kernel_size, stride, n_fmaps, n_latent),
        )
        criterion = torch.nn.MSELoss()
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=config.get("lr"),
            momentum=config.get("momentum"),
        )

        # Stored on the instance: _train, _save and _restore all go through it
        self.trainer = Trainer(
            run_root=trainer_run_root,
            model=model,
            optimizer=optimizer,
            criterion=criterion,
            train_loader=config.get("train_loader"),
            validation_loader=config.get("validation_loader"),
            num_epoch=num_epoch_per_train,
            log_int=trainer_log_int,
            device=device,
            save=trainer_save,
        )

    def _train(self):
        """Run one training iteration and return the trainer's result.

        A single iteration of this method is run for each call. This should
        take more than a few seconds and less than a few minutes.

        Returns:
            Whatever ``self.trainer.train()`` returns (named ``val_loss_dict``
            in the original code).
        """
        val_loss_dict = self.trainer.train()
        return val_loss_dict

    def _save(self, tmp_checkpoint_dir):
        """Save the model's state dict as ``model.pth`` in the given directory.

        Args:
            tmp_checkpoint_dir: directory the checkpoint file is written to.

        Returns:
            ``tmp_checkpoint_dir``, unchanged.
        """
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.pth")
        torch.save(self.trainer.model.state_dict(), checkpoint_path)
        return tmp_checkpoint_dir

    def _restore(self, tmp_checkpoint_dir):
        """Load the model's state dict from ``model.pth`` in the given directory.

        Args:
            tmp_checkpoint_dir: directory holding the file written by ``_save``.
        """
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.pth")
        self.trainer.model.load_state_dict(torch.load(checkpoint_path))