def init(config, _run):
    """Build the run-configuration namespace from a raw config dict.

    Validates hyperparameters, seeds all RNGs, resolves the activation
    name to a torch module, selects the compute device, and attaches the
    partition scheduler/partition. Returns the populated namespace.
    """
    args = SimpleNamespace(**config)
    assertions.validate_hypers(args)
    mlh.seed_all(args.seed)
    args.data_path = assertions.validate_dataset_path(args)

    # Resolve the activation string to a module instance. 'relu' is
    # checked before 'elu' (since "relu" contains "elu"); anything
    # unrecognized falls back to ReLU.
    if args.activation is not None:
        activation_name = args.activation
        if 'relu' in activation_name:
            args.activation = torch.nn.ReLU()
        elif 'elu' in activation_name:
            args.activation = torch.nn.ELU()
        else:
            args.activation = torch.nn.ReLU()

    args._run = _run
    Path(args.artifact_dir).mkdir(exist_ok=True)
    args.loss_name = args.loss

    # Use CUDA only when both requested and actually available.
    use_cuda = bool(args.cuda) and torch.cuda.is_available()
    args.device = torch.device('cuda' if use_cuda else 'cpu')
    args.cuda = use_cuda

    args.partition_scheduler = updates.get_partition_scheduler(args)
    args.partition = util.get_partition(args)
    args.data_path = Path(args.data_path)
    return args
def init(config, _run):
    """Build the run-configuration namespace from a raw config dict.

    Like the basic init, but additionally creates a UUID-named output
    directory (when checkpointing/recording) and dumps the raw config
    there as JSON for later grepping.
    """
    args = SimpleNamespace(**config)
    assertions.validate_hypers(args)
    mlh.seed_all(args.seed)
    args.data_path = assertions.validate_dataset_path(args)

    # Resolve the activation string to a module instance. 'relu' is
    # checked before 'elu' (since "relu" contains "elu"); anything
    # unrecognized falls back to ReLU.
    if args.activation is not None:
        activation_name = args.activation
        if 'relu' in activation_name:
            args.activation = torch.nn.ReLU()
        elif 'elu' in activation_name:
            args.activation = torch.nn.ELU()
        else:
            args.activation = torch.nn.ReLU()

    args._run = _run
    args.model_dir = args.artifact_dir

    # Checkpointing/recording writes into a fresh UUID-named directory.
    if args.checkpoint or args.record:
        unique_directory = Path(args.model_dir) / str(uuid.uuid4())
        unique_directory.mkdir(parents=True)
        args.unique_directory = unique_directory
        # Save args json for grepability
        with open(args.unique_directory / 'args.json', 'w') as outfile:
            json.dump(dict(config), outfile, indent=4)

    args.loss_name = args.loss

    # Use CUDA only when both requested and actually available.
    use_cuda = bool(args.cuda) and torch.cuda.is_available()
    args.device = torch.device('cuda' if use_cuda else 'cpu')
    args.cuda = use_cuda

    args.partition_scheduler = updates.get_partition_scheduler(args)
    args.partition = util.get_partition(args)

    # per-batch and per-sample are mutually exclusive; per-sample wins.
    if args.per_batch and args.per_sample:
        args.per_batch = False

    args.data_path = Path(args.data_path)
    return args
def init(config, run):
    """Initialize a full experiment: args, scheduler, data, model, optimizer.

    Returns the (model, args) pair; the model already has its optimizer
    initialized and the data loaders are attached to args.
    """
    # general init
    args = SimpleNamespace(**config)
    args = assertions.validate_args(args)
    mlh.seed_all(args.seed)
    args._run = run
    args.wandb = wandb

    # init scheduler
    args.partition_scheduler = schedules.get_partition_scheduler(args)
    args.partition = util.get_partition(args)

    # init data: keep the loaders on args so downstream code can reach them
    train_loader, test_loader = get_data(args)
    args.train_data_loader = train_loader
    args.test_data_loader = test_loader

    # init model + optimizer
    model = get_model(train_loader, args)
    model.init_optimizer()

    return model, args
def train(args):
    """Train the generative model and inference network on pickled image data.

    Loads train/test images from ``args.data_path``, builds the models,
    partition, and a joint Adam optimizer, then runs the training loop with
    optional gradient-variance logging, periodic checkpointing, and
    periodic/final test-set evaluation.

    Returns:
        None when ``args.train_only``; otherwise ``{"test_elbo": <float or None>}``.
    """
    # Read data. NOTE(review): pickle.load is only safe on trusted run
    # resources — do not point data_path at untrusted files.
    with args._run.open_resource(args.data_path, 'rb') as file_handle:
        data = pickle.load(file_handle)

    train_image = data['train_image']
    test_image = data['test_image']
    train_data_loader = get_data_loader(train_image, args.batch_size, args)
    test_data_loader = get_data_loader(test_image, args.test_batch_size, args)

    # Make models (decoder observation mean initialized from the data mean).
    train_obs_mean = util.tensor(np.mean(train_image, axis=0), args)
    generative_model, inference_network = util.init_models(
        train_obs_mean, args)

    # Make partition
    args.partition = util.get_partition(
        args.K, args.partition_type, args.log_beta_min, args.device)

    # Make one optimizer over both modules' parameters.
    parameters = itertools.chain.from_iterable(
        [x.parameters() for x in [generative_model, inference_network]])
    optimizer = torch.optim.Adam(parameters, lr=args.lr)

    # Fix: pre-initialize loop-scoped names so the post-loop checkpoint and
    # return are safe even when args.epochs == 0 (the original raised
    # NameError on `epoch` / `epoch_train_elbo` / `test_elbo` in that case).
    epoch = 0
    epoch_train_elbo = 0.0
    test_elbo = None

    for epoch in range(args.epochs):
        epoch_train_elbo = 0
        for idx, data in enumerate(train_data_loader):
            optimizer.zero_grad()
            loss, elbo = args.loss(generative_model, inference_network,
                                   data, args, args.valid_S)
            # TODO add alpha lower bound
            loss.backward()
            optimizer.step()
            epoch_train_elbo += elbo.item()

            if (args.save_grads and (epoch % args.test_frequency) == 0):
                # Save grads
                grad_variance = util.calculate_grad_variance(
                    generative_model, inference_network, data, args)
                log_scalar("grad.variance", grad_variance, epoch, verbose=True)

            # NOTE(review): this only abandons the current epoch's batch
            # loop; training resumes at the next epoch (preserved from the
            # original — confirm whether a full stop was intended).
            if torch.isnan(loss):
                break

        epoch_train_elbo = epoch_train_elbo / len(train_data_loader)
        log_scalar("train.elbo", epoch_train_elbo, epoch)

        if (args.checkpoint and (epoch != 0)
                and ((epoch % args.checkpoint_frequency) == 0)):
            save_checkpoint(generative_model, inference_network, epoch,
                            epoch_train_elbo, optimizer, args)

        if args.train_only:
            continue

        # Run test set on the final epoch, and periodically when enabled.
        if (epoch == (args.epochs - 1)) or \
                (args.test_during_training
                 and ((epoch % args.test_frequency) == 0)):
            print("Running test set...")
            test_elbo = 0
            with torch.no_grad():
                for idx, data in enumerate(test_data_loader):
                    _, elbo = args.loss(generative_model, inference_network,
                                        data, args, args.test_S)
                    test_elbo += elbo.item()
            test_elbo = test_elbo / len(test_data_loader)
            log_scalar("test.elbo", test_elbo, epoch)

    # ------ end of training loop ---------

    # Save trained model
    if args.checkpoint:
        save_checkpoint(generative_model, inference_network, epoch,
                        epoch_train_elbo, optimizer, args)

    if args.train_only:
        return None
    # Fix: the original's `test_elbo if not args.train_only else None` was
    # redundant — train_only is always False on this path.
    return {"test_elbo": test_elbo}