def test_transforms(self):
    train_x = torch.rand(10, 3, device=self.device)
    train_y = torch.randn(10, 4, 5, device=self.device)

    # test handling of Standardize
    with self.assertWarns(RuntimeWarning):
        model = HigherOrderGP(
            train_X=train_x, train_Y=train_y, outcome_transform=Standardize(m=5)
        )
    self.assertIsInstance(model.outcome_transform, FlattenedStandardize)
    self.assertEqual(model.outcome_transform.output_shape, train_y.shape[1:])
    self.assertEqual(model.outcome_transform.batch_shape, torch.Size())

    model = HigherOrderGP(
        train_X=train_x,
        train_Y=train_y,
        input_transform=Normalize(d=3),
        outcome_transform=FlattenedStandardize(train_y.shape[1:]),
    )
    mll = ExactMarginalLogLikelihood(model.likelihood, model)
    fit_gpytorch_torch(mll, options={"maxiter": 1, "disp": False})

    test_x = torch.rand(2, 5, 3, device=self.device)
    test_y = torch.randn(2, 5, 4, 5, device=self.device)

    posterior = model.posterior(test_x)
    self.assertIsInstance(posterior, TransformedPosterior)

    conditioned_model = model.condition_on_observations(test_x, test_y)
    self.assertIsInstance(conditioned_model, HigherOrderGP)

    self.check_transform_forward(model)
    self.check_transform_untransform(model)
def setUp(self):
    super().setUp()
    torch.random.manual_seed(0)

    train_x = torch.rand(2, 10, 1, device=self.device)
    train_y = torch.randn(2, 10, 3, 5, device=self.device)

    self.model = HigherOrderGP(train_x, train_y)

    # check that we can assign different kernels and likelihoods
    model_2 = HigherOrderGP(
        train_X=train_x,
        train_Y=train_y,
        covar_modules=[RBFKernel(), RBFKernel(), RBFKernel()],
        likelihood=GaussianLikelihood(),
    )

    model_3 = HigherOrderGP(
        train_X=train_x,
        train_Y=train_y,
        covar_modules=[RBFKernel(), RBFKernel(), RBFKernel()],
        likelihood=GaussianLikelihood(),
        latent_init="gp",
    )

    for m in [self.model, model_2, model_3]:
        mll = ExactMarginalLogLikelihood(m.likelihood, m)
        fit_gpytorch_torch(mll, options={"maxiter": 1, "disp": False})
def setUp(self):
    super().setUp()
    manual_seed(0)

    train_x = rand(2, 10, 1)
    train_y = randn(2, 10, 3, 5)
    train_x = train_x.to(device=self.device)
    train_y = train_y.to(device=self.device)

    self.model = HigherOrderGP(train_x, train_y, first_dim_is_batch=True)

    # check that we can assign different kernels and likelihoods
    model_2 = HigherOrderGP(
        train_x,
        train_y,
        first_dim_is_batch=True,
        covar_modules=[RBFKernel(), RBFKernel(), RBFKernel()],
        likelihood=GaussianLikelihood(),
    )

    for m in [self.model, model_2]:
        mll = ExactMarginalLogLikelihood(m.likelihood, m)
        fit_gpytorch_torch(mll, options={"maxiter": 1, "disp": False})
def test_transforms(self):
    train_x = rand(10, 3, device=self.device)
    train_y = randn(10, 4, 5, device=self.device)

    model = HigherOrderGP(
        train_x,
        train_y,
        input_transform=Normalize(d=3),
        outcome_transform=FlattenedStandardize(train_y.shape[1:]),
    )
    mll = ExactMarginalLogLikelihood(model.likelihood, model)
    fit_gpytorch_torch(mll, options={"maxiter": 1, "disp": False})

    test_x = rand(2, 5, 3, device=self.device)
    test_y = randn(2, 5, 4, 5, device=self.device)

    posterior = model.posterior(test_x)
    self.assertIsInstance(posterior, TransformedPosterior)

    conditioned_model = model.condition_on_observations(test_x, test_y)
    self.assertIsInstance(conditioned_model, HigherOrderGP)

    self.check_transform_forward(model)
    self.check_transform_untransform(model)
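# A minimal, self-contained sketch of the HigherOrderGP workflow the tests
# above exercise. Import paths are assumptions based on recent BoTorch
# releases (fit_gpytorch_torch has historically lived in botorch.optim.fit);
# this is illustrative only and is not part of the test suite.
import torch
from botorch.models import HigherOrderGP
from botorch.optim.fit import fit_gpytorch_torch
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(10, 3)
train_Y = torch.randn(10, 4, 5)  # tensor-valued targets with output shape (4, 5)
model = HigherOrderGP(train_X=train_X, train_Y=train_Y)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_torch(mll, options={"maxiter": 1, "disp": False})
posterior = model.posterior(torch.rand(2, 3))  # predictive posterior at 2 new points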
def main(args):
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    init_dict, train_dict, test_dict = prepare_data(
        args.data_loc,
        args.num_init,
        args.num_total,
        test_is_year=False,
        seed=args.seed,
    )
    init_x, init_y, init_y_var = (
        init_dict["x"].to(device),
        init_dict["y"].to(device),
        init_dict["y_var"].to(device),
    )
    train_x, train_y, train_y_var = (
        train_dict["x"].to(device),
        train_dict["y"].to(device),
        train_dict["y_var"].to(device),
    )
    test_x, test_y, test_y_var = (
        test_dict["x"].to(device),
        test_dict["y"].to(device),
        test_dict["y_var"].to(device),
    )

    if args.model == "wiski":
        model = FixedNoiseOnlineSKIGP(
            init_x,
            init_y.view(-1, 1),
            init_y_var.view(-1, 1),
            GridInterpolationKernel(
                base_kernel=ScaleKernel(
                    MaternKernel(
                        ard_num_dims=2,
                        nu=0.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                ),
                grid_size=30,
                num_dims=2,
                grid_bounds=torch.tensor([[0.0, 1.0], [0.0, 1.0]]),
            ),
            learn_additional_noise=False,
        ).to(device)
        mll_type = lambda x, y: BatchedWoodburyMarginalLogLikelihood(
            x, y, clear_caches_every_iteration=True
        )
    elif args.model == "exact":
        model = FixedNoiseGP(
            init_x,
            init_y.view(-1, 1),
            init_y_var.view(-1, 1),
            ScaleKernel(
                MaternKernel(
                    ard_num_dims=2,
                    nu=0.5,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                ),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
        ).to(device)
        mll_type = ExactMarginalLogLikelihood

    mll = mll_type(model.likelihood, model)

    print("---- Fitting initial model ----")
    start = time.time()
    model.train()
    model.zero_grad()
    # with max_cholesky_size(args.cholesky_size), skip_logdet_forward(True), \
    #         use_toeplitz(args.toeplitz), max_root_decomposition_size(args.sketch_size):
    fit_gpytorch_torch(mll, options={"lr": 0.1, "maxiter": 1000})
    end = time.time()
    print("Elapsed fitting time: ", end - start)
    print("Named parameters: ", list(model.named_parameters()))

    print("--- Now computing initial RMSE")
    model.eval()
    with gpytorch.settings.skip_posterior_variances(True):
        test_pred = model(test_x)
        pred_rmse = ((test_pred.mean - test_y) ** 2).mean().sqrt()
    print("---- Initial RMSE: ", pred_rmse.item())

    all_outputs = []
    start_ind = init_x.shape[0]
    end_ind = int(start_ind + args.batch_size)
    for step in range(args.num_steps):
        if step > 0 and step % 25 == 0:
            print("Beginning step ", step)

        total_time_step_start = time.time()

        if step > 0:
            print("---- Fitting model ----")
            start = time.time()
            model.train()
            model.zero_grad()
            mll = mll_type(model.likelihood, model)
            # with skip_logdet_forward(True), max_root_decomposition_size(args.sketch_size), \
            #         max_cholesky_size(args.cholesky_size), use_toeplitz(args.toeplitz):
            fit_gpytorch_torch(
                mll, options={"lr": 0.01 * (0.99 ** step), "maxiter": 300}
            )
            model.zero_grad()
            end = time.time()
            print("Elapsed fitting time: ", end - start)
            print("Named parameters: ", list(model.named_parameters()))

        if not args.random:
            if args.model == "wiski":
                botorch_model = OnlineSKIBotorchModel(model=model)
            else:
                botorch_model = model

            # qmc_sampler = SobolQMCNormalSampler(num_samples=4)
            bounds = torch.stack([torch.zeros(2), torch.ones(2)]).to(device)
            qnipv = qNIPV(
                model=botorch_model,
                mc_points=test_x,
                # sampler=qmc_sampler,
            )

            # with use_toeplitz(args.toeplitz), root_pred_var(True), fast_pred_var(True):
            candidates, acq_value = optimize_acqf(
                acq_function=qnipv,
                bounds=bounds,
                q=args.batch_size,
                num_restarts=1,
                raw_samples=10,  # used for initialization heuristic
                options={"batch_limit": 5, "maxiter": 200},
            )
        else:
            candidates = torch.rand(
                args.batch_size, train_x.shape[-1], device=device, dtype=train_x.dtype
            )
            acq_value = torch.zeros(1)

        model.eval()
        _ = model(test_x[:10])  # to init caches

        print("---- Finished optimizing; now querying dataset ---- ")
        with torch.no_grad():
            covar_dists = model.covar_module(candidates, train_x)
            nearest_points = covar_dists.evaluate().argmax(dim=-1)
            new_x = train_x[nearest_points]
            new_y = train_y[nearest_points]
            new_y_var = train_y_var[nearest_points]

            todrop = torch.tensor(
                [x in nearest_points for x in range(train_x.shape[0])]
            )
            train_x, train_y, train_y_var = (
                train_x[~todrop],
                train_y[~todrop],
                train_y_var[~todrop],
            )
            print("New train_x shape", train_x.shape)

            print("--- Now updating model with simulator ----")
            model = model.condition_on_observations(
                X=new_x, Y=new_y.view(-1, 1), noise=new_y_var.view(-1, 1)
            )

        print("--- Now computing updated RMSE")
        model.eval()
        # with gpytorch.settings.fast_pred_var(True), detach_test_caches(True), \
        #         max_root_decomposition_size(args.sketch_size), \
        #         max_cholesky_size(args.cholesky_size), \
        #         use_toeplitz(args.toeplitz), root_pred_var(True):
        test_pred = model(test_x)
        pred_rmse = ((test_pred.mean.view(-1) - test_y.view(-1)) ** 2).mean().sqrt()
        pred_avg_variance = test_pred.variance.mean()

        total_time_step_elapsed_time = time.time() - total_time_step_start
        step_output_list = [
            total_time_step_elapsed_time,
            acq_value.item(),
            pred_rmse.item(),
            pred_avg_variance.item(),
        ]
        print("Step RMSE: ", pred_rmse)
        all_outputs.append(step_output_list)

        start_ind = end_ind
        end_ind = int(end_ind + args.batch_size)

    output_dict = {
        "model_state_dict": model.cpu().state_dict(),
        "queried_points": {
            "x": model.cpu().train_inputs[0],
            "y": model.cpu().train_targets,
        },
        "results": DataFrame(all_outputs),
    }
    torch.save(output_dict, args.output)
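# A sketch of the command-line interface implied by the `args.*` attributes
# used in main() above. The flag names come directly from the code; the
# defaults and choices are assumptions for illustration, not the original
# configuration.
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", action="store_true")
    parser.add_argument("--data_loc", type=str, default="data/")
    parser.add_argument("--num_init", type=int, default=500)
    parser.add_argument("--num_total", type=int, default=10000)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--model", type=str, default="wiski", choices=["wiski", "exact"])
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_steps", type=int, default=100)
    parser.add_argument("--random", action="store_true")  # random-acquisition baseline
    parser.add_argument("--output", type=str, default="results.pt")
    return parser.parse_args()


if __name__ == "__main__":
    main(parse_args())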
def main(args):
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    init_dict, train_dict, test_dict = prepare_data(
        args.data_loc, args.num_init, args.num_total, test_is_year=False
    )
    init_x, init_y, init_y_var = (
        init_dict["x"].to(device),
        init_dict["y"].to(device),
        init_dict["y_var"].to(device),
    )
    train_x, train_y, train_y_var = (
        train_dict["x"].to(device),
        train_dict["y"].to(device),
        train_dict["y_var"].to(device),
    )
    test_x, test_y, test_y_var = (
        test_dict["x"].to(device),
        test_dict["y"].to(device),
        test_dict["y_var"].to(device),
    )

    model = FixedNoiseOnlineSKIGP(
        init_x,
        init_y.view(-1, 1),
        init_y_var.view(-1, 1),
        GridInterpolationKernel(
            base_kernel=ScaleKernel(
                MaternKernel(
                    ard_num_dims=2,
                    nu=0.5,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                ),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
            grid_size=30,
            num_dims=2,
            grid_bounds=torch.tensor([[0.0, 1.0], [0.0, 1.0]]),
        ),
        learn_additional_noise=False,
    ).to(device)

    mll = BatchedWoodburyMarginalLogLikelihood(model.likelihood, model)

    print("---- Fitting initial model ----")
    start = time.time()
    with skip_logdet_forward(True), max_root_decomposition_size(
        args.sketch_size
    ), use_toeplitz(args.toeplitz):
        fit_gpytorch_torch(mll, options={"lr": 0.1, "maxiter": 1000})
    end = time.time()
    print("Elapsed fitting time: ", end - start)

    model.zero_grad()
    model.eval()

    print("--- Generating initial predictions on test set ----")
    start = time.time()
    with detach_test_caches(True), max_root_decomposition_size(
        args.sketch_size
    ), max_cholesky_size(args.cholesky_size), use_toeplitz(args.toeplitz):
        pred_dist = model(test_x)
        pred_mean = pred_dist.mean.detach()
        # pred_var = pred_dist.variance.detach()
    end = time.time()
    print("Elapsed initial prediction time: ", end - start)

    rmse_initial = ((pred_mean.view(-1) - test_y.view(-1)) ** 2).mean().sqrt()
    print("Initial RMSE: ", rmse_initial.item())

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    mll_time_list = []
    rmse_list = []
    for i in range(500, train_x.shape[0]):
        model.zero_grad()
        model.train()

        start = time.time()
        with skip_logdet_forward(True), max_root_decomposition_size(
            args.sketch_size
        ), max_cholesky_size(args.cholesky_size), use_toeplitz(args.toeplitz):
            loss = -mll(model(train_x[:i]), train_y[:i]).sum()
            loss.backward()
        mll_time = time.time() - start

        optimizer.step()
        model.zero_grad()
        optimizer.zero_grad()

        start = time.time()
        with torch.no_grad():
            model.condition_on_observations(
                train_x[i].unsqueeze(0),
                train_y[i].view(1, 1),
                train_y_var[i].view(-1, 1),
                inplace=True,
            )
        fantasy_time = time.time() - start
        mll_time_list.append([mll_time, fantasy_time])

        if i % 25 == 0:
            start = time.time()
            model.eval()
            model.zero_grad()
            with detach_test_caches(), max_root_decomposition_size(
                args.sketch_size
            ), max_cholesky_size(args.cholesky_size):
                pred_dist = model(test_x)
            end = time.time()

            rmse = ((pred_dist.mean - test_y.view(-1)) ** 2).mean().sqrt().item()
            rmse_list.append([rmse, end - start])
            print("Current RMSE: ", rmse)
            print("Outputscale: ", model.covar_module.base_kernel.raw_outputscale)
            print(
                "Lengthscale: ",
                model.covar_module.base_kernel.base_kernel.raw_lengthscale,
            )
            print("Step: ", i, "Train Loss: ", loss)
            optimizer.param_groups[0]["lr"] *= 0.9

    torch.save({"training": mll_time_list, "predictions": rmse_list}, args.output)
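# Sketch of post-processing the artifacts saved by the streaming script above.
# The dict keys ("training", "predictions") and the two-entry row layouts
# match what the loop appends; the file path and column labels are
# illustrative assumptions.
import torch
from pandas import DataFrame

out = torch.load("streaming_results.pt")  # hypothetical output path
timing = DataFrame(out["training"], columns=["mll_step_time", "fantasy_update_time"])
rmse = DataFrame(out["predictions"], columns=["test_rmse", "prediction_time"])
print(timing.mean())  # average per-step training and conditioning cost
print(rmse.tail())    # RMSE trajectory over the stream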
def main(args):
    if args.batch_size > 1 and args.acqf == "mves":
        raise NotImplementedError(
            "Cyclic optimization is not implemented for MVES currently. "
            "Please use a batch size of 1."
        )

    if args.cuda and torch.cuda.is_available():
        args.device = torch.device("cuda:0")
    else:
        args.device = torch.device("cpu")

    torch.random.manual_seed(args.seed)

    test_function = prepare_function(args, args.device)
    init_x, init_y, y_means, latent_y = initialize_random_data(
        test_function, args.device, args.num_init
    )

    bounds = test_function.bounds.t()
    unit_bounds = torch.ones_like(bounds)
    unit_bounds[:, 0] = 0.0

    noise = args.noise ** 2 * torch.ones_like(init_y) if args.fixed_noise else None

    if args.model == "wiski":

        def initialize_model(X, Y, old_model=None, **kwargs):
            if old_model is None:
                covar_module = ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        lengthscale_constraint=Interval(1e-4, 12.0),
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                    outputscale_constraint=Interval(1e-4, 12.0),
                )
            else:
                covar_module = old_model.covar_module

            if args.dim == 3:
                wiski_grid_size = 10
            elif args.dim == 2:
                wiski_grid_size = 30

            kernel_cache = old_model._kernel_cache if old_model is not None else None
            model_obj = OnlineSKIBotorchModel(
                X,
                Y,
                train_noise_term=noise,
                grid_bounds=bounds,
                grid_size=wiski_grid_size,
                learn_additional_noise=True,
                kernel_cache=kernel_cache,
                covar_module=covar_module,
            ).to(X)

            mll = BatchedWoodburyMarginalLogLikelihood(
                model_obj.likelihood, model_obj, clear_caches_every_iteration=True
            )
            # TODO: reload state dict here?
            # weird errors resulting
            return model_obj, mll

    elif args.model == "exact":

        def initialize_model(X, Y, old_model=None, **kwargs):
            if old_model is None:
                covar_module = ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        lengthscale_constraint=Interval(1e-4, 12.0),
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                    outputscale_constraint=Interval(1e-4, 12.0),
                )
                if args.fixed_noise:
                    model_obj = FixedNoiseGP(
                        X, Y, train_Yvar=noise, covar_module=covar_module
                    )
                else:
                    model_obj = SingleTaskGP(X, Y, covar_module=covar_module)
            else:
                model_obj = old_model
            mll = ExactMarginalLogLikelihood(model_obj.likelihood, model_obj)
            return model_obj, mll

    elif args.model == "osvgp":

        def initialize_model(X, Y, old_model=None, **kwargs):
            if old_model is None:
                if args.dim == 3:
                    wiski_grid_size = 10
                elif args.dim == 2:
                    wiski_grid_size = 30

                grid_list = create_grid(
                    [wiski_grid_size] * args.dim, grid_bounds=bounds
                )
                inducing_points = (
                    torch.stack([x.reshape(-1) for x in torch.meshgrid(grid_list)])
                    .t()
                    .contiguous()
                    .clone()
                )

                likelihood = GaussianLikelihood()
                model_base = VariationalGPModel(
                    inducing_points,
                    likelihood=likelihood,
                    beta=1.0,
                    learn_inducing_locations=True,
                )
                model_obj = ApproximateGPyTorchModel(
                    model_base, likelihood, num_outputs=1
                )
                model_base.train_inputs = [X]
                model_base.train_targets = Y.view(-1)

                # we don't implement fixed-noise Gaussian likelihoods for the
                # streaming setting
                if args.fixed_noise:
                    model_obj.likelihood.noise = args.noise ** 2
                    model_obj.likelihood.requires_grad = False
            else:
                model_obj = old_model
                model_obj.train_inputs = [X]
                model_obj.train_targets = Y.view(-1)

            mll = VariationalELBO(
                model_obj.likelihood, model_obj.model, num_data=X.shape[-2]
            )
            return model_obj, mll

    train_x, train_y = init_x, init_y
    model_obj = None

    all_outputs = []
    for step in range(args.num_steps):
        t0 = time.time()
        model_obj, mll = initialize_model(train_x, train_y, old_model=model_obj)
        model_obj = model_obj.to(train_x)

        # fitting with L-BFGS-B is really slow due to the inducing points
        if args.model != "osvgp":
            fit_gpytorch_model(mll)
        else:
            fit_gpytorch_torch(mll, options={"maxiter": 1000})
        t0_total = time.time() - t0

        acqf = prepare_acquisition_function(
            args, model_obj, train_x, train_y, bounds, step
        )

        t1 = time.time()
        (
            new_x_ei,
            new_obj_unstandardized,
            new_latent_obj,
        ) = optimize_acqf_and_get_observation(
            acqf,
            bounds=unit_bounds.t(),
            test_function_bounds=bounds.t(),
            batch_size=args.batch_size,
            test_function=test_function,
        )
        new_obj_ei = (new_obj_unstandardized - y_means["mean"]) / y_means["std"]

        train_x = torch.cat((train_x, new_x_ei), dim=0)
        train_y = torch.cat((train_y, new_obj_ei), dim=0)
        latent_y = torch.cat((latent_y, new_latent_obj), dim=0)
        if noise is not None:
            new_noise = args.noise ** 2 * torch.ones_like(new_obj_ei)
            noise = torch.cat((noise, new_noise), dim=0)
        else:
            new_noise = None
        t1_total = time.time() - t1

        t2 = time.time()
        if args.model != "osvgp":
            if args.fixed_noise:
                kwargs = {"noise": new_noise}
            else:
                kwargs = {}
            model_obj = model_obj.condition_on_observations(
                X=new_x_ei, Y=new_obj_ei, **kwargs
            )
        if args.model == "osvgp":
            model_obj.model.update_variational_parameters(
                new_x=new_x_ei, new_y=new_obj_ei
            )
        t2_total = time.time() - t2

        total = t0_total + t1_total + t2_total
        max_achieved = train_y.max() * y_means["std"] + y_means["mean"]
        max_latent_achieved = latent_y.max()

        output_lists = [
            t0_total,
            t1_total,
            t2_total,
            total,
            max_achieved.item(),
            max_latent_achieved.item(),
        ]
        all_outputs.append(output_lists)

        if step % (args.num_steps // 5) == 0:
            print(
                "Step ",
                step,
                " of ",
                args.num_steps,
                "Max Achieved: ",
                max_achieved.item(),
                "Max Latent Achieved: ",
                max_latent_achieved.item(),
            )

    for key in y_means:
        y_means[key] = y_means[key].cpu()

    output_dict = {
        "observations": {
            "x": train_x.cpu(),
            "y": train_y.cpu(),
            "means": y_means,
            "latent_y": latent_y.cpu(),
        },
        "results": DataFrame(all_outputs),
        "args": args,
    }
    torch.save(output_dict, args.output)
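# Sketch of reading back the BO results saved above. Each row follows
# `output_lists` in the loop (model-fit time, acquisition time, conditioning
# time, total time, best observed value, best latent value); the file path
# and column names are illustrative assumptions.
import torch

out = torch.load("bo_results.pt")  # hypothetical output path
results = out["results"]
results.columns = [
    "fit_time",
    "acq_time",
    "update_time",
    "total_time",
    "max_achieved",
    "max_latent_achieved",
]
print(results[["total_time", "max_achieved"]].describe())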