def test_prior(self, cuda=False):
    train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
    # Deliberately extreme hyperparameters; the priors below are loosened to allow them.
    likelihood = GaussianLikelihood(
        noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1),
        noise_constraint=Positive(),  # Prior for this test is looser than default bound
    )
    gp_model = ExactGPModel(None, None, likelihood)
    # Widen the lengthscale prior to accommodate the extreme parameters.
    gp_model.covar_module.base_kernel.register_prior(
        "lengthscale_prior", SmoothedBoxPrior(exp(-10), exp(10), sigma=0.5), "raw_lengthscale"
    )
    gp_model.mean_module.initialize(constant=1.5)
    gp_model.covar_module.base_kernel.initialize(lengthscale=1)
    likelihood.initialize(noise=0)
    if cuda:
        gp_model.cuda()
        likelihood.cuda()
    # With no training data, the model predicts from the prior.
    gp_model.eval()
    likelihood.eval()
    prior_preds = likelihood(gp_model(train_x))
    expected_var = gp_model.covar_module.outputscale + likelihood.noise
    self.assertAllClose(prior_preds.mean, torch.full_like(prior_preds.mean, fill_value=1.5))
    self.assertAllClose(
        prior_preds.variance,
        expected_var.squeeze().expand_as(prior_preds.variance),
    )
def test_indexed_train_and_eval(self):
    likelihood = GaussianLikelihood()
    model = LMCModel()
    # Jointly optimize model and likelihood hyperparameters.
    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam(
        [{"params": model.parameters()}, {"params": likelihood.parameters()}],
        lr=0.01,
    )
    # VariationalELBO is the standard variational-inference loss.
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))
    # Assign a random task index to every training point.
    arange = torch.arange(train_x.size(0))
    train_i = torch.rand(train_x.size(0)).mul(4).floor().long()
    # More iterations than usual: the NeurIPS-paper preconditioner appears to be
    # less effective for VI.
    for i in range(400):
        optimizer.zero_grad()
        output = model(train_x, task_indices=train_i)
        loss = -mll(output, train_y[arange, train_i])
        loss.backward()
        optimizer.step()
    for param in model.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    for param in likelihood.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    # Evaluate: check MAE on the training points and smoke-test the uncertainties.
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.max_eager_kernel_size(1):
        predictions = likelihood(model(train_x, task_indices=train_i))
        mean_abs_error = torch.mean(torch.abs(train_y[arange, train_i] - predictions.mean))
        self.assertLess(mean_abs_error.squeeze().item(), 0.15)
        lower, upper = predictions.confidence_region()
        self.assertEqual(lower.shape, train_i.shape)
        self.assertEqual(upper.shape, train_i.shape)
def test_kissgp_gp_mean_abs_error_cuda():
    # GPU-only smoke test: silently skipped when CUDA is unavailable.
    if torch.cuda.is_available():
        train_x, train_y, test_x, test_y = make_data(cuda=True)
        likelihood = GaussianLikelihood().cuda()
        gp_model = GPRegressionModel(train_x.data, train_y.data, likelihood).cuda()
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
        # Optimize the hyperparameters.
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(
            list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.1
        )
        optimizer.n_iter = 0
        for i in range(25):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()
        # Evaluate on the held-out points.
        gp_model.eval()
        likelihood.eval()
        test_preds = likelihood(gp_model(test_x)).mean()
        mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
        assert mean_abs_error.data.squeeze()[0] < 0.02
def test_posterior_latent_gp_and_likelihood_without_optimization(self):
    # Deliberately extreme hyperparameters; widen the bounds so they are legal.
    likelihood = GaussianLikelihood(log_noise_bounds=(-3, 3))
    gp_model = ExactGPModel(train_x.data, train_y.data, likelihood)
    gp_model.covar_module.set_bounds(log_lengthscale=(-10, 10))
    likelihood.set_bounds(log_noise=(-10, 10))
    # Near-zero lengthscale and noise make the posterior interpolate the data.
    gp_model.covar_module.initialize(log_lengthscale=-10)
    gp_model.mean_module.initialize(constant=0)
    likelihood.initialize(log_noise=-10)
    gp_model.eval()
    likelihood.eval()
    # The posterior should fit all the training data...
    function_predictions = likelihood(gp_model(train_x))
    self.assertLess(torch.norm(function_predictions.mean().data - train_y.data), 1e-3)
    self.assertLess(torch.norm(function_predictions.var().data), 1e-3)
    # ...but should revert to the prior away from the data.
    test_function_predictions = gp_model(Variable(torch.Tensor([1.1])))
    self.assertLess(torch.norm(test_function_predictions.mean().data - 0), 1e-4)
    self.assertLess(torch.norm(test_function_predictions.var().data - 1), 1e-4)
def test_posterior_latent_gp_and_likelihood_with_optimization(self):
    # Start from hyperparameters that are deliberately wrong.
    likelihood = GaussianLikelihood(log_noise_bounds=(-3, 3))
    gp_model = ExactGPModel(train_x.data, train_y.data, likelihood)
    mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)
    gp_model.covar_module.initialize(log_lengthscale=1)
    gp_model.mean_module.initialize(constant=0)
    likelihood.initialize(log_noise=1)
    # Optimize the hyperparameters.
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(
        list(gp_model.parameters()) + list(likelihood.parameters()),
        lr=0.1,
    )
    optimizer.n_iter = 0
    for _ in range(50):
        optimizer.zero_grad()
        output = gp_model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.n_iter += 1
        optimizer.step()
    # After training, held-out predictions should be accurate.
    gp_model.eval()
    likelihood.eval()
    test_function_predictions = likelihood(gp_model(test_x))
    mean_abs_error = torch.mean(torch.abs(test_y - test_function_predictions.mean()))
    self.assertLess(mean_abs_error.data.squeeze()[0], 0.05)
def test_train_on_batch_test_on_batch(self):
    # A batch of two GPs with a batched noise prior; start from bad hyperparameters.
    likelihood = GaussianLikelihood(
        noise_prior=gpytorch.priors.NormalPrior(loc=torch.zeros(2), scale=torch.ones(2)),
        batch_shape=torch.Size([2]),
    )
    gp_model = ExactGPModel(train_x12, train_y12, likelihood, batch_shape=torch.Size([2]))
    mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)
    # Optimize the hyperparameters.
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(gp_model.parameters(), lr=0.1)
    for _ in range(50):
        optimizer.zero_grad()
        output = gp_model(train_x12)
        loss = -mll(output, train_y12, train_x12).sum()
        loss.backward()
        optimizer.step()
    for param in gp_model.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    for param in likelihood.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    # Evaluate.
    gp_model.eval()
    likelihood.eval()
    # First check the batched model on non-batch test inputs.
    non_batch_predictions = likelihood(gp_model(test_x1))
    preds1 = non_batch_predictions.mean
    mean_abs_error1 = torch.mean(torch.abs(test_y1 - preds1[0]))
    self.assertLess(mean_abs_error1.squeeze().item(), 0.1)
    # Then on batched test points: check the MAE of each batch element.
    batch_predictions = likelihood(gp_model(test_x12))
    preds1 = batch_predictions.mean[0]
    preds2 = batch_predictions.mean[1]
    mean_abs_error1 = torch.mean(torch.abs(test_y1 - preds1))
    mean_abs_error2 = torch.mean(torch.abs(test_y2 - preds2))
    self.assertLess(mean_abs_error1.squeeze().item(), 0.1)
    self.assertLess(mean_abs_error2.squeeze().item(), 0.1)
    # Smoke test: batch-mode derivatives w.r.t. the test inputs must exist.
    test_x_param = torch.nn.Parameter(test_x12.data)
    batch_predictions = likelihood(gp_model(test_x_param))
    batch_predictions.mean.sum().backward()
    self.assertTrue(test_x_param.grad is not None)
    # Smoke test: non-batch-mode derivatives w.r.t. the test inputs must exist.
    test_x_param = torch.nn.Parameter(test_x1.data)
    batch_predictions = likelihood(gp_model(test_x_param))
    batch_predictions.mean.sum().backward()
    self.assertTrue(test_x_param.grad is not None)
def test_kissgp_gp_mean_abs_error(self):
    likelihood = GaussianLikelihood()
    gp_model = GPRegressionModel(train_x.data, train_y.data, likelihood)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
    # Optimize the hyperparameters.
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(
        list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.2
    )
    optimizer.n_iter = 0
    for _ in range(15):
        optimizer.zero_grad()
        output = gp_model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.n_iter += 1
        # Every parameter should receive a nonzero gradient each iteration.
        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        optimizer.step()
    # Evaluate with fast predictive variances.
    gp_model.eval()
    likelihood.eval()
    with gpytorch.fast_pred_var():
        test_preds = likelihood(gp_model(test_x)).mean()
        mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
        self.assertLess(mean_abs_error.data.squeeze().item(), 0.15)
def test_regression_error(self, cuda=False, skip_logdet_forward=False, cholesky=False):
    train_x, train_y = train_data(cuda=cuda)
    likelihood = GaussianLikelihood()
    model = SVGPRegressionModel(torch.linspace(0, 1, 25))
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_y))
    if cuda:
        likelihood = likelihood.cuda()
        model = model.cuda()
        mll = mll.cuda()
    # Train model and likelihood jointly.
    model.train()
    likelihood.train()
    optimizer = optim.Adam(
        [{"params": model.parameters()}, {"params": likelihood.parameters()}], lr=0.01
    )
    # Wrap linear_cg so we can assert whether CG was actually invoked.
    _wrapped_cg = MagicMock(wraps=gpytorch.utils.linear_cg)
    with gpytorch.settings.max_cholesky_size(
        math.inf if cholesky else 0
    ), gpytorch.settings.skip_logdet_forward(skip_logdet_forward), warnings.catch_warnings(
        record=True
    ) as w, patch(
        "gpytorch.utils.linear_cg", new=_wrapped_cg
    ) as linear_cg_mock:
        for _ in range(150):
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()
        # CG must be used iff Cholesky is not forced; no warnings allowed either way.
        self.assertEqual(len(w), 0)
        if cholesky:
            self.assertFalse(linear_cg_mock.called)
        else:
            self.assertTrue(linear_cg_mock.called)
    for param in model.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    for param in likelihood.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    # Evaluate on the training data.
    model.eval()
    likelihood.eval()
    test_preds = likelihood(model(train_x)).mean.squeeze()
    mean_abs_error = torch.mean(torch.abs(train_y - test_preds) / 2)
    self.assertLess(mean_abs_error.item(), 1e-1)
def train_gp(train_x, train_y, use_ard, num_steps, hypers=None):
    """Fit a GP model where train_x is in [0, 1]^d and train_y is standardized.

    Args:
        train_x: (n, d) tensor of normalized inputs.
        train_y: (n,) tensor of standardized targets.
        use_ard: If True, use one lengthscale per input dimension.
        num_steps: Number of Adam optimization steps.
        hypers: Optional state dict to warm-start the model from. Defaults to
            ``None`` (the original signature used a mutable ``{}`` default, a
            classic Python pitfall; ``None`` is backward-compatible since the
            empty dict was falsy and took the same branch).

    Returns:
        The fitted GP model, switched to eval mode.
    """
    assert train_x.ndim == 2
    assert train_y.ndim == 1
    assert train_x.shape[0] == train_y.shape[0]

    # Create hyperparameter bounds.
    noise_constraint = Interval(5e-4, 0.2)
    if use_ard:
        lengthscale_constraint = Interval(0.005, 2.0)
    else:
        # [0.005, sqrt(dim)]
        lengthscale_constraint = Interval(0.005, math.sqrt(train_x.shape[1]))
    outputscale_constraint = Interval(0.05, 20.0)

    # Create likelihood and model on the same device/dtype as the data.
    likelihood = GaussianLikelihood(noise_constraint=noise_constraint).to(
        device=train_x.device, dtype=train_y.dtype
    )
    ard_dims = train_x.shape[1] if use_ard else None
    model = GP(
        train_x=train_x,
        train_y=train_y,
        likelihood=likelihood,
        lengthscale_constraint=lengthscale_constraint,
        outputscale_constraint=outputscale_constraint,
        ard_dims=ard_dims,
    ).to(device=train_x.device, dtype=train_x.dtype)

    # Find optimal model hyperparameters.
    model.train()
    likelihood.train()

    # "Loss" for GPs — the marginal log likelihood.
    mll = ExactMarginalLogLikelihood(likelihood, model)

    # Initialize hypers: warm-start from a provided state dict, else defaults.
    if hypers:
        model.load_state_dict(hypers)
    else:
        model.initialize(
            **{
                "covar_module.outputscale": 1.0,
                "covar_module.base_kernel.lengthscale": 0.5,
                "likelihood.noise": 0.005,
            }
        )

    # Use the Adam optimizer.
    optimizer = torch.optim.Adam([{"params": model.parameters()}], lr=0.1)
    for _ in range(num_steps):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()

    # Switch to eval mode.
    model.eval()
    likelihood.eval()
    return model
def test_regression_error_full(self, skip_logdet_forward=False):
    train_x, train_y = train_data()
    likelihood = GaussianLikelihood()
    model = SVGPRegressionModel(inducing_points=train_x, learn_locs=False)
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_y))
    # Train model and likelihood jointly.
    model.train()
    likelihood.train()
    optimizer = optim.Adam(
        [{"params": model.parameters()}, {"params": likelihood.parameters()}], lr=0.01
    )
    with gpytorch.settings.skip_logdet_forward(skip_logdet_forward):
        for _ in range(200):
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()
    for param in model.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    for param in likelihood.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    # Evaluate on the training data.
    model.eval()
    likelihood.eval()
    test_preds = likelihood(model(train_x)).mean.squeeze()
    mean_abs_error = torch.mean(torch.abs(train_y - test_preds) / 2)
    self.assertLess(mean_abs_error.item(), 1e-1)
def test_kissgp_gp_fast_pred_var():
    # The entire test runs under fast predictive variances.
    with gpytorch.fast_pred_var():
        train_x, train_y, test_x, test_y = make_data()
        likelihood = GaussianLikelihood()
        gp_model = GPRegressionModel(train_x.data, train_y.data, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
        # Optimize the hyperparameters.
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(
            list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.1
        )
        optimizer.n_iter = 0
        for i in range(25):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()
        # Evaluate.
        gp_model.eval()
        likelihood.eval()
        # First call populates the prediction cache.
        test_function_predictions = likelihood(gp_model(train_x))
        # Bump the noise way up so the predictive variance is dominated by it —
        # that makes the expected variance easy to check.
        likelihood.log_noise.data.fill_(3)
        test_function_predictions = likelihood(gp_model(train_x))
        noise = likelihood.log_noise.exp()
        var_diff = (test_function_predictions.var() - noise).abs()
        assert torch.max(var_diff.data / noise.data) < 0.05
def test_kissgp_gp_mean_abs_error(self):
    likelihood = GaussianLikelihood()
    gp_model = GPRegressionModel(train_x, train_y, likelihood)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
    # Optimize the hyperparameters under a small preconditioner + Toeplitz math.
    gp_model.train()
    likelihood.train()
    with gpytorch.settings.max_preconditioner_size(5), gpytorch.settings.use_toeplitz(True):
        optimizer = optim.Adam(gp_model.parameters(), lr=0.1)
        optimizer.n_iter = 0
        for _ in range(8):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()
        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
    # Evaluate.
    # NOTE(review): the original comment said "use the other toeplitz option here",
    # but the setting is use_toeplitz(True) in both phases — confirm intent.
    with gpytorch.settings.max_preconditioner_size(5), gpytorch.settings.use_toeplitz(True):
        gp_model.eval()
        likelihood.eval()
        test_preds = likelihood(gp_model(test_x)).mean
        mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
        self.assertLess(mean_abs_error.squeeze().item(), 0.2)
def test_prior(self, cuda=False):
    train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
    # Deliberately extreme hyperparameters with loosened priors.
    likelihood = GaussianLikelihood(
        noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1)
    )
    gp_model = ExactGPModel(None, None, likelihood)
    # Widen the lengthscale prior to accommodate the extreme parameters.
    gp_model.covar_module.base_kernel.register_prior(
        "lengthscale_prior", SmoothedBoxPrior(exp(-10), exp(10), sigma=0.5), "raw_lengthscale"
    )
    gp_model.mean_module.initialize(constant=1.5)
    gp_model.covar_module.base_kernel.initialize(lengthscale=1)
    likelihood.initialize(noise=0)
    if cuda:
        gp_model.cuda()
        likelihood.cuda()
    # With no training data, predictions come from the prior.
    gp_model.eval()
    likelihood.eval()
    prior_preds = likelihood(gp_model(train_x))
    expected_var = gp_model.covar_module.outputscale + likelihood.noise
    self.assertLess(torch.norm(prior_preds.mean - 1.5), 1e-3)
    self.assertLess(torch.norm(prior_preds.variance - expected_var), 1e-3)
def test_spectral_mixture_gp_mean_abs_error(self):
    likelihood = GaussianLikelihood(log_noise_bounds=(-5, 5))
    gp_model = SpectralMixtureGPModel(train_x.data, train_y.data, likelihood)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
    # Optimize the hyperparameters.
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(
        list(gp_model.parameters()) + list(likelihood.parameters()),
        lr=0.1,
    )
    optimizer.n_iter = 0
    # Extra trace samples stabilize the stochastic log-det estimate.
    with gpytorch.settings.num_trace_samples(100):
        for _ in range(50):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()
    # Evaluate on the held-out points.
    gp_model.eval()
    likelihood.eval()
    test_preds = likelihood(gp_model(test_x)).mean()
    mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
    # The spectral mixture kernel should trivially extrapolate the sine function.
    self.assertLess(mean_abs_error.data.squeeze()[0], 0.15)
def test_fantasy_updates_batch(self, cuda=False):
    train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
    # Start from hyperparameters that are deliberately wrong.
    likelihood = GaussianLikelihood()
    gp_model = ExactGPModel(train_x, train_y, likelihood)
    mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)
    gp_model.covar_module.base_kernel.initialize(lengthscale=exp(1))
    gp_model.mean_module.initialize(constant=0)
    likelihood.initialize(noise=exp(1))
    if cuda:
        gp_model.cuda()
        likelihood.cuda()
    # Optimize the hyperparameters.
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(
        list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.15
    )
    optimizer.n_iter = 0
    for _ in range(50):
        optimizer.zero_grad()
        with gpytorch.settings.debug(False):
            output = gp_model(train_x)
            loss = -mll(output, train_y)
        loss.backward()
        optimizer.n_iter += 1
        optimizer.step()
    for param in gp_model.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    for param in likelihood.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    optimizer.step()
    with gpytorch.settings.fast_pred_var():
        # Evaluate the fully-trained model.
        gp_model.eval()
        likelihood.eval()
        test_function_predictions = likelihood(gp_model(test_x))
        # Cut the training data down, then add it back via the fantasy interface.
        gp_model.set_train_data(train_x[:5], train_y[:5], strict=False)
        likelihood(gp_model(test_x))
        fantasy_x = (
            train_x[5:].clone().unsqueeze(0).unsqueeze(-1).repeat(3, 1, 1).requires_grad_(True)
        )
        fantasy_y = train_y[5:].unsqueeze(0).repeat(3, 1)
        fant_model = gp_model.get_fantasy_model(fantasy_x, fantasy_y)
        fant_function_predictions = likelihood(fant_model(test_x))
        # Fantasy predictions must match the full-data predictions...
        self.assertTrue(
            approx_equal(test_function_predictions.mean, fant_function_predictions.mean[0])
        )
        # ...and gradients must flow back to the fantasy inputs.
        fant_function_predictions.mean.sum().backward()
        self.assertTrue(fantasy_x.grad is not None)
def test_kissgp_gp_mean_abs_error(self):
    likelihood = GaussianLikelihood()
    gp_model = GPRegressionModel(train_x, train_y, likelihood)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
    with gpytorch.settings.max_preconditioner_size(10), gpytorch.settings.max_cg_iterations(50):
        with gpytorch.beta_features.fast_pred_var():
            # Optimize the hyperparameters.
            gp_model.train()
            likelihood.train()
            optimizer = optim.Adam(gp_model.parameters(), lr=0.01)
            optimizer.n_iter = 0
            for _ in range(15):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()
            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            # Evaluate on the held-out points.
            gp_model.eval()
            likelihood.eval()
            test_preds = likelihood(gp_model(test_x)).mean()
            mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
            self.assertLess(mean_abs_error.squeeze().item(), 0.2)
def test_posterior_latent_gp_and_likelihood_without_optimization(self, cuda=False):
    train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
    # Deliberately extreme hyperparameters: near-zero lengthscale and noise.
    likelihood = GaussianLikelihood()
    gp_model = ExactGPModel(train_x, train_y, likelihood)
    gp_model.covar_module.base_kernel.initialize(lengthscale=exp(-15))
    likelihood.initialize(noise=exp(-15))
    if cuda:
        gp_model.cuda()
        likelihood.cuda()
    gp_model.eval()
    likelihood.eval()
    # The posterior should interpolate all the training data...
    with gpytorch.settings.debug(False):
        function_predictions = likelihood(gp_model(train_x))
        self.assertLess(torch.norm(function_predictions.mean - train_y), 1e-3)
        self.assertLess(torch.norm(function_predictions.variance), 1e-3)
        # ...but should revert to the prior away from the data.
        test_function_predictions = gp_model(torch.tensor([1.1]).type_as(test_x))
        self.assertLess(torch.norm(test_function_predictions.mean - 0), 1e-4)
        self.assertLess(
            torch.norm(test_function_predictions.variance - gp_model.covar_module.outputscale),
            1e-4,
        )
def test_gp_posterior_single_training_point_smoke_test(self):
    train_x, test_x, train_y, _ = self._get_data()
    # Keep exactly one training point.
    train_x = train_x[0].unsqueeze(-1).unsqueeze(-1)
    train_y = train_y[0].unsqueeze(-1)
    likelihood = GaussianLikelihood()
    gp_model = ExactGPModel(train_x, train_y, likelihood)
    gp_model.eval()
    likelihood.eval()
    with gpytorch.settings.fast_pred_var():
        # Predictions must be NaN-free under fast predictive variances...
        preds = gp_model(test_x)
        single_mean = preds.mean
        single_variance = preds.variance
        self.assertFalse(torch.any(torch.isnan(single_variance)))
        self.assertFalse(torch.any(torch.isnan(single_mean)))
        # ...and also after a train/eval round trip clears the caches.
        gp_model.train()
        gp_model.eval()
        preds = gp_model(test_x)
        single_mean = preds.mean
        single_variance = preds.variance
        self.assertFalse(torch.any(torch.isnan(single_variance)))
        self.assertFalse(torch.any(torch.isnan(single_mean)))
def test_loading_old_model(self):
    train_x, train_y = train_data(cuda=False)
    likelihood = GaussianLikelihood()
    model = SVGPRegressionModel(
        torch.linspace(0, 1, 25), gpytorch.variational.CholeskyVariationalDistribution
    )
    data_file = Path(__file__).parent.joinpath(
        "old_variational_strategy_model.pth"
    ).resolve()
    state_dicts = torch.load(data_file)
    likelihood.load_state_dict(state_dicts["likelihood"], strict=False)
    # Loading the old model state must emit an OldVersionWarning.
    with warnings.catch_warnings(record=True) as ws:
        # Makes sure warnings we catch don't cause `-w error` to fail.
        warnings.simplefilter("always", OldVersionWarning)
        model.load_state_dict(state_dicts["model"])
        self.assertTrue(any(issubclass(w.category, OldVersionWarning) for w in ws))
    # The loaded model should still predict its own training data well.
    with torch.no_grad():
        model.eval()
        likelihood.eval()
        test_preds = likelihood(model(train_x)).mean.squeeze()
        mean_abs_error = torch.mean(torch.abs(train_y - test_preds) / 2)
        self.assertLess(mean_abs_error.item(), 1e-1)
def test_regression_error_shared_inducing_locations(self):
    train_x, train_y = train_data()
    likelihood = GaussianLikelihood()
    inducing_points = torch.linspace(0, 1, 25).unsqueeze(-1)
    model = SVGPRegressionModel(inducing_points)
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(-1))
    # Train model and likelihood jointly.
    model.train()
    likelihood.train()
    optimizer = optim.Adam(
        [{"params": model.parameters()}, {"params": likelihood.parameters()}], lr=0.01
    )
    for _ in range(200):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss = loss.sum()
        loss.backward()
        optimizer.step()
    for param in model.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    for param in likelihood.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    # Evaluate per-task MAE on the training data.
    model.eval()
    likelihood.eval()
    test_preds = likelihood(model(train_x)).mean.squeeze()
    mean_abs_error = torch.mean(torch.abs(train_y[0, :] - test_preds[0, :]) / 2)
    mean_abs_error2 = torch.mean(torch.abs(train_y[1, :] - test_preds[1, :]) / 2)
    self.assertLess(mean_abs_error.item(), 1e-1)
    self.assertLess(mean_abs_error2.item(), 1e-1)
def test_sgpr_mean_abs_error(self):
    train_x, train_y, test_x, test_y = make_data()
    likelihood = GaussianLikelihood()
    gp_model = GPRegressionModel(train_x, train_y, likelihood)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
    # Optimize the hyperparameters.
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(gp_model.parameters(), lr=0.1)
    for _ in range(30):
        optimizer.zero_grad()
        output = gp_model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()
    for param in gp_model.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    for param in likelihood.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    # Evaluate on the held-out points.
    gp_model.eval()
    likelihood.eval()
    test_preds = likelihood(gp_model(test_x)).mean
    mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
    self.assertLess(mean_abs_error.squeeze().item(), 0.05)
def test_kissgp_gp_mean_abs_error(self):
    likelihood = GaussianLikelihood()
    gp_model = GPRegressionModel(train_x.data, train_y.data, likelihood)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
    # Optimize the hyperparameters.
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(
        list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.2
    )
    optimizer.n_iter = 0
    for _ in range(20):
        optimizer.zero_grad()
        output = gp_model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.n_iter += 1
        optimizer.step()
    # Evaluate on the held-out points.
    gp_model.eval()
    likelihood.eval()
    test_preds = likelihood(gp_model(test_x)).mean()
    mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
    self.assertLess(mean_abs_error.data.squeeze()[0], 0.1)
def test_posterior_latent_gp_and_likelihood_without_optimization(self):
    # Deliberately extreme hyperparameters with loosened smoothed-box priors.
    likelihood = GaussianLikelihood(
        log_noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1, log_transform=True)
    )
    gp_model = ExactGPModel(train_x, train_y, likelihood)
    # Widen the lengthscale prior to accommodate the extreme parameters.
    gp_model.covar_module.set_parameter_priors(
        log_lengthscale=SmoothedBoxPrior(exp(-10), exp(10), sigma=0.5, log_transform=True)
    )
    gp_model.covar_module.initialize(log_lengthscale=-10)
    likelihood.initialize(log_noise=-10)
    gp_model.eval()
    likelihood.eval()
    # The posterior should interpolate all the training data...
    with gpytorch.settings.debug(False):
        function_predictions = likelihood(gp_model(train_x))
        self.assertLess(torch.norm(function_predictions.mean() - train_y), 1e-3)
        self.assertLess(torch.norm(function_predictions.var()), 1e-3)
        # ...but should revert to the prior away from the data.
        test_function_predictions = gp_model(torch.Tensor([1.1]))
        self.assertLess(torch.norm(test_function_predictions.mean() - 0), 1e-4)
        self.assertLess(torch.norm(test_function_predictions.var() - 1), 1e-4)
def test_kissgp_gp_mean_abs_error(self):
    """Train a variational KISS-GP with minibatches and check the test-set MAE.

    Fixes two defects in the original: the VariationalMarginalLogLikelihood
    was constructed twice with the same arguments (dead first construction,
    removed), and a comment claimed SGD was used while the code uses Adam.
    """
    train_x, train_y, test_x, test_y = make_data()
    train_dataset = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
    model = GPRegressionModel()
    likelihood = GaussianLikelihood()
    # Jointly optimize model and likelihood parameters with Adam.
    optimizer = torch.optim.Adam(
        [{"params": model.parameters()}, {"params": likelihood.parameters()}],
        lr=0.01,
    )
    # Our loss object: the VariationalMarginalLogLikelihood.
    mll = gpytorch.mlls.VariationalMarginalLogLikelihood(
        likelihood, model, num_data=train_y.size(0)
    )

    # The training loop
    def train(n_epochs=15):
        # Drop the learning rate by 1/10 after 3/4 of training to help the
        # model converge to a minimum.
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[0.75 * n_epochs], gamma=0.1
        )
        for _ in range(n_epochs):
            scheduler.step()
            for x_batch, y_batch in train_loader:
                x_batch = x_batch.float()
                y_batch = y_batch.float()
                optimizer.zero_grad()
                output = model(x_batch)
                loss = -mll(output, y_batch)
                loss.backward()
                optimizer.step()

    train()
    # Every parameter should have received a nonzero gradient.
    for _, param in model.named_parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    for param in likelihood.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    # Test the model
    model.eval()
    likelihood.eval()
    test_preds = likelihood(model(test_x)).mean
    mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
    self.assertLess(mean_abs_error.squeeze().item(), 0.1)
def test_sgpr_mean_abs_error(self):
    # Suppress numerical warnings for this test.
    warnings.simplefilter("ignore", NumericalWarning)
    train_x, train_y, test_x, test_y = make_data()
    likelihood = GaussianLikelihood()
    gp_model = GPRegressionModel(train_x, train_y, likelihood)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
    # Optimize the hyperparameters.
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(gp_model.parameters(), lr=0.1)
    for _ in range(30):
        optimizer.zero_grad()
        output = gp_model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()
    # The SGPR covariance must evaluate to a low-rank-plus-diagonal LazyTensor.
    kernel = likelihood(gp_model(train_x)).lazy_covariance_matrix.evaluate_kernel()
    self.assertIsInstance(kernel, gpytorch.lazy.LowRankRootAddedDiagLazyTensor)
    for param in gp_model.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    # Evaluate on the held-out points.
    gp_model.eval()
    likelihood.eval()
    test_preds = likelihood(gp_model(test_x)).mean
    mean_abs_error = torch.mean(torch.abs(test_y - test_preds))
    self.assertLess(mean_abs_error.squeeze().item(), 0.1)
    # Predictive variances must agree with the covariance diagonal, and fall
    # between the noise (below) and noise + outputscale (above), within slack.
    test_vars = likelihood(gp_model(test_x)).variance
    self.assertAllClose(
        test_vars,
        likelihood(gp_model(test_x)).covariance_matrix.diagonal(dim1=-1, dim2=-2),
    )
    self.assertGreater(test_vars.min().item() + 0.1, likelihood.noise.item())
    self.assertLess(
        test_vars.max().item() - 0.05,
        likelihood.noise.item() + gp_model.covar_module.base_kernel.outputscale.item(),
    )
    # On the training data, the posterior should fit tightly.
    test_outputs = likelihood(gp_model(train_x))
    self.assertLess((test_outputs.mean - train_y).max().item(), 0.1)
    self.assertLess(test_outputs.variance.max().item(), likelihood.noise.item() * 2)
def test_posterior_latent_gp_and_likelihood_fast_pred_var(self, cuda=False):
    train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
    with gpytorch.settings.fast_pred_var(), gpytorch.settings.debug(False):
        # Start from hyperparameters that are deliberately wrong.
        likelihood = GaussianLikelihood(
            noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1)
        )
        gp_model = ExactGPModel(train_x, train_y, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
        gp_model.covar_module.base_kernel.initialize(lengthscale=exp(1))
        gp_model.mean_module.initialize(constant=0)
        likelihood.initialize(noise=exp(1))
        if cuda:
            gp_model.cuda()
            likelihood.cuda()
        # Optimize the hyperparameters.
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(
            list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.1
        )
        optimizer.n_iter = 0
        for _ in range(50):
            optimizer.zero_grad()
            output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.n_iter += 1
            optimizer.step()
        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        optimizer.step()
        # Evaluate.
        gp_model.eval()
        likelihood.eval()
        # First call populates the fast-pred-var cache.
        test_function_predictions = likelihood(gp_model(train_x))
        # Bump the noise way up so the predictive variance is dominated by it,
        # which makes the expected variance easy to check.
        likelihood.noise_covar.raw_noise.data.fill_(3)
        test_function_predictions = likelihood(gp_model(train_x))
        noise = likelihood.noise_covar.noise
        var_diff = (test_function_predictions.variance - noise).abs()
        self.assertLess(torch.max(var_diff / noise), 0.05)
def test_kissgp_gp_mean_abs_error(self):
    """Train a variational KISS-GP on minibatches and check the test MAE.

    Uses SGD with a MultiStepLR schedule (10x decay after 15 epochs).
    """
    train_x, train_y, test_x, test_y = make_data()
    train_dataset = TensorDataset(train_x, train_y)
    loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
    gp_model = GPRegressionModel()
    likelihood = GaussianLikelihood()
    mll = gpytorch.mlls.VariationalMarginalLogLikelihood(
        likelihood,
        gp_model,
        n_data=len(train_y),
    )

    # Optimize the model
    gp_model.train()
    likelihood.train()

    with gpytorch.beta_features.diagonal_correction():
        optimizer = optim.SGD(
            list(gp_model.parameters()) + list(likelihood.parameters()),
            lr=0.1,
        )
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[15],
            gamma=0.1,
        )
        for _ in range(20):
            for x_batch, y_batch in loader:
                x_batch = Variable(x_batch.float())
                y_batch = Variable(y_batch.float())
                optimizer.zero_grad()
                output = gp_model(x_batch)
                loss = -mll(output, y_batch)
                loss.backward()
                optimizer.step()
            # FIX: since PyTorch 1.1 the LR scheduler must be stepped
            # *after* the optimizer updates for the epoch. The original
            # stepped it at the top of the epoch, which skews the
            # milestone schedule and triggers a UserWarning.
            scheduler.step()

        # All parameters should have received non-trivial gradients.
        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        # FIX: removed a redundant trailing optimizer.step() that
        # re-applied the final minibatch's stale gradients.

        # Test the model
        gp_model.eval()
        likelihood.eval()
        test_preds = likelihood(gp_model(Variable(test_x))).mean()
        mean_abs_error = torch.mean(
            torch.abs(Variable(test_y) - test_preds))
        self.assertLess(mean_abs_error.data.squeeze().item(), 0.1)
def test_regression_error(
    self,
    cuda=False,
    mll_cls=gpytorch.mlls.VariationalELBO,
    distribution_cls=gpytorch.variational.CholeskyVariationalDistribution,
):
    """Train an SVGP model and verify its in-sample error; then smoke-test fantasization."""
    train_x, train_y = train_data(cuda=cuda)
    likelihood = GaussianLikelihood()
    model = SVGPRegressionModel(torch.linspace(0, 1, 25), distribution_cls)
    mll = mll_cls(likelihood, model, num_data=len(train_y))
    if cuda:
        likelihood = likelihood.cuda()
        model = model.cuda()
        mll = mll.cuda()

    # Jointly optimize the variational parameters and hyperparameters.
    model.train()
    likelihood.train()
    optimizer = optim.Adam(
        [{"params": model.parameters()}, {"params": likelihood.parameters()}],
        lr=0.01,
    )
    for _epoch in range(200):
        optimizer.zero_grad()
        loss = -mll(model(train_x), train_y)
        loss.backward()
        optimizer.step()

    # Every parameter should carry a non-trivial gradient after training.
    for parameter in model.parameters():
        self.assertTrue(parameter.grad is not None)
        self.assertGreater(parameter.grad.norm().item(), 0)
    for parameter in likelihood.parameters():
        self.assertTrue(parameter.grad is not None)
        self.assertGreater(parameter.grad.norm().item(), 0)

    # Set back to eval mode and measure the in-sample mean absolute error.
    model.eval()
    likelihood.eval()
    predictions = likelihood(model(train_x)).mean.squeeze()
    mae = torch.mean(torch.abs(train_y - predictions) / 2)
    self.assertLess(mae.item(), 0.014)

    if distribution_cls is gpytorch.variational.CholeskyVariationalDistribution:
        # finally test fantasization
        # we only will check that tossing the entire training set into the model will reduce the mae
        model.likelihood = likelihood
        fantasy_model = model.get_fantasy_model(train_x, train_y)
        fantasy_predictions = fantasy_model.likelihood(
            fantasy_model(train_x)).mean.squeeze()
        fantasy_mae = torch.mean(torch.abs(train_y - fantasy_predictions) / 2)
        # TODO: figure out why this error is worse than before
        self.assertLess(fantasy_mae.item(), 0.15)
def test_posterior_latent_gp_and_likelihood_with_optimization(self):
    """Train an exact GP on one task, then batch-predict on two stacked tasks.

    Relies on module-level tensors ``train_x1``/``train_y1`` and
    ``train_x2``/``train_y2`` (per the comment below, sine and cosine data
    respectively — confirm against the fixture definitions) plus the
    corresponding ``test_*`` tensors. Uses the old-style gpytorch API
    (``log_noise_bounds``, ``mean()`` as a method).
    """
    # We're manually going to set the hyperparameters to something they shouldn't be
    likelihood = GaussianLikelihood(log_noise_bounds=(-3, 3))
    gp_model = ExactGPModel(train_x1.data, train_y1.data, likelihood)
    mll = gpytorch.ExactMarginalLogLikelihood(likelihood, gp_model)
    gp_model.covar_module.initialize(log_lengthscale=1)
    gp_model.mean_module.initialize(constant=0)
    likelihood.initialize(log_noise=1)

    # Find optimal model hyperparameters
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(list(gp_model.parameters()) +
                           list(likelihood.parameters()),
                           lr=0.1)
    optimizer.n_iter = 0
    for _ in range(50):
        optimizer.zero_grad()
        output = gp_model(train_x1)
        loss = -mll(output, train_y1)
        loss.backward()
        optimizer.n_iter += 1
        optimizer.step()

    # Verify that training produced non-trivial gradients everywhere.
    for param in gp_model.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    for param in likelihood.parameters():
        self.assertTrue(param.grad is not None)
        self.assertGreater(param.grad.norm().item(), 0)
    # NOTE(review): this extra step re-applies the stale gradients of the
    # final iteration; it looks redundant — confirm whether it is intended.
    optimizer.step()

    # Test the model
    gp_model.eval()
    likelihood.eval()

    # Create data batches
    # Stack the two tasks along a new leading batch dimension.
    train_x12 = torch.cat((train_x1.unsqueeze(0), train_x2.unsqueeze(0)),
                          dim=0).contiguous()
    train_y12 = torch.cat((train_y1.unsqueeze(0), train_y2.unsqueeze(0)),
                          dim=0).contiguous()
    test_x12 = torch.cat((test_x1.unsqueeze(0), test_x2.unsqueeze(0)),
                         dim=0).contiguous()

    # Update gp model to use both sine and cosine training data as train data
    gp_model.set_train_data(train_x12, train_y12, strict=False)

    # Make predictions for both sets of test points, and check MAEs.
    batch_predictions = likelihood(gp_model(test_x12))
    preds1 = batch_predictions.mean()[0]
    preds2 = batch_predictions.mean()[1]
    mean_abs_error1 = torch.mean(torch.abs(test_y1 - preds1))
    mean_abs_error2 = torch.mean(torch.abs(test_y2 - preds2))
    self.assertLess(mean_abs_error1.data.squeeze().item(), 0.05)
    self.assertLess(mean_abs_error2.data.squeeze().item(), 0.05)
def test_regression_error(
    self,
    cuda=False,
    mll_cls=gpytorch.mlls.VariationalELBO,
    distribution_cls=gpytorch.variational.CholeskyVariationalDistribution,
):
    """Train an SVGP model while asserting that CG is never invoked.

    Wraps ``gpytorch.utils.linear_cg`` in a MagicMock for the whole
    train/eval cycle so we can assert afterwards that no CG solves
    happened, and records warnings to check that no
    ExtraComputationWarning was raised.
    """
    train_x, train_y = train_data(cuda=cuda)
    likelihood = GaussianLikelihood()
    model = SVGPRegressionModel(torch.linspace(0, 1, 25), distribution_cls)
    mll = mll_cls(likelihood, model, num_data=len(train_y))
    if cuda:
        likelihood = likelihood.cuda()
        model = model.cuda()
        mll = mll.cuda()

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()
    optimizer = optim.Adam([{
        "params": model.parameters()
    }, {
        "params": likelihood.parameters()
    }],
                           lr=0.01)

    # Wrap linear_cg so call counts are recorded while behavior is unchanged.
    _wrapped_cg = MagicMock(wraps=gpytorch.utils.linear_cg)
    _cg_mock = patch("gpytorch.utils.linear_cg", new=_wrapped_cg)
    # Both the patch and the warning capture must stay active across
    # training *and* evaluation for the final assertions to be meaningful.
    with warnings.catch_warnings(record=True) as ws, _cg_mock as cg_mock:
        for _ in range(150):
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()

        # Every parameter should have a non-trivial gradient from training.
        for param in model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        for param in likelihood.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)

        # Set back to eval mode
        model.eval()
        likelihood.eval()
        test_preds = likelihood(model(train_x)).mean.squeeze()
        mean_abs_error = torch.mean(torch.abs(train_y - test_preds) / 2)
        self.assertLess(mean_abs_error.item(), 1e-1)

        # Make sure CG was called (or not), and no warnings were thrown
        self.assertFalse(cg_mock.called)
        self.assertFalse(
            any(
                issubclass(w.category, ExtraComputationWarning)
                for w in ws))