def __init__(self, x: torch.Tensor, xe: torch.Tensor, y: torch.Tensor,
             lik: GaussianLikelihood, **conf):
    super().__init__((x, xe), y.squeeze(), lik)
    mean = conf.get('mean', ConstantMean())
    kern = conf.get(
        'kern',
        ScaleKernel(MaternKernel(nu=1.5, ard_num_dims=x.shape[1]),
                    outputscale_prior=GammaPrior(0.5, 0.5)))
    kern_emb = conf.get('kern_emb', MaternKernel(nu=2.5))

    self.multi_task = y.shape[1] > 1
    self.mean = mean if not self.multi_task else MultitaskMean(
        mean, num_tasks=y.shape[1])
    if x.shape[1] > 0:
        self.kern = kern if not self.multi_task else MultitaskKernel(
            kern, num_tasks=y.shape[1])
    if xe.shape[1] > 0:
        assert 'num_uniqs' in conf
        num_uniqs = conf['num_uniqs']
        emb_sizes = conf.get('emb_sizes', None)
        self.emb_trans = EmbTransform(num_uniqs, emb_sizes=emb_sizes)
        self.kern_emb = kern_emb if not self.multi_task else MultitaskKernel(
            kern_emb, num_tasks=y.shape[1])
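# A minimal sketch of overriding the `conf` keyword defaults above. The
# class name `GP` in the commented call and the `num_uniqs` values are
# illustrative assumptions, not part of the original snippet.
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.priors import GammaPrior

conf = {
    'kern': ScaleKernel(MaternKernel(nu=2.5, ard_num_dims=4),
                        outputscale_prior=GammaPrior(0.5, 0.5)),
    'num_uniqs': [3, 5],   # cardinality of each categorical column in xe
    'emb_sizes': None,     # let EmbTransform choose the embedding sizes
}
# gp = GP(x, xe, y, GaussianLikelihood(), **conf)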
def __init__(self, mean_list, kernel_list, num_points=100, num_samples=1000,
             amplitude_range=(-5., 5.)):
    self.mean_list = mean_list
    self.kernel_list = kernel_list
    self.num_config = len(mean_list) * len(kernel_list)
    self.num_samples = num_samples
    self.num_points = num_points
    self.x_dim = 1  # x and y dims are fixed for this dataset
    self.y_dim = 1
    self.amplitude_range = amplitude_range
    self.data = []

    # initialize likelihood and evaluation grid
    x = torch.linspace(self.amplitude_range[0], self.amplitude_range[1],
                       num_points).unsqueeze(1)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    mean_dict = {'constant': ConstantMean(), 'linear': LinearMean(1)}
    kernel_dict = {
        'RBF': RBFKernel(),
        'cosine': CosineKernel(),
        'linear': LinearKernel(),
        'periodic': PeriodicKernel(period_length=0.5),
        'LCM': LCMKernel(base_kernels=[CosineKernel()], num_tasks=1),
        'polynomial': PolynomialKernel(power=2),
        'matern': MaternKernel(),
    }

    # create a different GP from each possible (mean, kernel) configuration
    for mean in self.mean_list:
        for kernel in self.kernel_list:
            # evaluate the GP prior distribution
            with gpytorch.settings.prior_mode(True):
                model = ExactGPModel(x, None, likelihood,
                                     mean_module=mean_dict[mean],
                                     kernel_module=kernel_dict[kernel])
                gp = model(x)
            # sample functions from the current configuration
            for i in range(num_samples // self.num_config + 1):
                y = gp.sample()
                self.data.append((x, y.unsqueeze(1)))
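# The prior-mode sampling trick above, in isolation: under
# `gpytorch.settings.prior_mode(True)` (or with no training data), calling
# an ExactGP returns its prior, which can then be sampled. Self-contained
# sketch; the `_PriorGP` class here is an assumption for illustration:
import torch
import gpytorch
from gpytorch.kernels import RBFKernel

class _PriorGP(gpytorch.models.ExactGP):
    def __init__(self, likelihood):
        # no training data: this GP is only used for prior sampling
        super().__init__(None, None, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = RBFKernel()

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x))

x = torch.linspace(-5., 5., 100).unsqueeze(1)
model = _PriorGP(gpytorch.likelihoods.GaussianLikelihood())
model.eval()
with gpytorch.settings.prior_mode(True):
    y = model(x).sample()  # one draw from the GP prior at the grid points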
def __init__(self, train_x, train_y, likelihood, outputscale=10,
             transform_input_fn=None):
    super().__init__(train_x, train_y, likelihood)
    self.mean_module = ZeroMean()
    self.kernel = ScaleKernel(MaternKernel(nu=2.5))
    self.likelihood.noise_covar.noise = 1e-8
    self.kernel.outputscale = outputscale
    self.transform_input_fn = transform_input_fn
def _draw_gp_function(self, X, lengthscale=10.0, kernel_str="RBF"):
    if kernel_str == "RBF":
        kernel = RBFKernel()
    elif kernel_str == "Mat":
        kernel = MaternKernel(nu=0.5)
    else:
        raise Exception("Invalid kernel string: {}".format(kernel_str))
    kernel.lengthscale = lengthscale
    with torch.no_grad():
        lazy_cov = kernel(X)
        mean = torch.zeros(lazy_cov.size(0))
        mvn = MultivariateNormal(mean, lazy_cov)
        Y = mvn.rsample()[:, None]
    return Y
def test_ard_batch(self):
    a = torch.tensor([[[1, 2, 3], [2, 4, 3]], [[2, -1, 2], [2, -1, 0]]],
                     dtype=torch.float)
    b = torch.tensor([[[1, 4, 3]], [[2, -1, 0]]], dtype=torch.float)
    lengthscales = torch.tensor([[[1, 2, 1]]], dtype=torch.float)

    kernel = MaternKernel(nu=2.5, batch_size=2, ard_num_dims=3)
    kernel.initialize(log_lengthscale=torch.log(lengthscales))
    kernel.eval()

    # ARD-scaled distances between a and b, times sqrt(5) for nu=2.5
    dist = torch.tensor([[[1], [1]], [[2], [0]]],
                        dtype=torch.float).mul_(math.sqrt(5))
    actual = (dist**2 / 3 + dist + 1).mul(torch.exp(-dist))
    res = kernel(a, b).evaluate()
    self.assertLess(torch.norm(res - actual), 1e-3)
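# The expected values above come from the closed form of the Matern-2.5
# kernel: with r the ARD-scaled distance and d = sqrt(5) * r,
#     k(r) = (1 + d + d**2 / 3) * exp(-d).
# A small helper (hypothetical, not part of the test suite) that should
# reproduce `actual` directly from the inputs:
import math
import torch

def matern25(a, b, lengthscales):
    # pairwise ARD-scaled Euclidean distances, then the nu=2.5 closed form
    d = torch.cdist(a / lengthscales, b / lengthscales) * math.sqrt(5)
    return (d**2 / 3 + d + 1) * torch.exp(-d)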
def __init__(self, train_x, train_y, likelihood, kernel='rbf', nu=2.5):
    super(GPModel, self).__init__(train_x, train_y, likelihood)
    grid_size = gpytorch.utils.grid.choose_grid_size(train_x)
    self.mean_module = gpytorch.means.ConstantMean()
    if kernel == 'rbf':
        self.covar_module = SKI(
            ScaleKernel(RBFKernel(ard_num_dims=2)),
            grid_size=grid_size,
            num_dims=2,
        )
    elif kernel == 'matern':
        self.covar_module = SKI(
            ScaleKernel(MaternKernel(nu)),
            grid_size=grid_size,
            num_dims=2,
        )
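# `SKI` above appears to be an alias for gpytorch's GridInterpolationKernel
# (structured kernel interpolation). A minimal sketch of the same
# construction with the explicit class name, assuming 2-d inputs as in the
# snippet:
import torch
import gpytorch
from gpytorch.kernels import GridInterpolationKernel, MaternKernel, ScaleKernel

train_x = torch.rand(100, 2)
grid_size = gpytorch.utils.grid.choose_grid_size(train_x)
covar_module = GridInterpolationKernel(
    ScaleKernel(MaternKernel(nu=2.5)),
    grid_size=grid_size,
    num_dims=2,
)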
def test_ard_separate_batch(self):
    a = torch.tensor([[[1, 2, 3], [2, 4, 3]], [[2, -1, 2], [2, -1, 0]]],
                     dtype=torch.float)
    b = torch.tensor([[[1, 4, 3]], [[2, -1, 0]]],
                     dtype=torch.float).repeat(1, 2, 1)
    lengthscales = torch.tensor([[[1, 2, 1]], [[2, 1, 0.5]]],
                                dtype=torch.float)

    kernel = MaternKernel(nu=2.5, batch_size=2, ard_num_dims=3)
    kernel.initialize(log_lengthscale=torch.log(lengthscales))
    kernel.eval()

    dist = torch.tensor([[[1, 1], [1, 1]], [[4, 4], [0, 0]]],
                        dtype=torch.float).mul_(math.sqrt(5))
    actual = (dist**2 / 3 + dist + 1).mul(torch.exp(-dist))
    res = kernel(a, b).evaluate()
    self.assertLess(torch.norm(res - actual), 1e-3)

    # diag
    res = kernel(a, b).diag()
    actual = torch.cat(
        [actual[i].diag().unsqueeze(0) for i in range(actual.size(0))])
    self.assertLess(torch.norm(res - actual), 1e-5)

    # batch_dims
    dist = torch.tensor(
        [
            [[0, 0], [1, 1]],
            [[1, 1], [0, 0]],
            [[0, 0], [0, 0]],
            [[0, 0], [0, 0]],
            [[0, 0], [0, 0]],
            [[4, 4], [0, 0]],
        ],
        dtype=torch.float,
    )
    dist.mul_(math.sqrt(5))
    actual = (dist**2 / 3 + dist + 1).mul(torch.exp(-dist))
    res = kernel(a, b, batch_dims=(0, 2)).evaluate()
    self.assertLess(torch.norm(res - actual), 1e-5)

    # batch_dims + diag
    res = kernel(a, b, batch_dims=(0, 2)).diag()
    actual = torch.cat(
        [actual[i].diag().unsqueeze(0) for i in range(actual.size(0))])
    self.assertLess(torch.norm(res - actual), 1e-5)
def __init__(self, train_X, train_Y, likelihood, dim, lengthscale_constraint,
             outputscale_constraint, ard_dims):
    # squeeze output dim before passing train_Y to ExactGP
    super().__init__(train_X, train_Y, likelihood)
    self.dim = dim
    self.mean_module = ConstantMean()
    self.covar_module = ScaleKernel(
        CylindricalKernel(
            num_angular_weights=ard_dims,
            alpha_prior=KumaAlphaPrior(),
            alpha_constraint=gpytorch.constraints.constraints.Interval(
                lower_bound=0.5, upper_bound=1.),
            beta_prior=KumaBetaPrior(),
            beta_constraint=gpytorch.constraints.constraints.Interval(
                lower_bound=1., upper_bound=2.),
            radial_base_kernel=MaternKernel(
                lengthscale_constraint=lengthscale_constraint,
                ard_num_dims=1,
                nu=2.5),
            # angular_weights_constraint=gpytorch.constraints.constraints.Interval(
            #     lower_bound=np.exp(-12.), upper_bound=np.exp(20.)),
            angular_weights_prior=AngularWeightsPrior(),
        ))
    self.to(train_X)  # make sure we're on the right device/dtype
def test_ard_separate_batch(self):
    a = torch.tensor([[[1, 2, 3], [2, 4, 3]], [[2, -1, 2], [2, -1, 0]]],
                     dtype=torch.float)
    b = torch.tensor([[[1, 4, 3]], [[2, -1, 0]]],
                     dtype=torch.float).repeat(1, 2, 1)
    lengthscales = torch.tensor([[[1, 2, 1]], [[2, 1, 0.5]]],
                                dtype=torch.float)

    kernel = MaternKernel(nu=2.5, batch_shape=torch.Size([2]), ard_num_dims=3)
    kernel.initialize(lengthscale=lengthscales)
    kernel.eval()

    dist = torch.tensor([[[1, 1], [1, 1]], [[4, 4], [0, 0]]],
                        dtype=torch.float).mul_(math.sqrt(5))
    actual = (dist**2 / 3 + dist + 1).mul(torch.exp(-dist))
    res = kernel(a, b).evaluate()
    self.assertLess(torch.norm(res - actual), 1e-3)

    # diag
    res = kernel(a, b).diag()
    actual = torch.cat(
        [actual[i].diag().unsqueeze(0) for i in range(actual.size(0))])
    self.assertLess(torch.norm(res - actual), 1e-5)

    # last_dim_is_batch
    dist = torch.tensor([
        [[[0.0, 0.0], [1.0, 1.0]], [[0.0, 0.0], [0.0, 0.0]]],
        [[[1.0, 1.0], [0.0, 0.0]], [[0.0, 0.0], [0.0, 0.0]]],
        [[[0.0, 0.0], [0.0, 0.0]], [[4.0, 4.0], [0.0, 0.0]]],
    ])
    dist.mul_(math.sqrt(5))
    dist = dist.view(3, 2, 2, 2).transpose(0, 1)
    actual = (dist**2 / 3 + dist + 1).mul(torch.exp(-dist))
    res = kernel(a, b, last_dim_is_batch=True).evaluate()
    self.assertLess(torch.norm(res - actual), 1e-5)

    # last_dim_is_batch + diag
    res = kernel(a, b, last_dim_is_batch=True).diag()
    actual = actual.diagonal(dim1=-2, dim2=-1)
    self.assertLess(torch.norm(res - actual), 1e-5)
def __init__(self, input_dims, output_dims, num_inducing=300,
             inducing_points=None, mean_type="constant", Q=8):
    if inducing_points is None:
        if output_dims is None:
            # An output_dims of None implies there is only one GP in this
            # layer (e.g., the last layer for univariate regression).
            inducing_points = torch.randn(num_inducing, input_dims)
        else:
            inducing_points = torch.randn(output_dims, num_inducing,
                                          input_dims)

    # Let's use mean field / diagonal covariance structure.
    variational_distribution = MeanFieldVariationalDistribution(
        num_inducing_points=num_inducing,
        batch_shape=torch.Size([output_dims])
        if output_dims is not None else torch.Size([]),
    )

    # Standard variational inference.
    variational_strategy = VariationalStrategy(
        self,
        inducing_points,
        variational_distribution,
        learn_inducing_locations=True)

    batch_shape = torch.Size([]) if output_dims is None else torch.Size(
        [output_dims])
    super().__init__(variational_strategy, input_dims, output_dims, Q)

    if mean_type == "constant":
        self.mean_module = ConstantMean(batch_shape=batch_shape)
    elif mean_type == "linear":
        self.mean_module = LinearMean(input_dims, batch_shape=batch_shape)
    self.covar_module = ScaleKernel(MaternKernel(batch_shape=batch_shape,
                                                 ard_num_dims=input_dims),
                                    batch_shape=batch_shape,
                                    ard_num_dims=None)
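# The variational plumbing used above, shown on a plain single-output
# ApproximateGP for reference. This is a self-contained sketch (the
# `_SVGP` class is an assumption for illustration, not the original layer):
import torch
import gpytorch
from gpytorch.variational import (MeanFieldVariationalDistribution,
                                  VariationalStrategy)
from gpytorch.kernels import MaternKernel, ScaleKernel

class _SVGP(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        # diagonal (mean-field) covariance over the inducing values
        var_dist = MeanFieldVariationalDistribution(
            num_inducing_points=inducing_points.size(0))
        strategy = VariationalStrategy(self, inducing_points, var_dist,
                                       learn_inducing_locations=True)
        super().__init__(strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = ScaleKernel(
            MaternKernel(ard_num_dims=inducing_points.size(-1)))

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x))

model = _SVGP(torch.randn(300, 4))  # 300 inducing points in 4 dimensions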
def __init__(self, train_x, train_y, likelihood, var=None, latent=None,
             kernel_params=None, latent_params=None):
    super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
    if latent_params is None:
        latent_params = {'input_dim': train_x.size(-1)}
    self._set_latent_function(latent, latent_params)

    self.mean_module = ZeroMean()
    ard_num_dims = (self.latent_func.embed_dim
                    if self.latent_func.embed_dim is not None
                    else train_x.size(-1))
    kernel = kernel_params['type'] if kernel_params is not None else 'rbf'
    if kernel is None or kernel == 'rbf':
        self.kernel_covar_module = ScaleKernel(
            RBFKernel(ard_num_dims=ard_num_dims))
    elif kernel == 'matern':
        # without a scale kernel: very poor performance
        # matern 0.5, 1.5 and 2.5 all have similar performance
        self.kernel_covar_module = ScaleKernel(
            MaternKernel(nu=1.5, ard_num_dims=ard_num_dims))
    elif kernel == 'spectral_mixture':
        self.kernel_covar_module = SpectralMixtureKernel(
            num_mixtures=kernel_params['n_mixtures'],
            ard_num_dims=train_x.size(-1))
        self.kernel_covar_module.initialize_from_data(train_x, train_y)
    else:
        raise NotImplementedError

    # set covariance module
    if var is not None:
        self.noise_covar_module = WhiteNoiseKernel(var)
        self.covar_module = self.kernel_covar_module + self.noise_covar_module
    else:
        self.covar_module = self.kernel_covar_module
def initialize_model(X, Y, old_model=None, **kwargs):
    if old_model is None:
        covar_module = ScaleKernel(
            MaternKernel(
                nu=2.5,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                lengthscale_constraint=Interval(1e-4, 12.0),
            ),
            outputscale_prior=GammaPrior(2.0, 0.15),
            outputscale_constraint=Interval(1e-4, 12.0),
        )
    else:
        covar_module = old_model.covar_module

    if args.dim == 3:
        wiski_grid_size = 10
    elif args.dim == 2:
        wiski_grid_size = 30

    kernel_cache = old_model._kernel_cache if old_model is not None else None
    model_obj = OnlineSKIBotorchModel(
        X,
        Y,
        train_noise_term=noise,
        grid_bounds=bounds,
        grid_size=wiski_grid_size,
        learn_additional_noise=True,
        kernel_cache=kernel_cache,
        covar_module=covar_module,
    ).to(X)
    mll = BatchedWoodburyMarginalLogLikelihood(
        model_obj.likelihood, model_obj, clear_caches_every_iteration=True)
    # TODO: reload state dict here? (weird errors resulted)
    return model_obj, mll
def initialize_model(X, Y, old_model=None, **kwargs):
    if old_model is None:
        covar_module = ScaleKernel(
            MaternKernel(
                nu=2.5,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                lengthscale_constraint=Interval(1e-4, 12.0),
            ),
            outputscale_prior=GammaPrior(2.0, 0.15),
            outputscale_constraint=Interval(1e-4, 12.0),
        )
        if args.fixed_noise:
            model_obj = FixedNoiseGP(X, Y, train_Yvar=noise,
                                     covar_module=covar_module)
        else:
            model_obj = SingleTaskGP(X, Y, covar_module=covar_module)
    else:
        model_obj = old_model
    mll = ExactMarginalLogLikelihood(model_obj.likelihood, model_obj)
    return model_obj, mll
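# Standalone sketch of the non-fixed-noise branch above with dummy data.
# `fit_gpytorch_mll` is the name in recent botorch releases (older versions
# use `fit_gpytorch_model`); the data here is an assumption for illustration:
import torch
from botorch.models import SingleTaskGP
from botorch.fit import fit_gpytorch_mll
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.priors import GammaPrior
from gpytorch.constraints import Interval

X = torch.rand(20, 3, dtype=torch.double)
Y = X.sum(-1, keepdim=True)
covar_module = ScaleKernel(
    MaternKernel(nu=2.5,
                 lengthscale_prior=GammaPrior(3.0, 6.0),
                 lengthscale_constraint=Interval(1e-4, 12.0)),
    outputscale_prior=GammaPrior(2.0, 0.15),
    outputscale_constraint=Interval(1e-4, 12.0),
)
model = SingleTaskGP(X, Y, covar_module=covar_module)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_mll(mll)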
def __init__(self, inputs, targets, likelihood):
    # check the hardware
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # store inputs and outputs
    self.inputs = torch.from_numpy(inputs).float().to(self.device)
    self.targets = torch.from_numpy(targets).float().to(self.device)

    # initialise GP and store likelihood
    ExactGP.__init__(self, self.inputs, self.targets, likelihood)
    self.likelihood = likelihood

    # mean and covariance
    self.mean = ConstantMean()
    # self.cov = GaussianSymmetrizedKLKernel()
    self.cov = MaternKernel(ard_num_dims=2)
    self.cov = ScaleKernel(self.cov, ard_num_dims=2)
    self.cov = InducingPointKernel(self.cov, self.inputs, self.likelihood)

    # you better have a GPU!
    self.likelihood.to(self.device).float()
    self.to(self.device).float()
def __init__(self, train_x, train_y, likelihood):
    super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
    self.mean_module = gpytorch.means.ConstantMean()
    self.covar_module = ScaleKernel(
        MaternKernel(nu=2.5, ard_num_dims=train_x.shape[1]))
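# A minimal, self-contained training sketch for an exact GP like the one
# above. The `forward` method and the dummy data are illustrative
# assumptions, not part of the original snippet:
import torch
import gpytorch
from gpytorch.kernels import MaternKernel, ScaleKernel

class _SketchGP(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = ScaleKernel(
            MaternKernel(nu=2.5, ard_num_dims=train_x.shape[1]))

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x))

train_x = torch.rand(20, 2)
train_y = torch.sin(train_x.sum(-1))
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = _SketchGP(train_x, train_y, likelihood)

# standard exact-GP training loop: maximize the marginal log likelihood
model.train(); likelihood.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
for _ in range(50):
    optimizer.zero_grad()
    loss = -mll(model(train_x), train_y)
    loss.backward()
    optimizer.step()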
def create_kernel_ard(self, num_dims, **kwargs):
    return ArcKernel(base_kernel=MaternKernel(nu=0.5),
                     ard_num_dims=num_dims,
                     **kwargs)
def create_kernel_no_ard(self, **kwargs):
    return ArcKernel(base_kernel=MaternKernel(nu=0.5), **kwargs)
def main(args):
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    init_dict, train_dict, test_dict = prepare_data(args.data_loc,
                                                    args.num_init,
                                                    args.num_total,
                                                    test_is_year=False)
    init_x, init_y, init_y_var = (
        init_dict["x"].to(device),
        init_dict["y"].to(device),
        init_dict["y_var"].to(device),
    )
    train_x, train_y, train_y_var = (
        train_dict["x"].to(device),
        train_dict["y"].to(device),
        train_dict["y_var"].to(device),
    )
    test_x, test_y, test_y_var = (
        test_dict["x"].to(device),
        test_dict["y"].to(device),
        test_dict["y_var"].to(device),
    )

    model = FixedNoiseOnlineSKIGP(
        init_x,
        init_y.view(-1, 1),
        init_y_var.view(-1, 1),
        GridInterpolationKernel(
            base_kernel=ScaleKernel(
                MaternKernel(
                    ard_num_dims=2,
                    nu=0.5,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                ),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
            grid_size=30,
            num_dims=2,
            grid_bounds=torch.tensor([[0.0, 1.0], [0.0, 1.0]]),
        ),
        learn_additional_noise=False,
    ).to(device)

    mll = BatchedWoodburyMarginalLogLikelihood(model.likelihood, model)

    print("---- Fitting initial model ----")
    start = time.time()
    with skip_logdet_forward(True), max_root_decomposition_size(
            args.sketch_size), use_toeplitz(args.toeplitz):
        fit_gpytorch_torch(mll, options={"lr": 0.1, "maxiter": 1000})
    end = time.time()
    print("Elapsed fitting time: ", end - start)
    model.zero_grad()
    model.eval()

    print("--- Generating initial predictions on test set ----")
    start = time.time()
    with detach_test_caches(True), max_root_decomposition_size(
            args.sketch_size), max_cholesky_size(
                args.cholesky_size), use_toeplitz(args.toeplitz):
        pred_dist = model(test_x)
        pred_mean = pred_dist.mean.detach()
        # pred_var = pred_dist.variance.detach()
    end = time.time()
    print("Elapsed initial prediction time: ", end - start)

    rmse_initial = ((pred_mean.view(-1) - test_y.view(-1))**2).mean().sqrt()
    print("Initial RMSE: ", rmse_initial.item())

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    mll_time_list = []
    rmse_list = []
    for i in range(500, train_x.shape[0]):
        model.zero_grad()
        model.train()

        start = time.time()
        with skip_logdet_forward(True), max_root_decomposition_size(
                args.sketch_size), max_cholesky_size(
                    args.cholesky_size), use_toeplitz(args.toeplitz):
            loss = -mll(model(train_x[:i]), train_y[:i]).sum()
            loss.backward()
        mll_time = time.time() - start

        optimizer.step()
        model.zero_grad()
        optimizer.zero_grad()

        start = time.time()
        with torch.no_grad():
            model.condition_on_observations(
                train_x[i].unsqueeze(0),
                train_y[i].view(1, 1),
                train_y_var[i].view(-1, 1),
                inplace=True,
            )
        fantasy_time = time.time() - start
        mll_time_list.append([mll_time, fantasy_time])

        if i % 25 == 0:
            start = time.time()
            model.eval()
            model.zero_grad()
            with detach_test_caches(), max_root_decomposition_size(
                    args.sketch_size), max_cholesky_size(args.cholesky_size):
                pred_dist = model(test_x)
            end = time.time()

            rmse = ((pred_dist.mean -
                     test_y.view(-1))**2).mean().sqrt().item()
            rmse_list.append([rmse, end - start])
            print("Current RMSE: ", rmse)
            print("Outputscale: ",
                  model.covar_module.base_kernel.raw_outputscale)
            print("Lengthscale: ",
                  model.covar_module.base_kernel.base_kernel.raw_lengthscale)
            print("Step: ", i, "Train Loss: ", loss)
            optimizer.param_groups[0]["lr"] *= 0.9

    torch.save({"training": mll_time_list, "predictions": rmse_list},
               args.output)
def __init__(self, dim: int, train_X: Tensor, train_Y: Tensor, options: dict,
             which_type: Optional[str] = "obj") -> None:
    self.dim = dim

    if len(train_Y) == 0:  # No data case
        train_X = None
        train_Y = None
    else:
        # Error checking:
        assert train_Y.dim() == 1, "train_Y is required to be 1D"
        # Only for this check, train_Y must be 2D (this looks like a bug
        # in botorch):
        self._validate_tensor_args(X=train_X, Y=train_Y[:, None])

    print("\n")
    logger.info("### Initializing GP model for objective f(x) ###")

    # Likelihood:
    noise_std = options.hyperpars.noise_std.value
    if train_Y is not None:
        lik = FixedNoiseGaussianLikelihood(
            noise=torch.full_like(train_Y, noise_std**2))
    else:
        lik = FixedNoiseGaussianLikelihood(
            noise=torch.tensor([noise_std**2], device=device, dtype=dtype))

    # Initialize parent class:
    super().__init__(train_X, train_Y, lik)

    # Initialize hyperpriors using scipy, because gpytorch's gamma and beta
    # distributions do not have the inverse CDF. NOTE: the mean (zero) and
    # the model noise are fixed.
    hyperpriors = dict(
        lengthscales=eval(options.hyperpars.lenthscales.prior),
        outputscale=eval(options.hyperpars.outputscale.prior))

    # Index hyperparameters:
    self.idx_hyperpars = dict(lengthscales=list(range(0, self.dim)),
                              outputscale=[self.dim])
    self.dim_hyperpars = sum(
        [len(val) for val in self.idx_hyperpars.values()])

    # Get bounds:
    self.hyperpars_bounds = self._get_hyperparameters_bounds(hyperpriors)
    logger.info("hyperpars_bounds:" + str(self.hyperpars_bounds))

    # Initialize prior mean:
    # self.mean_module = ConstantMean()
    self.mean_module = ZeroMean()

    # Initialize covariance function:
    base_kernel = MaternKernel(nu=2.5,
                               ard_num_dims=self.dim,
                               lengthscale=0.1 * torch.ones(self.dim))
    self.covar_module = ScaleKernel(base_kernel=base_kernel)

    self.disp_info_scipy_opti = True
    # self.method = "L-BFGS-B"
    self.method = "LN_BOBYQA"
    # self.method = "trust-constr"

    # Get a hyperparameter sample within bounds (not the same as sampling
    # from the corresponding priors):
    hyperpars_sample = self._sample_hyperparameters_within_bounds(
        Nsamples=1).squeeze(0)
    self.covar_module.outputscale = hyperpars_sample[
        self.idx_hyperpars["outputscale"]]
    self.covar_module.base_kernel.lengthscale = hyperpars_sample[
        self.idx_hyperpars["lengthscales"]]

    # The evaluation noise is fixed and given by the user:
    self.noise_std = options.hyperpars.noise_std.value

    # Initialize the marginal log likelihood. mll_objective is callable;
    # MLLGP can internally modify the model hyperparameters, and will do so
    # throughout the optimization routine.
    self.mll_objective = MLLGP(model_gp=self,
                               likelihood_gp=self.likelihood,
                               hyperpriors=hyperpriors)

    # Define nlopt optimizer:
    self.opti_hyperpars = OptimizationNonLinear(
        dim=self.dim_hyperpars,
        fun_obj=self.mll_objective,
        algo_str=self.method,
        tol_x=1e-4,
        Neval_max_local_optis=options.hyperpars.optimization.Nmax_evals,
        bounds=self.hyperpars_bounds,
        what2optimize_str="GP hyperparameters")

    # Make sure we're on the right device/dtype:
    if train_Y is not None:
        self.to(train_X)

    self.Nrestarts = options.hyperpars.optimization.Nrestarts
    self._update_hyperparameters()
    self.eval()
def create_kernel_no_ard(self, **kwargs):
    kernel = MaternKernel(nu=0.5, **kwargs)
    kernel.initialize(lengthscale=5.0)
    return kernel
def test_random_fourier_features(self):
    # test kernel that is not Scale, RBF, or Matern
    with self.assertRaises(NotImplementedError):
        RandomFourierFeatures(
            kernel=PeriodicKernel(),
            input_dim=2,
            num_rff_features=3,
        )

    # test batched kernel
    with self.assertRaises(NotImplementedError):
        RandomFourierFeatures(
            kernel=RBFKernel(batch_shape=torch.Size([2])),
            input_dim=2,
            num_rff_features=3,
        )

    tkwargs = {"device": self.device}
    for dtype in (torch.float, torch.double):
        tkwargs["dtype"] = dtype

        # test init with a ScaleKernel
        base_kernel = RBFKernel(ard_num_dims=2)
        kernel = ScaleKernel(base_kernel).to(**tkwargs)
        rff = RandomFourierFeatures(
            kernel=kernel,
            input_dim=2,
            num_rff_features=3,
        )
        self.assertTrue(torch.equal(rff.outputscale, kernel.outputscale))
        # check that rff makes a copy
        self.assertFalse(rff.outputscale is kernel.outputscale)
        self.assertTrue(torch.equal(rff.lengthscale, base_kernel.lengthscale))
        # check that rff makes a copy
        self.assertFalse(rff.lengthscale is kernel.lengthscale)

        for sample_shape in [torch.Size(), torch.Size([5])]:
            # test init with a kernel that is not a ScaleKernel
            rff = RandomFourierFeatures(
                kernel=base_kernel,
                input_dim=2,
                num_rff_features=3,
                sample_shape=sample_shape,
            )
            self.assertTrue(
                torch.equal(rff.outputscale, torch.tensor(1, **tkwargs)))
            self.assertTrue(
                torch.equal(rff.lengthscale, base_kernel.lengthscale))
            # check that rff makes a copy
            self.assertFalse(rff.lengthscale is kernel.lengthscale)
            self.assertEqual(rff.weights.shape,
                             torch.Size([*sample_shape, 2, 3]))
            self.assertEqual(rff.bias.shape, torch.Size([*sample_shape, 3]))
            self.assertTrue(((rff.bias <= 2 * pi) & (rff.bias >= 0.0)).all())

        # test forward
        for sample_shape in [torch.Size(), torch.Size([7])]:
            rff = RandomFourierFeatures(
                kernel=kernel,
                input_dim=2,
                num_rff_features=3,
                sample_shape=sample_shape,
            )
            for input_batch_shape in [torch.Size([]), torch.Size([5])]:
                X = torch.rand(*input_batch_shape, *sample_shape, 1, 2,
                               **tkwargs)
                Y = rff(X)
                self.assertEqual(
                    Y.shape,
                    torch.Size([*input_batch_shape, *sample_shape, 1, 1]))
                _constant = torch.sqrt(2 * rff.outputscale /
                                       rff.weights.shape[-1])
                _arg_to_cos = X / base_kernel.lengthscale @ rff.weights
                _bias_expanded = rff.bias.unsqueeze(-2)
                expected_Y = _constant * torch.cos(_arg_to_cos +
                                                   _bias_expanded)
                self.assertTrue(torch.allclose(Y, expected_Y))

        # test get_weights
        for sample_shape in [torch.Size(), torch.Size([5])]:
            with mock.patch("torch.randn", wraps=torch.randn) as mock_randn:
                rff._get_weights(
                    base_kernel=base_kernel,
                    input_dim=2,
                    num_rff_features=3,
                    sample_shape=sample_shape,
                )
                mock_randn.assert_called_once_with(
                    *sample_shape,
                    2,
                    3,
                    dtype=base_kernel.lengthscale.dtype,
                    device=base_kernel.lengthscale.device,
                )

            # test get_weights with Matern kernel
            with mock.patch(
                    "torch.randn",
                    wraps=torch.randn) as mock_randn, mock.patch(
                        "torch.distributions.Gamma",
                        wraps=torch.distributions.Gamma) as mock_gamma:
                base_kernel = MaternKernel(ard_num_dims=2).to(**tkwargs)
                rff._get_weights(
                    base_kernel=base_kernel,
                    input_dim=2,
                    num_rff_features=3,
                    sample_shape=sample_shape,
                )
                mock_randn.assert_called_once_with(
                    *sample_shape,
                    2,
                    3,
                    dtype=base_kernel.lengthscale.dtype,
                    device=base_kernel.lengthscale.device,
                )
                mock_gamma.assert_called_once_with(
                    base_kernel.nu,
                    base_kernel.nu,
                )
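# The expected output checked in the forward test above is the classic
# random Fourier feature map: phi(x) = sqrt(2 * outputscale / D) *
# cos(x / lengthscale @ W + b), with W ~ N(0, I) for an RBF kernel
# (Gamma-mixed for Matern) and b ~ U[0, 2*pi], so that phi(x) @ phi(x')
# approximates k(x, x'). A self-contained numerical check, assuming unit
# lengthscale and outputscale (a sketch, not the library implementation):
import math
import torch

D = 4096                                   # number of random features
X = torch.randn(5, 2, dtype=torch.double)
W = torch.randn(2, D, dtype=torch.double)  # RBF spectral samples
b = 2 * math.pi * torch.rand(D, dtype=torch.double)
phi = math.sqrt(2.0 / D) * torch.cos(X @ W + b)
K_rff = phi @ phi.T                        # ~= exp(-||x - x'||^2 / 2)
K_true = torch.exp(-torch.cdist(X, X)**2 / 2)
assert (K_rff - K_true).abs().max() < 0.1  # Monte Carlo accuracy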
def __init__(self, dim: int, train_x: Tensor, train_yl: Tensor, options):
    """
    train_x: A `batch_shape x n x d` tensor of training features.
    train_yl: A `batch_shape x n x m` tensor of training observations.
    """
    # Initialize parent class; torch.nn.Module (a parent of GPyTorchModel)
    # requires this:
    super().__init__()

    print("\n")
    logger.info("### Initializing GPCR model for constraint g(x) ###")

    self.discard_too_close_points = options.discard_too_close_points

    self.dim = dim
    assert self.dim == train_x.shape[1], \
        "The input dimension must agree with train_x"

    self.train_x = torch.tensor([], device=device, dtype=dtype,
                                requires_grad=False)
    self.train_yl = torch.tensor([], device=device, dtype=dtype,
                                 requires_grad=False)
    self.update_XY(train_x, train_yl)

    # One output:
    self._validate_tensor_args(X=self.train_xs, Y=self.train_ys.view(-1, 1))
    self._set_dimensions(train_X=self.train_xs,
                         train_Y=self.train_ys.view(-1, 1))

    # Initialize hyperpriors using scipy, because gpytorch's gamma and beta
    # distributions do not have the inverse CDF:
    hyperpriors = dict(
        lengthscales=eval(options.hyperpars.lenthscales.prior),
        outputscale=eval(options.hyperpars.outputscale.prior),
        threshold=eval(options.hyperpars.threshold.prior))

    # Index hyperparameters:
    self.idx_hyperpars = dict(lengthscales=list(range(0, self.dim)),
                              outputscale=[self.dim],
                              threshold=[self.dim + 1])
    self.dim_hyperpars = sum(
        [len(val) for val in self.idx_hyperpars.values()])

    # Get bounds:
    self.hyperpars_bounds = self._get_hyperparameters_bounds(hyperpriors)
    logger.info("hyperpars_bounds:" + str(self.hyperpars_bounds))

    # Define mean and covariance modules with dummy hyperparameters:
    self.mean_module = ZeroMean()
    self.covar_module = ScaleKernel(
        base_kernel=MaternKernel(nu=2.5,
                                 ard_num_dims=self.dim,
                                 lengthscale=0.1 * torch.ones(self.dim)),
        outputscale=10.0)

    # If non-zero mean, a constant mean is assumed:
    if "constant" in dir(self.mean_module):
        self.__threshold = self.mean_module.constant
        self.thres_init = self.mean_module.constant
    else:
        self.__threshold = options.hyperpars.threshold.init
        self.thres_init = options.hyperpars.threshold.init

    # Get a hyperparameter sample within bounds (not the same as sampling
    # from the corresponding priors):
    hyperpars_sample = self._sample_hyperparameters_within_bounds(
        Nsamples=1).squeeze(0)
    self.covar_module.outputscale = hyperpars_sample[
        self.idx_hyperpars["outputscale"]]
    print("self.covar_module.outputscale:",
          str(self.covar_module.outputscale))
    self.covar_module.base_kernel.lengthscale = hyperpars_sample[
        self.idx_hyperpars["lengthscales"]]
    self.threshold = hyperpars_sample[self.idx_hyperpars["threshold"]]

    # The evaluation noise is fixed and given by the user:
    self.noise_std = options.hyperpars.noise_std.value

    self.gauss_tools = GaussianTools()

    # Initialize EP:
    self.ep = ExpectationPropagation(
        prior_mean=self.mean_module(train_x).cpu().detach().numpy(),
        prior_cov=self.covar_module(train_x).cpu().detach().numpy(),
        Maxiter=options.ep.maxiter,
        required_precission=options.ep.prec,
        verbosity=options.ep.verbo)

    # Initialize the marginal log likelihood for the GPCR model.
    # mll_objective is callable; MLLGPCR can internally modify the model
    # hyperparameters, and will do so throughout the optimization routine.
    self.mll_objective = MLLGPCR(model_gpcr=self, hyperpriors=hyperpriors)

    # Define nlopt optimizer:
    self.opti = OptimizationNonLinear(
        dim=self.dim_hyperpars,
        fun_obj=self.mll_objective,
        algo_str=options.hyperpars.optimization.algo_name,
        tol_x=1e-3,
        Neval_max_local_optis=options.hyperpars.optimization.Nmax_evals,
        bounds=self.hyperpars_bounds,
        what2optimize_str="GPCR hyperparameters")

    # Extra parameters:
    self.top_dist_ambiguous_points = 0.5 * torch.min(
        self.covar_module.base_kernel.lengthscale).item()
    self.factor_heteroscedastic_noise = 10**4

    # Update hyperparameters:
    self.Nrestarts_hyperpars = options.hyperpars.optimization.Nrestarts
    self._update_hyperparameters(Nrestarts=self.Nrestarts_hyperpars)

    self.likelihood = None
def create_kernel_no_ard(self, **kwargs):
    return CylindricalKernel(5, MaternKernel(nu=2.5), **kwargs)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    likelihood: Optional[Likelihood] = None,
    covar_modules: Optional[List[Kernel]] = None,
    num_latent_dims: Optional[List[int]] = None,
    learn_latent_pars: bool = True,
    latent_init: str = "default",
    outcome_transform: Optional[OutcomeTransform] = None,
    input_transform: Optional[InputTransform] = None,
):
    r"""A HigherOrderGP model for high-dim output regression.

    Args:
        train_X: A `batch_shape x n x d`-dim tensor of training inputs.
        train_Y: A `batch_shape x n x output_shape`-dim tensor of training targets.
        likelihood: Gaussian likelihood for the model.
        covar_modules: List of kernels for each output structure.
        num_latent_dims: Sizes for the latent dimensions.
        learn_latent_pars: If true, learn the latent parameters.
        latent_init: [default or gp] how to initialize the latent parameters.
    """
    if input_transform is not None:
        input_transform.to(train_X)

    # infer the dimension of `output_shape`.
    num_output_dims = train_Y.dim() - train_X.dim() + 1
    batch_shape = train_X.shape[:-2]
    if len(batch_shape) > 1:
        raise NotImplementedError(
            "HigherOrderGP currently only supports 1-dim `batch_shape`.")

    if outcome_transform is not None:
        if isinstance(outcome_transform, Standardize) and not isinstance(
                outcome_transform, FlattenedStandardize):
            warnings.warn(
                "HigherOrderGP does not support the outcome_transform "
                "`Standardize`! Using `FlattenedStandardize` with "
                f"`output_shape={train_Y.shape[-num_output_dims:]} and "
                f"batch_shape={batch_shape} instead.",
                RuntimeWarning,
            )
            outcome_transform = FlattenedStandardize(
                output_shape=train_Y.shape[-num_output_dims:],
                batch_shape=batch_shape,
            )
        train_Y, _ = outcome_transform(train_Y)

    self._aug_batch_shape = batch_shape
    self._num_dimensions = num_output_dims + 1
    self._num_outputs = train_Y.shape[0] if batch_shape else 1
    self.target_shape = train_Y.shape[-num_output_dims:]
    self._input_batch_shape = batch_shape

    if likelihood is None:
        noise_prior = GammaPrior(1.1, 0.05)
        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
        likelihood = GaussianLikelihood(
            noise_prior=noise_prior,
            batch_shape=self._aug_batch_shape,
            noise_constraint=GreaterThan(
                MIN_INFERRED_NOISE_LEVEL,
                transform=None,
                initial_value=noise_prior_mode,
            ),
        )
    else:
        self._is_custom_likelihood = True

    super().__init__(
        train_X,
        train_Y.view(*self._aug_batch_shape, -1),
        likelihood=likelihood,
    )

    if covar_modules is not None:
        self.covar_modules = ModuleList(covar_modules)
    else:
        self.covar_modules = ModuleList([
            MaternKernel(
                nu=2.5,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                batch_shape=self._aug_batch_shape,
                ard_num_dims=1 if dim > 0 else train_X.shape[-1],
            ) for dim in range(self._num_dimensions)
        ])

    if num_latent_dims is None:
        num_latent_dims = [1] * (self._num_dimensions - 1)

    self.to(train_X.device)
    self._initialize_latents(
        latent_init=latent_init,
        num_latent_dims=num_latent_dims,
        learn_latent_pars=learn_latent_pars,
        device=train_Y.device,
        dtype=train_Y.dtype,
    )

    if outcome_transform is not None:
        self.outcome_transform = outcome_transform
    if input_transform is not None:
        self.input_transform = input_transform
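# Hypothetical usage of the constructor above (the class is exported as
# botorch.models.HigherOrderGP in recent botorch; the shapes here are
# illustrative assumptions):
import torch
from botorch.models import HigherOrderGP

train_X = torch.rand(10, 3, dtype=torch.double)     # n x d inputs
train_Y = torch.rand(10, 4, 5, dtype=torch.double)  # n x (4 x 5) outputs
model = HigherOrderGP(train_X, train_Y)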
def create_kernel_ard(self, num_dims, **kwargs):
    kernel = MaternKernel(nu=0.5, ard_num_dims=num_dims, **kwargs)
    kernel.initialize(lengthscale=5.0)
    return kernel
def __init__(self, X, y, likelihood, gpu=False, nu=2.5,
             lengthscale_prior=None, outputscale_prior=None):
    """
    Parameters
    ----------
    X : torch.tensor
        Training domain values.
    y : torch.tensor
        Training response values.
    likelihood : gpytorch.likelihoods
        Model likelihood.
    gpu : bool
        Use GPUs (if available) to run gaussian process computations.
    nu : float
        Matern kernel parameter. Options: 0.5, 1.5, 2.5.
    lengthscale_prior : [gpytorch.priors, init_value]
        GPyTorch prior object and initial value. Sets a prior over
        length scales.
    outputscale_prior : [gpytorch.priors, init_value]
        GPyTorch prior object and initial value. Sets a prior over
        output scales.
    """
    super(gp_model, self).__init__(X, y, likelihood)

    # ARD
    num_dims = len(X) if len(X) == 0 else len(X[0])

    # Base kernel
    if lengthscale_prior is None:
        kernel = MaternKernel(nu=nu, ard_num_dims=num_dims)
    else:
        kernel = MaternKernel(nu=nu, ard_num_dims=num_dims,
                              lengthscale_prior=lengthscale_prior[0])

    # Mean
    self.mean_module = ConstantMean()

    # Output scale
    if outputscale_prior is None:
        self.covar_module = ScaleKernel(kernel)
    else:
        self.covar_module = ScaleKernel(
            kernel, outputscale_prior=outputscale_prior[0])

    # Set initial values
    if lengthscale_prior is not None:
        try:
            ls_init = to_torch(lengthscale_prior[1], gpu=gpu)
            self.covar_module.base_kernel.lengthscale = ls_init
        except Exception:
            uniform = to_torch(lengthscale_prior[1], gpu=gpu)
            ls_init = torch.ones(num_dims) * uniform
            self.covar_module.base_kernel.lengthscale = ls_init
    if outputscale_prior is not None:
        os_init = to_torch(outputscale_prior[1], gpu=gpu)
        self.covar_module.outputscale = os_init
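# Hypothetical construction of the `[prior, init_value]` pairs this class
# expects; the concrete priors and initial values here are assumptions for
# illustration, not defaults from the original code:
from gpytorch.priors import GammaPrior

lengthscale_prior = [GammaPrior(3.0, 6.0), 1.0]   # prior, initial value
outputscale_prior = [GammaPrior(2.0, 0.15), 4.0]
# model = gp_model(X, y, likelihood, nu=2.5,
#                  lengthscale_prior=lengthscale_prior,
#                  outputscale_prior=outputscale_prior)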
def create_kernel_no_ard(self, **kwargs):
    return MaternKernel(nu=1.5, **kwargs)
def create_bl_model(data, y):
    kernel = ScaleKernel(MaternKernel())
    model = ExactGPModel(data, y, GaussianLikelihood(), kernel)
    return model
def main(args):
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    init_dict, train_dict, test_dict = prepare_data(
        args.data_loc,
        args.num_init,
        args.num_total,
        test_is_year=False,
        seed=args.seed,
    )
    init_x, init_y, init_y_var = (
        init_dict["x"].to(device),
        init_dict["y"].to(device),
        init_dict["y_var"].to(device),
    )
    train_x, train_y, train_y_var = (
        train_dict["x"].to(device),
        train_dict["y"].to(device),
        train_dict["y_var"].to(device),
    )
    test_x, test_y, test_y_var = (
        test_dict["x"].to(device),
        test_dict["y"].to(device),
        test_dict["y_var"].to(device),
    )

    if args.model == "wiski":
        model = FixedNoiseOnlineSKIGP(
            init_x,
            init_y.view(-1, 1),
            init_y_var.view(-1, 1),
            GridInterpolationKernel(
                base_kernel=ScaleKernel(
                    MaternKernel(
                        ard_num_dims=2,
                        nu=0.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                ),
                grid_size=30,
                num_dims=2,
                grid_bounds=torch.tensor([[0.0, 1.0], [0.0, 1.0]]),
            ),
            learn_additional_noise=False,
        ).to(device)

        mll_type = lambda x, y: BatchedWoodburyMarginalLogLikelihood(
            x, y, clear_caches_every_iteration=True)
    elif args.model == "exact":
        model = FixedNoiseGP(
            init_x,
            init_y.view(-1, 1),
            init_y_var.view(-1, 1),
            ScaleKernel(
                MaternKernel(
                    ard_num_dims=2,
                    nu=0.5,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                ),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
        ).to(device)
        mll_type = ExactMarginalLogLikelihood

    mll = mll_type(model.likelihood, model)

    print("---- Fitting initial model ----")
    start = time.time()
    model.train()
    model.zero_grad()
    # with max_cholesky_size(args.cholesky_size), skip_logdet_forward(True), \
    #         use_toeplitz(args.toeplitz), \
    #         max_root_decomposition_size(args.sketch_size):
    fit_gpytorch_torch(mll, options={"lr": 0.1, "maxiter": 1000})
    end = time.time()
    print("Elapsed fitting time: ", end - start)
    print("Named parameters: ", list(model.named_parameters()))

    print("--- Now computing initial RMSE")
    model.eval()
    with gpytorch.settings.skip_posterior_variances(True):
        test_pred = model(test_x)
        pred_rmse = ((test_pred.mean - test_y)**2).mean().sqrt()
    print("---- Initial RMSE: ", pred_rmse.item())

    all_outputs = []
    start_ind = init_x.shape[0]
    end_ind = int(start_ind + args.batch_size)
    for step in range(args.num_steps):
        if step > 0 and step % 25 == 0:
            print("Beginning step ", step)

        total_time_step_start = time.time()

        if step > 0:
            print("---- Fitting model ----")
            start = time.time()
            model.train()
            model.zero_grad()
            mll = mll_type(model.likelihood, model)
            # with skip_logdet_forward(True), \
            #         max_root_decomposition_size(args.sketch_size), \
            #         max_cholesky_size(args.cholesky_size), \
            #         use_toeplitz(args.toeplitz):
            fit_gpytorch_torch(mll,
                               options={
                                   "lr": 0.01 * (0.99**step),
                                   "maxiter": 300
                               })
            model.zero_grad()
            end = time.time()
            print("Elapsed fitting time: ", end - start)
            print("Named parameters: ", list(model.named_parameters()))

        if not args.random:
            if args.model == "wiski":
                botorch_model = OnlineSKIBotorchModel(model=model)
            else:
                botorch_model = model
            # qmc_sampler = SobolQMCNormalSampler(num_samples=4)

            bounds = torch.stack([torch.zeros(2), torch.ones(2)]).to(device)
            qnipv = qNIPV(
                model=botorch_model,
                mc_points=test_x,
                # sampler=qmc_sampler,
            )

            # with use_toeplitz(args.toeplitz), root_pred_var(True), \
            #         fast_pred_var(True):
            candidates, acq_value = optimize_acqf(
                acq_function=qnipv,
                bounds=bounds,
                q=args.batch_size,
                num_restarts=1,
                raw_samples=10,  # used for initialization heuristic
                options={"batch_limit": 5, "maxiter": 200},
            )
        else:
            candidates = torch.rand(args.batch_size, train_x.shape[-1],
                                    device=device, dtype=train_x.dtype)
            acq_value = torch.zeros(1)

        model.eval()
        _ = model(test_x[:10])  # to init caches

        print("---- Finished optimizing; now querying dataset ---- ")
        with torch.no_grad():
            covar_dists = model.covar_module(candidates, train_x)
            nearest_points = covar_dists.evaluate().argmax(dim=-1)
            new_x = train_x[nearest_points]
            new_y = train_y[nearest_points]
            new_y_var = train_y_var[nearest_points]

            todrop = torch.tensor(
                [x in nearest_points for x in range(train_x.shape[0])])
            train_x, train_y, train_y_var = (train_x[~todrop],
                                             train_y[~todrop],
                                             train_y_var[~todrop])
            print("New train_x shape", train_x.shape)

        print("--- Now updating model with simulator ----")
        model = model.condition_on_observations(X=new_x,
                                                Y=new_y.view(-1, 1),
                                                noise=new_y_var.view(-1, 1))

        print("--- Now computing updated RMSE")
        model.eval()
        # with gpytorch.settings.fast_pred_var(True), \
        #         detach_test_caches(True), \
        #         max_root_decomposition_size(args.sketch_size), \
        #         max_cholesky_size(args.cholesky_size), \
        #         use_toeplitz(args.toeplitz), root_pred_var(True):
        test_pred = model(test_x)
        pred_rmse = ((test_pred.mean.view(-1) -
                      test_y.view(-1))**2).mean().sqrt()
        pred_avg_variance = test_pred.variance.mean()

        total_time_step_elapsed_time = time.time() - total_time_step_start
        step_output_list = [
            total_time_step_elapsed_time,
            acq_value.item(),
            pred_rmse.item(),
            pred_avg_variance.item(),
        ]
        print("Step RMSE: ", pred_rmse)
        all_outputs.append(step_output_list)

        start_ind = end_ind
        end_ind = int(end_ind + args.batch_size)

    output_dict = {
        "model_state_dict": model.cpu().state_dict(),
        "queried_points": {
            'x': model.cpu().train_inputs[0],
            'y': model.cpu().train_targets,
        },
        "results": DataFrame(all_outputs),
    }
    torch.save(output_dict, args.output)
def create_kernel_ard(self, num_dims, **kwargs):
    return MaternKernel(nu=1.5, ard_num_dims=num_dims, **kwargs)