def __init__(self, train_x, train_y, likelihood):
    super(MultitaskGPModel, self).__init__(train_x, train_y, likelihood)
    self.mean_module = MultitaskMean(ConstantMean(), num_tasks=2)
    # Note: the original `self_covar_module` was a typo that silently created a
    # local variable; it is an attribute here.
    self.data_covar_module = RBFKernel()
    self.covar_module = MultitaskKernel(self.data_covar_module, num_tasks=2, rank=2)
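# A minimal sketch, assuming the standard GPyTorch multitask setup, of the
# forward pass that typically accompanies the __init__ above; this method is
# an illustration, not part of the original snippet.
def forward(self, x):
    mean_x = self.mean_module(x)
    covar_x = self.covar_module(x)
    return MultitaskMultivariateNormal(mean_x, covar_x)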
def create_kernel_ard(self, num_dims, **kwargs):
    return NewtonGirardAdditiveKernel(RBFKernel(ard_num_dims=num_dims), num_dims, 2, **kwargs)
def create_kernel_ard(self, num_dims, **kwargs):
    return RBFKernel(ard_num_dims=num_dims, **kwargs)
def test_solve(self):
    size = 100
    # 2 x 2 x 3 batch of 1D inputs with different ranges
    bounds = [1, 0.5, 0.25, 1.25, 1.5, 1, 0.5, 0.25, 1.25, 1.25, 1.5, 1]
    train_x = torch.stack([torch.linspace(0, b, size) for b in bounds], 0).unsqueeze(-1)
    covar_matrix = RBFKernel()(train_x, train_x).evaluate().view(2, 2, 3, size, size)
    piv_chol = pivoted_cholesky.pivoted_cholesky(covar_matrix, 10)
    woodbury_factor = pivoted_cholesky.woodbury_factor(piv_chol, torch.ones(2, 2, 3, 100))

    rhs_vector = torch.randn(2, 2, 3, 100, 5)
    shifted_covar_matrix = covar_matrix + torch.eye(size)
    # Dense reference solve, one batch entry at a time
    real_solve = torch.cat(
        [
            shifted_covar_matrix[i, j, k].inverse().matmul(rhs_vector[i, j, k]).unsqueeze(0)
            for i in range(2)
            for j in range(2)
            for k in range(3)
        ],
        0,
    ).view_as(rhs_vector)
    approx_solve = pivoted_cholesky.woodbury_solve(rhs_vector, piv_chol, woodbury_factor,
                                                   torch.ones(2, 3, 100))
    self.assertTrue(approx_equal(approx_solve, real_solve, 2e-4))
def gpnet_nonconj(args, dataloader, test_x, prior_gp):
    N = len(dataloader.dataset)
    x_dim = 1
    prior_gp.train()
    if args.net == 'tangent':
        kernel = prior_gp.covar_module
        bnn_prev = FirstOrder([x_dim] + [args.n_hidden] * args.n_layer, mvn=False)
        bnn = FirstOrder([x_dim] + [args.n_hidden] * args.n_layer, mvn=True)
    elif args.net == 'deep':
        kernel = prior_gp.covar_module
        bnn_prev = DeepKernel([x_dim] + [args.n_hidden] * args.n_layer, mvn=False)
        bnn = DeepKernel([x_dim] + [args.n_hidden] * args.n_layer, mvn=True)
    elif args.net == 'rf':
        kernel = ScaleKernel(RBFKernel())
        kernel_prev = ScaleKernel(RBFKernel())
        bnn_prev = RFExpansion(x_dim, args.n_hidden, kernel_prev, mvn=False,
                               fix_ls=args.fix_rf_ls, residual=args.residual)
        bnn = RFExpansion(x_dim, args.n_hidden, kernel,
                          fix_ls=args.fix_rf_ls, residual=args.residual)
        bnn_prev.load_state_dict(bnn.state_dict())
    else:
        raise NotImplementedError('Unknown inference net')

    infer_gpnet_optimizer = optim.Adam(bnn.parameters(), lr=args.learning_rate)
    hyper_opt_optimizer = optim.Adam(prior_gp.parameters(), lr=args.hyper_rate)
    x_min, x_max = dataloader.dataset.range

    bnn.train()
    bnn_prev.train()
    mb = master_bar(range(1, args.n_iters + 1))
    for t in mb:
        beta = args.beta0 * 1. / (1. + args.gamma * math.sqrt(t - 1))
        dl_bar = progress_bar(dataloader, parent=mb)
        for x, y in dl_bar:
            n = x.size(0)
            x_star = torch.Tensor(args.measurement_size, x_dim).uniform_(x_min, x_max)
            xx = torch.cat([x, x_star], 0)

            # inference net
            infer_gpnet_optimizer.zero_grad()
            hyper_opt_optimizer.zero_grad()
            qff = bnn(xx)
            qff_mean_prev, K_prox = bnn_prev(xx)
            qf_mean, qf_var = bnn(x, full_cov=False)

            # Eq.(8)
            K_prior = kernel(xx, xx).add_jitter(1e-6)
            pff = MultivariateNormal(torch.zeros(xx.size(0)), K_prior)
            f_term = torch.sum(
                expected_log_prob(prior_gp.likelihood, qf_mean, qf_var, y.squeeze(-1)))
            f_term *= N / x.size(0) * beta
            prior_term = -beta * cross_entropy(qff, pff)
            qff_prev = MultivariateNormal(qff_mean_prev, K_prox)
            prox_term = -(1 - beta) * cross_entropy(qff, qff_prev)
            entropy_term = entropy(qff)
            lower_bound = f_term + prior_term + prox_term + entropy_term
            loss = -lower_bound / n
            loss.backward(retain_graph=True)
            infer_gpnet_optimizer.step()

            # Hyper-parameter update
            Kn_prior = K_prior[:n, :n]
            pf = MultivariateNormal(torch.zeros(n), Kn_prior)
            Kn_prox = K_prox[:n, :n]
            qf_prev_mean = qff_mean_prev[:n]
            qf_prev_var = torch.diagonal(Kn_prox)
            qf_prev = MultivariateNormal(qf_prev_mean, Kn_prox)
            hyper_obj = expected_log_prob(
                prior_gp.likelihood, qf_prev_mean, qf_prev_var,
                y.squeeze(-1)).sum() - kl_div(qf_prev, pf)
            hyper_obj = -hyper_obj
            hyper_obj.backward()
            hyper_opt_optimizer.step()

        bnn_prev.load_state_dict(bnn.state_dict())
        if args.net == 'rf':
            kernel_prev.load_state_dict(kernel.state_dict())
        if t % 50 == 0:
            mb.write("Iter {}/{}, kl_obj = {:.4f}, noise = {:.4f}".format(
                t, args.n_iters, lower_bound.item(), prior_gp.likelihood.noise.item()))

    test_x = test_x.to(args.device)
    test_stats = evaluate(bnn, prior_gp.likelihood, test_x, args.net == 'tangent')
    return test_stats
def create_kernel_ard(self, num_dims, **kwargs):
    base_kernel = RBFKernel(ard_num_dims=num_dims)
    kernel = ScaleKernel(base_kernel, **kwargs)
    return kernel
def foo_kp_toeplitz_gp_marginal_log_likelihood_backward():
    x = torch.cat([Variable(torch.linspace(0, 1, 2)).unsqueeze(1)] * 3, 1)
    y = Variable(torch.randn(2), requires_grad=True)
    rbf_module = RBFKernel()
    rbf_module.initialize(log_lengthscale=-2)
    covar_module = GridInterpolationKernel(rbf_module)
    covar_module.eval()
    covar_module.initialize_interpolation_grid(5, [(0, 1), (0, 1), (0, 1)])
    kronecker_var = covar_module.forward(x, x)

    cs = Variable(torch.zeros(3, 5), requires_grad=True)
    J_lefts = []
    C_lefts = []
    J_rights = []
    C_rights = []
    Ts = []
    for i in range(3):
        covar_x = covar_module.forward(x[:, i].unsqueeze(1), x[:, i].unsqueeze(1))
        cs.data[i] = covar_x.c.data
        J_lefts.append(covar_x.J_left)
        C_lefts.append(covar_x.C_left)
        J_rights.append(covar_x.J_right)
        C_rights.append(covar_x.C_right)
        T = Variable(torch.zeros(len(cs[i].data), len(cs[i].data)))
        for k in range(len(cs[i].data)):
            for j in range(len(cs[i].data)):
                T[k, j] = utils.toeplitz.toeplitz_getitem(cs[i], cs[i], k, j)
        Ts.append(T)

    W_left = list_of_indices_and_values_to_sparse(J_lefts, C_lefts, cs)
    W_right = list_of_indices_and_values_to_sparse(J_rights, C_rights, cs)
    W_left_dense = Variable(W_left.to_dense())
    W_right_dense = Variable(W_right.to_dense())
    K = kronecker_product(Ts)
    WKW = W_left_dense.matmul(K.matmul(W_right_dense.t()))
    quad_form_actual = y.dot(WKW.inverse().matmul(y))
    log_det_actual = _det(WKW).log()

    actual_nll = -0.5 * (log_det_actual + quad_form_actual + math.log(2 * math.pi) * len(y))
    actual_nll.backward()
    actual_cs_grad = cs.grad.data.clone()
    actual_y_grad = y.grad.data.clone()

    y.grad.data.fill_(0)
    cs.grad.data.fill_(0)
    kronecker_var = gpytorch.lazy.KroneckerProductLazyVariable(
        cs, kronecker_var.J_lefts, kronecker_var.C_lefts,
        kronecker_var.J_rights, kronecker_var.C_rights)
    gpytorch.functions.num_trace_samples = 100
    res = kronecker_var.exact_gp_marginal_log_likelihood(y)
    res.backward()
    res_cs_grad = cs.grad.data
    res_y_grad = y.grad.data
    assert (actual_cs_grad - res_cs_grad).norm() / res_cs_grad.norm() < 0.05
    assert (actual_y_grad - res_y_grad).norm() / res_y_grad.norm() < 1e-3

    y.grad.data.fill_(0)
    cs.grad.data.fill_(0)
    gpytorch.functions.fastest = False
    res = kronecker_var.exact_gp_marginal_log_likelihood(y)
    res.backward()
    res_cs_grad = cs.grad.data
    res_y_grad = y.grad.data
    assert (actual_cs_grad - res_cs_grad).norm() / res_cs_grad.norm() < 1e-3
    assert (actual_y_grad - res_y_grad).norm() / res_y_grad.norm() < 1e-3
def __init__(self, input_dims, output_dims, num_inducing=128, mean_type='constant'):
    # FOR VARIATIONAL INFERENCE: CREATE INDUCING POINTS DRAWN FROM N(0,1)
    if output_dims is None:
        print("num_inducing:", num_inducing)
        print("input_dims:", input_dims)
        inducing_points = torch.randn(num_inducing, input_dims)
        batch_shape = torch.Size([])
    else:
        inducing_points = torch.randn(output_dims, num_inducing, input_dims)
        batch_shape = torch.Size([output_dims])

    # INITIALIZE VARIATIONAL DISTRIBUTION
    # The distribution used to approximate the true posterior. The Cholesky variant
    # has a full mean vector of size num_inducing and a full covariance matrix of
    # size num_inducing x num_inducing; both are learned during training.
    variational_distribution = CholeskyVariationalDistribution(
        num_inducing_points=num_inducing, batch_shape=batch_shape)

    # INITIALIZE VARIATIONAL STRATEGY
    # Variational strategy wrapper for the variational distribution above.
    variational_strategy = VariationalStrategy(
        self, inducing_points, variational_distribution, learn_inducing_locations=True)

    # Call GPyTorch's DeepGPLayer to initialize the actual DGP layer.
    super(DGPHiddenLayer, self).__init__(variational_strategy, input_dims, output_dims)

    # INITIALIZE MEAN
    # The mean module to be used; a GP mean is often taken to be constant.
    if mean_type == 'constant':
        self.mean_module = ConstantMean(batch_shape=batch_shape)  # batch_shape so it knows the dimensions
    else:  # (if 'linear')
        self.mean_module = LinearMean(input_dims)

    # INITIALIZE KERNEL
    # RBF has no scaling, so wrap it in a ScaleKernel with constant k, i.e.
    # kernel = k * kernel_rbf. Constraints and priors can be set on the parameters
    # as well. It is probably a good idea to set a prior, since we normalize the
    # data and can form a prior belief from the appearance of the training data.
    # The question is what to set them to. One option is to leave them free at
    # first, note which lengthscales turn out well, and then constrain to those
    # for faster convergence in future training.
    # lengthscale_constraint = gpytorch.constraints.Interval(0.0001, 10.0)  # needs to be floats
    lengthscale_prior = gpytorch.priors.NormalPrior(0.5, 3.0)
    lengthscale_constraint = None
    # lengthscale_prior = None
    self.covar_module = ScaleKernel(
        RBFKernel(
            batch_shape=batch_shape,  # separate lengthscale for each eventual batch
            ard_num_dims=input_dims,
            # active_dims=(0),  # input dims to compute covariance for; tuple of ints indexing dimensions
            lengthscale_constraint=lengthscale_constraint,
            lengthscale_prior=lengthscale_prior),
        batch_shape=batch_shape,  # for ScaleKernel
        ard_num_dims=None)  # for ScaleKernel
def test_solve(self):
    size = 100
    # 2 x 2 x 3 batch of 1D inputs with different ranges
    bounds = [1, 0.5, 0.25, 1.25, 1.5, 1, 0.5, 0.25, 1.25, 1.25, 1.5, 1]
    train_x = torch.stack([torch.linspace(0, b, size) for b in bounds], 0).unsqueeze(-1)
    covar_matrix = RBFKernel()(train_x, train_x).evaluate().view(2, 2, 3, size, size)
    piv_chol = pivoted_cholesky.pivoted_cholesky(covar_matrix, 10)
    woodbury_factor, inv_scale, logdet = woodbury.woodbury_factor(
        piv_chol, piv_chol, torch.ones(2, 2, 3, 100), logdet=True)
    actual_logdet = torch.stack([
        mat.logdet()
        for mat in (piv_chol @ piv_chol.transpose(-1, -2) + torch.eye(100)).view(-1, 100, 100)
    ], 0).view(2, 2, 3)
    self.assertTrue(approx_equal(logdet, actual_logdet, 2e-4))

    rhs_vector = torch.randn(2, 2, 3, 100, 5)
    shifted_covar_matrix = covar_matrix + torch.eye(size)
    # Dense reference solve, one batch entry at a time
    real_solve = torch.cat(
        [
            shifted_covar_matrix[i, j, k].inverse().matmul(rhs_vector[i, j, k]).unsqueeze(0)
            for i in range(2)
            for j in range(2)
            for k in range(3)
        ],
        0,
    ).view_as(rhs_vector)
    scaled_inv_diag = (inv_scale / torch.ones(2, 3, 100)).unsqueeze(-1)
    approx_solve = woodbury.woodbury_solve(rhs_vector, piv_chol * scaled_inv_diag,
                                           woodbury_factor, scaled_inv_diag, inv_scale)
    self.assertTrue(approx_equal(approx_solve, real_solve, 2e-4))
def test_online_train_mll_backprop(self):
    """Test consecutive observe-train-observe-train patterns."""
    r_lik = GaussianLikelihood()
    r_kernel = GridInterpolationKernelWithFantasy(
        RBFKernel(), grid_size=self.grid_size, grid_bounds=[(-4.0, 14.0)]).double()
    r_model = RegularExactGP(self.xs, self.labels, r_lik, r_kernel, ZeroMean())

    lik = GaussianLikelihood()
    kernel = GridInterpolationKernelWithFantasy(
        RBFKernel(), grid_size=self.grid_size, grid_bounds=[(-4.0, 14.0)]).double()
    model = OnlineWoodburyGP(self.xs, self.labels, lik, kernel, ZeroMean())

    def observe_and_update(r_model, model, lengthscale, noise_var, xs, ys, set_online=False):
        r_model.covar_module.base_kernel.lengthscale = lengthscale
        if set_online:
            model.covar_module.base_kernel.lengthscale = lengthscale
        r_model.likelihood.noise = noise_var
        if set_online:
            model.likelihood.noise = noise_var

        r_model.eval()
        r_model(self.new_points)
        r_model = r_model.get_fantasy_model(xs, ys)
        r_model.train()
        r_optim = torch.optim.SGD(r_model.parameters(), self.lr)

        model.eval()
        model(self.new_points)
        model = model.get_online_model(xs, ys)
        model.train()
        optim = torch.optim.SGD(model.parameters(), self.lr)

        with gpytorch.settings.fast_computations(), \
                gpytorch.settings.max_cholesky_size(1), \
                gpytorch.settings.skip_logdet_forward():
            r_mll = ExactMarginalLogLikelihood(r_model.likelihood, r_model)
            r_train_output = r_model(r_model.train_inputs[0])
            r_mll_val = r_mll(r_train_output, r_model.train_targets)

            mll = WoodburyExactMarginalLogLikelihood(lik, model)
            train_output = model(model.train_inputs[0])
            mll_val = mll(train_output, model.train_targets)

            np.testing.assert_allclose(r_mll_val.item(), mll_val.item(), rtol=1e-4)

            loss = -mll_val
            loss.backward()
            r_loss = -r_mll_val
            r_loss.backward()

            print("online ls grad", model.covar_module.base_kernel.raw_lengthscale.grad.item())
            print("ski ls grad", r_model.covar_module.base_kernel.raw_lengthscale.grad.item())
            print("online ls", model.covar_module.base_kernel.lengthscale.item())
            print("ski ls", r_model.covar_module.base_kernel.lengthscale.item())
            print("online noise grad", model.likelihood.raw_noise.grad.item())
            print("ski noise grad", r_model.likelihood.raw_noise.grad.item())
            print("online noise", model.likelihood.noise.item())
            print("ski noise", r_model.likelihood.noise.item())

            # Make sure the gradients are the same
            np.testing.assert_allclose(
                model.covar_module.base_kernel.raw_lengthscale.grad.item(),
                r_model.covar_module.base_kernel.raw_lengthscale.grad.item(),
                rtol=0.01, atol=0.01,
            )
            np.testing.assert_allclose(
                model.likelihood.raw_noise.grad.item(),
                r_model.likelihood.raw_noise.grad.item(),
                rtol=0.01, atol=0.01,
            )

            r_optim.step()
            r_optim.zero_grad()
            optim.step()
            optim.zero_grad()
            model.get_updated_hyper_strategy()

            # Make sure the values are the same
            np.testing.assert_allclose(
                model.covar_module.base_kernel.lengthscale.item(),
                r_model.covar_module.base_kernel.lengthscale.item(),
                rtol=0.01, atol=0.01,
            )
            np.testing.assert_allclose(
                model.likelihood.noise.item(),
                r_model.likelihood.noise.item(),
                rtol=0.01, atol=0.01,
            )

            # Verify the gradients still agree (both are zero after zero_grad)
            np.testing.assert_allclose(
                model.covar_module.base_kernel.raw_lengthscale.grad.item(),
                r_model.covar_module.base_kernel.raw_lengthscale.grad.item(),
                rtol=0.01, atol=0.01,
            )
            np.testing.assert_allclose(
                model.likelihood.raw_noise.grad.item(),
                r_model.likelihood.raw_noise.grad.item(),
            )
        return r_model, model

        # dot = make_dot(mll_val, dict(model.named_parameters()))
        # dot.render('test-mll_graph.gv', view=True)
        # r_dot = make_dot(r_mll_val, dict(r_model.named_parameters()))
        # r_dot.render('test-r_mll_graph.gv', view=True)
        #
        # self.assertAlmostEqual(mll_val.item(), r_mll_val.item(), places=4)

    r_model, model = observe_and_update(
        r_model, model,
        self.lengthscale, self.noise_var,
        self.points_sequence[1], self.targets_sequence[1],
        set_online=True,
    )
    ls = deepcopy(model.covar_module.base_kernel.lengthscale.item())
    nv = deepcopy(model.likelihood.noise.item())
    r_model, model = observe_and_update(r_model, model, ls, nv,
                                        self.points_sequence[2], self.targets_sequence[2])
    ls = deepcopy(model.covar_module.base_kernel.lengthscale.item())
    nv = deepcopy(model.likelihood.noise.item())
    r_model, model = observe_and_update(r_model, model, ls, nv,
                                        self.points_sequence[3], self.targets_sequence[3])
    ls = deepcopy(model.covar_module.base_kernel.lengthscale.item())
    nv = deepcopy(model.likelihood.noise.item())
    observe_and_update(r_model, model, ls, nv,
                       self.points_sequence[4], self.targets_sequence[4])
def __init__(self, train_x, train_y, likelihood):
    super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
    self.mean_module = ConstantMean(prior=SmoothedBoxPrior(-1e-5, 1e-5))
    self.base_covar_module = ScaleKernel(
        RBFKernel(lengthscale_prior=SmoothedBoxPrior(exp(-5), exp(6), sigma=0.1)))
    self.covar_module = GridInterpolationKernel(self.base_covar_module, grid_size=50, num_dims=1)
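# A minimal training-loop sketch for the SKI model above, assuming the usual
# GPyTorch exact-GP workflow; train_x, train_y, and the iteration count are
# placeholders rather than values from the original snippet.
model = GPRegressionModel(train_x, train_y, likelihood)
model.train()
likelihood.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
for _ in range(50):
    optimizer.zero_grad()
    output = model(train_x)
    loss = -mll(output, train_y)  # negative marginal log likelihood
    loss.backward()
    optimizer.step()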
def update(lengthscale, noise_var, xs, ys):
    r_lik = GaussianLikelihood()
    r_kernel = GridInterpolationKernelWithFantasy(
        RBFKernel(), grid_size=self.grid_size, grid_bounds=[(-4.0, 14.0)]).double()
    r_model = RegularExactGP(xs, ys, r_lik, r_kernel, ZeroMean())

    lik = GaussianLikelihood()
    kernel = GridInterpolationKernelWithFantasy(
        RBFKernel(), grid_size=self.grid_size, grid_bounds=[(-4.0, 14.0)]).double()
    model = OnlineWoodburyGP(xs, ys, lik, kernel, ZeroMean())

    r_model.covar_module.base_kernel.lengthscale = lengthscale
    model.covar_module.base_kernel.lengthscale = lengthscale
    r_model.likelihood.noise = noise_var
    model.likelihood.noise = noise_var

    r_model.train()
    r_optim = torch.optim.SGD(r_model.parameters(), self.lr)
    model.train()
    optim = torch.optim.SGD(model.parameters(), self.lr)

    with gpytorch.settings.fast_computations(), \
            gpytorch.settings.max_cholesky_size(1), \
            gpytorch.settings.skip_logdet_forward():
        r_mll = ExactMarginalLogLikelihood(r_model.likelihood, r_model)
        r_train_output = r_model(r_model.train_inputs[0])
        r_mll_val = r_mll(r_train_output, r_model.train_targets)

        mll = WoodburyExactMarginalLogLikelihood(model.likelihood, model)
        train_output = model(model.train_inputs[0])
        mll_val = mll(train_output, model.train_targets)

        loss = -mll_val
        loss.backward()
        r_loss = -r_mll_val
        r_loss.backward()

        print("online ls grad", model.covar_module.base_kernel.raw_lengthscale.grad.item())
        print("ski ls grad", r_model.covar_module.base_kernel.raw_lengthscale.grad.item())
        print("online ls", model.covar_module.base_kernel.lengthscale.item())
        print("ski ls", r_model.covar_module.base_kernel.lengthscale.item())
        print("online noise grad", model.likelihood.raw_noise.grad.item())
        print("ski noise grad", r_model.likelihood.raw_noise.grad.item())
        print("online noise", model.likelihood.noise.item())
        print("ski noise", r_model.likelihood.noise.item())

        # Make sure the gradients are the same
        np.testing.assert_allclose(
            model.covar_module.base_kernel.raw_lengthscale.grad.item(),
            r_model.covar_module.base_kernel.raw_lengthscale.grad.item(),
            rtol=0.01, atol=0.01,
        )
        np.testing.assert_allclose(
            model.likelihood.raw_noise.grad.item(),
            r_model.likelihood.raw_noise.grad.item(),
            rtol=0.01, atol=0.01,
        )

        r_optim.step()
        r_optim.zero_grad()
        optim.step()
        optim.zero_grad()
        model.get_updated_hyper_strategy()

        # Make sure the values are the same
        np.testing.assert_allclose(
            model.covar_module.base_kernel.lengthscale.item(),
            r_model.covar_module.base_kernel.lengthscale.item(),
            rtol=0.01, atol=0.01,
        )
        np.testing.assert_allclose(
            model.likelihood.noise.item(),
            r_model.likelihood.noise.item(),
            rtol=0.01, atol=0.01,
        )

        # Verify the gradients still agree (both are zero after zero_grad)
        np.testing.assert_allclose(
            model.covar_module.base_kernel.raw_lengthscale.grad.item(),
            r_model.covar_module.base_kernel.raw_lengthscale.grad.item(),
            rtol=0.01, atol=0.01,
        )
        np.testing.assert_allclose(
            model.likelihood.raw_noise.grad.item(),
            r_model.likelihood.raw_noise.grad.item(),
        )
    return r_model, model
def __init__(
    self,
    num_outputs,
    initial_lengthscale,
    initial_inducing_points,
    separate_inducing_points=False,
    kernel="RBF",
    ard=None,
    lengthscale_prior=False,
):
    n_inducing_points = initial_inducing_points.shape[0]

    if separate_inducing_points:
        # Use independent inducing points per output GP
        initial_inducing_points = initial_inducing_points.repeat(num_outputs, 1, 1)

    if num_outputs > 1:
        batch_shape = torch.Size([num_outputs])
    else:
        batch_shape = torch.Size([])

    variational_distribution = CholeskyVariationalDistribution(
        n_inducing_points, batch_shape=batch_shape
    )
    variational_strategy = VariationalStrategy(
        self, initial_inducing_points, variational_distribution
    )
    if num_outputs > 1:
        variational_strategy = IndependentMultitaskVariationalStrategy(
            variational_strategy, num_tasks=num_outputs
        )
    super().__init__(variational_strategy)

    if lengthscale_prior:
        lengthscale_prior = SmoothedBoxPrior(math.exp(-1), math.exp(1), sigma=0.1)
    else:
        lengthscale_prior = None

    kwargs = {
        "ard_num_dims": ard,
        "batch_shape": batch_shape,
        "lengthscale_prior": lengthscale_prior,
    }
    if kernel == "RBF":
        kernel = RBFKernel(**kwargs)
    elif kernel == "Matern12":
        kernel = MaternKernel(nu=1 / 2, **kwargs)
    elif kernel == "Matern32":
        kernel = MaternKernel(nu=3 / 2, **kwargs)
    elif kernel == "Matern52":
        kernel = MaternKernel(nu=5 / 2, **kwargs)
    elif kernel == "RQ":
        kernel = RQKernel(**kwargs)
    else:
        raise ValueError("Specified kernel not known.")

    kernel.lengthscale = initial_lengthscale * torch.ones_like(kernel.lengthscale)

    self.mean_module = ConstantMean(batch_shape=batch_shape)
    self.covar_module = ScaleKernel(kernel, batch_shape=batch_shape)
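# A hypothetical instantiation of the variational GP above; the class name GP,
# the inducing-point initialization from training data, and the median-distance
# lengthscale heuristic are all assumptions for illustration.
initial_inducing_points = train_x[:64].clone()
initial_lengthscale = torch.pdist(train_x).median()
gp = GP(
    num_outputs=1,
    initial_lengthscale=initial_lengthscale,
    initial_inducing_points=initial_inducing_points,
    kernel="Matern52",
)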
def __init__(self, train_x, train_y, likelihood):
    super(MultitaskGPModel, self).__init__(train_x, train_y, likelihood)
    self.mean_module = MultitaskMean(ConstantMean(), n_tasks=2)
    self.data_covar_module = GridInterpolationKernel(RBFKernel(), grid_size=100, grid_bounds=[(0, 1)])
    self.covar_module = MultitaskKernel(self.data_covar_module, n_tasks=2, rank=1)
def test_initialize_outputscale_batch(self):
    kernel = ScaleKernel(RBFKernel(), batch_shape=torch.Size([2]))
    outputscale_init = torch.tensor([3.14, 4.13])
    kernel.initialize(outputscale=outputscale_init)
    actual_value = outputscale_init.view_as(kernel.outputscale)
    self.assertLess(torch.norm(kernel.outputscale - actual_value), 1e-5)
def _test_inv_quad_logdet(self, inv_quad_rhs=None, logdet=False, improper_logdet=False, add_diag=False):
    # Set up
    x = torch.randn(*self.__class__.matrix_shape[:-1], 3)
    kern = RBFKernel()
    kern_copy = RBFKernel()
    mat = kern(x).evaluate()
    mat_clone = kern_copy(x).evaluate()
    if inv_quad_rhs is not None:
        inv_quad_rhs.requires_grad_(True)
        inv_quad_rhs_clone = inv_quad_rhs.detach().clone().requires_grad_(True)

    mat_clone_with_diag = mat_clone
    if add_diag:
        mat_clone_with_diag = mat_clone_with_diag + torch.eye(mat_clone.size(-1))

    if inv_quad_rhs is not None:
        actual_inv_quad = mat_clone_with_diag.inverse().matmul(inv_quad_rhs_clone).mul(inv_quad_rhs_clone)
        actual_inv_quad = (
            actual_inv_quad.sum([-1, -2]) if inv_quad_rhs.dim() >= 2 else actual_inv_quad.sum()
        )
    if logdet:
        flattened_tensor = mat_clone_with_diag.view(-1, *mat_clone.shape[-2:])
        logdets = torch.cat([mat.logdet().unsqueeze(0) for mat in flattened_tensor])
        if mat_clone.dim() > 2:
            actual_logdet = logdets.view(*mat_clone.shape[:-2])
        else:
            actual_logdet = logdets.squeeze()

    # Compute values with LazyTensor
    _wrapped_cg = MagicMock(wraps=gpytorch.utils.linear_cg)
    with gpytorch.settings.num_trace_samples(2000), \
            gpytorch.settings.max_cholesky_size(0), \
            gpytorch.settings.cg_tolerance(1e-5), \
            gpytorch.settings.skip_logdet_forward(improper_logdet), \
            patch("gpytorch.utils.linear_cg", new=_wrapped_cg) as linear_cg_mock, \
            gpytorch.settings.min_preconditioning_size(0), \
            gpytorch.settings.max_preconditioner_size(30):
        lazy_tensor = NonLazyTensor(mat)
        if add_diag:
            lazy_tensor = lazy_tensor.add_jitter(1.0)
        res_inv_quad, res_logdet = lazy_tensor.inv_quad_logdet(inv_quad_rhs=inv_quad_rhs, logdet=logdet)

    # Compare forward pass
    if inv_quad_rhs is not None:
        self.assertAllClose(res_inv_quad, actual_inv_quad, rtol=1e-2)
    if logdet and not improper_logdet:
        self.assertAllClose(res_logdet, actual_logdet, rtol=1e-1, atol=2e-1)

    # Backward
    if inv_quad_rhs is not None:
        actual_inv_quad.sum().backward(retain_graph=True)
        res_inv_quad.sum().backward(retain_graph=True)
    if logdet:
        actual_logdet.sum().backward()
        res_logdet.sum().backward()

    self.assertAllClose(kern.raw_lengthscale.grad, kern_copy.raw_lengthscale.grad, rtol=1e-2, atol=1e-2)
    if inv_quad_rhs is not None:
        self.assertAllClose(inv_quad_rhs.grad, inv_quad_rhs_clone.grad, rtol=2e-2, atol=1e-2)

    # Make sure CG was called
    self.assertTrue(linear_cg_mock.called)
def create_kernel_no_ard(self, **kwargs):
    base_kernel = RBFKernel()
    kernel = ScaleKernel(base_kernel, **kwargs)
    return kernel
def build(self):
    """Right now this isn't needed by this method."""

    def prod(iterable):
        return reduce(operator.mul, iterable)

    mass_kernel = RBFKernel(active_dims=1, lengthscale_constraint=GreaterThan(10.))
    time_kernel = RBFKernel(active_dims=0, lengthscale_constraint=GreaterThan(0.1))
    spin_kernels = [
        RBFKernel(active_dims=dimension, lengthscale_constraint=GreaterThan(7))
        for dimension in range(2, 8)
    ]

    class ExactGPModel(gpytorch.models.ExactGP):
        """Use the GPyTorch exact GP."""

        def __init__(self, train_x, train_y, likelihood):
            """Initialise the model."""
            super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
            self.mean_module = gpytorch.means.ZeroMean()
            self.covar_module = gpytorch.kernels.ScaleKernel(
                time_kernel * mass_kernel * prod(spin_kernels),
                lengthscale_constraint=gpytorch.constraints.LessThan(0.01))

        def forward(self, x):
            """Run the forward method of the model."""
            mean_x = self.mean_module(x)
            covar_x = self.covar_module(x)
            return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

    data = np.genfromtxt(
        pkg_resources.resource_filename('heron', 'models/data/gt-M60-F1024.dat'))

    training_x = self.training_x = torch.tensor(data[:, 0:-2] * 100).float().cuda()
    training_y = self.training_y = torch.tensor(data[:, -2] * 1e21).float().cuda()
    training_yx = torch.tensor(data[:, -1] * 1e21).float().cuda()

    likelihood = gpytorch.likelihoods.GaussianLikelihood(noise_constraint=LessThan(10))
    model = ExactGPModel(training_x, training_y, likelihood)
    model2 = ExactGPModel(training_x, training_yx, likelihood)
    state_vector = pkg_resources.resource_filename('heron', 'models/data/gt-gpytorch.pth')

    model = model.cuda()
    model2 = model2.cuda()
    likelihood = likelihood.cuda()

    model.load_state_dict(torch.load(state_vector))
    model2.load_state_dict(torch.load(state_vector))
    return [model, model2], likelihood
def test_trace_logdet_quad_form_factory():
    x = Variable(torch.linspace(0, 1, 10))
    rbf_covar = RBFKernel()
    rbf_covar.initialize(log_lengthscale=-4)
    covar_module = GridInterpolationKernel(rbf_covar)
    covar_module.eval()
    covar_module.initialize_interpolation_grid(4, [(0, 1)])
    c = Variable(covar_module.forward(x.unsqueeze(1), x.unsqueeze(1)).c.data, requires_grad=True)

    T = Variable(torch.zeros(4, 4))
    for i in range(4):
        for j in range(4):
            T[i, j] = utils.toeplitz.toeplitz_getitem(c, c, i, j)

    U = torch.randn(4, 4).triu()
    U = Variable(U.mul(U.diag().sign().unsqueeze(1).expand_as(U).triu()), requires_grad=True)
    mu_diff = Variable(torch.randn(4), requires_grad=True)

    actual = _det(T).log() + mu_diff.dot(T.inverse().mv(mu_diff)) + T.inverse().mm(U.t().mm(U)).trace()
    actual.backward()
    actual_c_grad = c.grad.data.clone()
    actual_mu_diff_grad = mu_diff.grad.data.clone()
    actual_U_grad = U.grad.data.clone()

    c.grad.data.fill_(0)
    mu_diff.grad.data.fill_(0)
    U.grad.data.fill_(0)

    def _matmul_closure_factory(*args):
        c, = args
        return lambda mat2: sym_toeplitz_matmul(c, mat2)

    def _derivative_quadratic_form_factory(*args):
        return lambda left_vector, right_vector: (
            sym_toeplitz_derivative_quadratic_form(left_vector, right_vector),
        )

    covar_args = (c, )
    gpytorch.functions.num_trace_samples = 1000
    res = trace_logdet_quad_form_factory(
        _matmul_closure_factory, _derivative_quadratic_form_factory)()(mu_diff, U, *covar_args)
    res.backward()
    res_c_grad = c.grad.data
    res_mu_diff_grad = mu_diff.grad.data
    res_U_grad = U.grad.data
    assert (res.data - actual.data).norm() / actual.data.norm() < 0.15
    assert (res_c_grad - actual_c_grad).norm() / actual_c_grad.norm() < 0.15
    assert (res_mu_diff_grad - actual_mu_diff_grad).norm() / actual_mu_diff_grad.norm() < 1e-3
    assert (res_U_grad - actual_U_grad).norm() / actual_U_grad.norm() < 1e-3

    c.grad.data.fill_(0)
    mu_diff.grad.data.fill_(0)
    U.grad.data.fill_(0)

    covar_args = (c, )
    gpytorch.functions.fastest = False
    res = trace_logdet_quad_form_factory(
        _matmul_closure_factory, _derivative_quadratic_form_factory)()(mu_diff, U, *covar_args)
    res.backward()
    res_c_grad = c.grad.data
    res_mu_diff_grad = mu_diff.grad.data
    res_U_grad = U.grad.data
    assert (res.data - actual.data).norm() / actual.data.norm() < 1e-3
    assert (res_c_grad - actual_c_grad).norm() / actual_c_grad.norm() < 1e-3
    assert (res_mu_diff_grad - actual_mu_diff_grad).norm() / actual_mu_diff_grad.norm() < 1e-3
    assert (res_U_grad - actual_U_grad).norm() / actual_U_grad.norm() < 1e-3
def __init__(self):
    super(GPRegressionModel, self).__init__(grid_size=20, grid_bounds=[(-0.05, 1.05)])
    self.mean_module = ConstantMean(prior=SmoothedBoxPrior(-10, 10))
    self.covar_module = ScaleKernel(
        RBFKernel(log_lengthscale_prior=SmoothedBoxPrior(exp(-3), exp(6), sigma=0.1, log_transform=True)))
def kernel_fun(rbf_var, rbf_lengthscale, lin_var):
    return (gpytorch.kernels.ScaleKernel(
        RBFKernel(lengthscale=torch.tensor(rbf_lengthscale)),
        outputscale=torch.tensor(rbf_var))
            + ScaleKernel(LinearKernel(), outputscale=torch.tensor(lin_var)))
def __init__(self, train_inputs, train_targets, likelihood):
    super(ExactGPModel, self).__init__(train_inputs, train_targets, likelihood)
    self.mean_module = ConstantMean(constant_bounds=(-1, 1))
    self.covar_module = RBFKernel(log_lengthscale_bounds=(-3, 3))
def gpnet(args, dataloader, test_x, prior_gp):
    N = len(dataloader.dataset)
    x_dim = 1
    prior_gp.train()
    if args.net == 'tangent':
        kernel = prior_gp.covar_module
        bnn_prev = FirstOrder([x_dim] + [args.n_hidden] * args.n_layer, mvn=False)
        bnn = FirstOrder([x_dim] + [args.n_hidden] * args.n_layer, mvn=True)
    elif args.net == 'deep':
        kernel = prior_gp.covar_module
        bnn_prev = DeepKernel([x_dim] + [args.n_hidden] * args.n_layer, mvn=False)
        bnn = DeepKernel([x_dim] + [args.n_hidden] * args.n_layer, mvn=True)
    elif args.net == 'rf':
        kernel = ScaleKernel(RBFKernel())
        kernel_prev = ScaleKernel(RBFKernel())
        bnn_prev = RFExpansion(x_dim, args.n_hidden, kernel_prev, mvn=False,
                               fix_ls=args.fix_rf_ls, residual=args.residual)
        bnn = RFExpansion(x_dim, args.n_hidden, kernel,
                          fix_ls=args.fix_rf_ls, residual=args.residual)
        bnn_prev.load_state_dict(bnn.state_dict())
    else:
        raise NotImplementedError('Unknown inference net')

    bnn = bnn.to(args.device)
    bnn_prev = bnn_prev.to(args.device)
    prior_gp = prior_gp.to(args.device)
    infer_gpnet_optimizer = optim.Adam(bnn.parameters(), lr=args.learning_rate)
    hyper_opt_optimizer = optim.Adam(prior_gp.parameters(), lr=args.hyper_rate)
    x_min, x_max = dataloader.dataset.range

    bnn.train()
    bnn_prev.train()
    mb = master_bar(range(1, args.n_iters + 1))
    for t in mb:
        # Hyperparameter selection
        beta = args.beta0 * 1. / (1. + args.gamma * math.sqrt(t - 1))
        dl_bar = progress_bar(dataloader, parent=mb)
        for x, y in dl_bar:
            observed_size = x.size(0)
            x, y = x.to(args.device), y.to(args.device)
            x_star = torch.Tensor(args.measurement_size, x_dim).uniform_(x_min, x_max).to(args.device)
            # [Batch + Measurement Points x x_dims]
            xx = torch.cat([x, x_star], 0)

            infer_gpnet_optimizer.zero_grad()
            hyper_opt_optimizer.zero_grad()

            # inference net
            # Eq.(6) Prior p(f): \mu_1 = 0, \Sigma_1
            mean_prior = torch.zeros(observed_size).to(args.device)
            K_prior = kernel(xx, xx).add_jitter(1e-6)
            # q_{\gamma_t}(f_M, f_n) = Normal(\mu_2, \Sigma_2 | x_n, x_m)
            qff_mean_prev, K_prox = bnn_prev(xx)
            # Eq.(8) adapt prior: p(f)^\beta x q(f)^{1 - \beta}
            mean_adapt, K_adapt = product_gaussians(
                mu1=mean_prior, sigma1=K_prior,
                mu2=qff_mean_prev, sigma2=K_prox, beta=beta)
            # Eq.(8)
            (mean_n, mean_m), (Knn, Knm, Kmm) = split_gaussian(mean_adapt, K_adapt, observed_size)
            # Eq.(2) K_{D,D} + noise / (N \beta_t)
            Ky = Knn + torch.eye(observed_size).to(args.device) * prior_gp.likelihood.noise / (
                N / observed_size * beta)
            Ky_tril = torch.cholesky(Ky)
            # Eq.(2)
            mean_target = Knm.t().mm(cholesky_solve(y - mean_n, Ky_tril)) + mean_m
            mean_target = mean_target.squeeze(-1)
            K_target = gpytorch.add_jitter(Kmm - Knm.t().mm(cholesky_solve(Knm, Ky_tril)), 1e-6)
            # \hat{q}_{t+1}(f_M)
            target_pf_star = MultivariateNormal(mean_target, K_target)
            # q_\gamma(f_M)
            qf_star = bnn(x_star)
            # Eq.(11)
            kl_obj = kl_div(qf_star, target_pf_star).sum()
            kl_obj.backward(retain_graph=True)
            infer_gpnet_optimizer.step()

            # Hyper-parameter update
            (mean_n_prior, _), (Kn_prior, _, _) = split_gaussian(mean_prior, K_prior, observed_size)
            pf = MultivariateNormal(mean_n_prior, Kn_prior)
            (qf_prev_mean, _), (Kn_prox, _, _) = split_gaussian(qff_mean_prev, K_prox, observed_size)
            qf_prev = MultivariateNormal(qf_prev_mean, Kn_prox)
            hyper_obj = -(prior_gp.likelihood.expected_log_prob(y.squeeze(-1), qf_prev)
                          - kl_div(qf_prev, pf))
            hyper_obj.backward(retain_graph=True)
            hyper_opt_optimizer.step()

            mb.child.comment = "kl_obj = {:.3f}, obs_var={:.3f}".format(
                kl_obj.item(), prior_gp.likelihood.noise.item())

        # update q_{\gamma_t} to q_{\gamma_{t+1}}
        bnn_prev.load_state_dict(bnn.state_dict())
        if args.net == 'rf':
            kernel_prev.load_state_dict(kernel.state_dict())
        if t % 50 == 0:
            mb.write("Iter {}/{}, kl_obj = {:.4f}, noise = {:.4f}".format(
                t, args.n_iters, kl_obj.item(), prior_gp.likelihood.noise.item()))

    test_x = test_x.to(args.device)
    test_stats = evaluate(bnn, prior_gp.likelihood, test_x, args.net == 'tangent')
    return test_stats
def test_random_fourier_features(self):
    # test kernel that is not Scale, RBF, or Matern
    with self.assertRaises(NotImplementedError):
        RandomFourierFeatures(
            kernel=PeriodicKernel(),
            input_dim=2,
            num_rff_features=3,
        )
    # test batched kernel
    with self.assertRaises(NotImplementedError):
        RandomFourierFeatures(
            kernel=RBFKernel(batch_shape=torch.Size([2])),
            input_dim=2,
            num_rff_features=3,
        )
    tkwargs = {"device": self.device}
    for dtype in (torch.float, torch.double):
        tkwargs["dtype"] = dtype
        # test init
        # test ScaleKernel
        base_kernel = RBFKernel(ard_num_dims=2)
        kernel = ScaleKernel(base_kernel).to(**tkwargs)
        rff = RandomFourierFeatures(
            kernel=kernel,
            input_dim=2,
            num_rff_features=3,
        )
        self.assertTrue(torch.equal(rff.outputscale, kernel.outputscale))
        # check that rff makes a copy
        self.assertFalse(rff.outputscale is kernel.outputscale)
        self.assertTrue(torch.equal(rff.lengthscale, base_kernel.lengthscale))
        # check that rff makes a copy
        self.assertFalse(rff.lengthscale is kernel.lengthscale)

        # test not ScaleKernel
        rff = RandomFourierFeatures(
            kernel=base_kernel,
            input_dim=2,
            num_rff_features=3,
        )
        self.assertTrue(torch.equal(rff.outputscale, torch.tensor(1, **tkwargs)))
        self.assertTrue(torch.equal(rff.lengthscale, base_kernel.lengthscale))
        # check that rff makes a copy
        self.assertFalse(rff.lengthscale is kernel.lengthscale)
        self.assertEqual(rff.weights.shape, torch.Size([2, 3]))
        self.assertEqual(rff.bias.shape, torch.Size([3]))
        self.assertTrue(((rff.bias <= 2 * pi) & (rff.bias >= 0.0)).all())

        # test forward
        rff = RandomFourierFeatures(
            kernel=kernel,
            input_dim=2,
            num_rff_features=3,
        )
        for batch_shape in (torch.Size([]), torch.Size([3])):
            X = torch.rand(*batch_shape, 1, 2, **tkwargs)
            Y = rff(X)
            self.assertTrue(Y.shape, torch.Size([*batch_shape, 1, 1]))
            expected_Y = torch.sqrt(2 * rff.outputscale / rff.weights.shape[-1]) * (
                torch.cos(X / base_kernel.lengthscale @ rff.weights + rff.bias))
            self.assertTrue(torch.equal(Y, expected_Y))

        # test get_weights
        with mock.patch("torch.randn", wraps=torch.randn) as mock_randn:
            rff._get_weights(base_kernel=base_kernel, input_dim=2, num_rff_features=3)
            mock_randn.assert_called_once_with(
                2,
                3,
                dtype=base_kernel.lengthscale.dtype,
                device=base_kernel.lengthscale.device,
            )
        # test get_weights with Matern kernel
        with mock.patch("torch.randn", wraps=torch.randn) as mock_randn, \
                mock.patch("torch.distributions.Gamma",
                           wraps=torch.distributions.Gamma) as mock_gamma:
            base_kernel = MaternKernel(ard_num_dims=2).to(**tkwargs)
            rff._get_weights(base_kernel=base_kernel, input_dim=2, num_rff_features=3)
            mock_randn.assert_called_once_with(
                2,
                3,
                dtype=base_kernel.lengthscale.dtype,
                device=base_kernel.lengthscale.device,
            )
            mock_gamma.assert_called_once_with(
                base_kernel.nu,
                base_kernel.nu,
            )
def create_kernel_no_ard(self, **kwargs):
    return NewtonGirardAdditiveKernel(RBFKernel(), 4, 2, **kwargs)
def __init__(self, train_x, train_y, likelihood):
    super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
    self.mean_module = ConstantMean()
    self.base_covar_module = ScaleKernel(RBFKernel())
    self.covar_module = InducingPointKernel(
        self.base_covar_module, inducing_points=train_x[:500, :], likelihood=likelihood)
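# A short prediction sketch for the sparse (InducingPointKernel) model above,
# assuming the standard GPyTorch eval workflow; test_x is a placeholder.
model.eval()
likelihood.eval()
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    preds = likelihood(model(test_x))
    mean = preds.mean
    lower, upper = preds.confidence_region()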
def create_kernel_no_ard(self, **kwargs):
    return RBFKernel(**kwargs)
def test_initialize_outputscale(self):
    kernel = ScaleKernel(RBFKernel())
    kernel.initialize(outputscale=3.14)
    actual_value = torch.tensor(3.14).view_as(kernel.outputscale)
    self.assertLess(torch.norm(kernel.outputscale - actual_value), 1e-5)
def test_initialize_lengthscale(self):
    kernel = RBFKernel()
    kernel.initialize(lengthscale=3.14)
    actual_value = torch.tensor(3.14).view_as(kernel.lengthscale)
    self.assertLess(torch.norm(kernel.lengthscale - actual_value), 1e-5)
def __init__(self, train_x, train_y, likelihood):
    super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
    self.mean_module = ConstantMean(constant_bounds=(-1, 1))
    self.base_covar_module = RBFKernel(log_lengthscale_bounds=(-3, 3))
    self.covar_module = GridInterpolationKernel(
        self.base_covar_module, grid_size=64, grid_bounds=[(0, 1), (0, 1)])