def forward(self, x): """Compute the resulting batch-distribution.""" return MultivariateNormal(self.mean(x), self.kernel(x))
def posterior(
    self,
    X: Tensor,
    output_indices: Optional[List[int]] = None,
    observation_noise: Union[bool, Tensor] = False,
    **kwargs: Any,
) -> GPyTorchPosterior:
    self.eval()  # make sure we're calling a posterior
    no_pred_variance = skip_posterior_variances._state
    with ExitStack() as es:
        es.enter_context(gpt_posterior_settings())
        es.enter_context(fast_pred_var(True))
        # we need to skip posterior variances here
        es.enter_context(skip_posterior_variances(True))
        mvn = self(X)
        if observation_noise is not False:
            # TODO: implement Kronecker + diagonal solves so that this is possible.
            # if torch.is_tensor(observation_noise):
            #     # TODO: Validate noise shape
            #     # make observation_noise `batch_shape x q x n`
            #     obs_noise = observation_noise.transpose(-1, -2)
            #     mvn = self.likelihood(mvn, X, noise=obs_noise)
            # elif isinstance(self.likelihood, FixedNoiseGaussianLikelihood):
            #     noise = self.likelihood.noise.mean().expand(X.shape[:-1])
            #     mvn = self.likelihood(mvn, X, noise=noise)
            # else:
            mvn = self.likelihood(mvn, X)

        # lazy covariance matrix includes the interpolated version of the full
        # covariance matrix so we can actually grab that instead.
        if X.ndimension() > self.train_inputs[0].ndimension():
            X_batch_shape = X.shape[:-2]
            train_inputs = self.train_inputs[0].reshape(
                *[1] * len(X_batch_shape), *self.train_inputs[0].shape
            )
            train_inputs = train_inputs.repeat(
                *X_batch_shape, *[1] * self.train_inputs[0].ndimension()
            )
        else:
            train_inputs = self.train_inputs[0]
        full_covar = self.covar_modules[0](torch.cat((train_inputs, X), dim=-2))

        if no_pred_variance:
            pred_variance = mvn.variance
        else:
            # we detach all of the latent dimension posteriors which precludes
            # computing quantities computed on the posterior wrt latents as
            # this reduces the memory overhead somewhat
            # TODO: add these back in if necessary
            joint_covar = self._get_joint_covariance([X])
            pred_variance = self.make_posterior_variances(joint_covar)

            full_covar = KroneckerProductLazyTensor(
                full_covar, *[x.detach() for x in joint_covar.lazy_tensors[1:]]
            )

        joint_covar_list = [self.covar_modules[0](X, train_inputs)]
        batch_shape = joint_covar_list[0].batch_shape
        for cm, param in zip(self.covar_modules[1:], self.latent_parameters):
            covar = cm(param).detach()
            if covar.batch_shape != batch_shape:
                covar = BatchRepeatLazyTensor(covar, batch_shape)
            joint_covar_list.append(covar)

        test_train_covar = KroneckerProductLazyTensor(*joint_covar_list)

        # mean and variance get reshaped into the target shape
        new_mean = mvn.mean.reshape(*X.shape[:-1], *self.target_shape)
        if not no_pred_variance:
            new_variance = pred_variance.reshape(*X.shape[:-1], *self.target_shape)
            new_variance = DiagLazyTensor(new_variance)
        else:
            new_variance = ZeroLazyTensor(
                *X.shape[:-1], *self.target_shape, self.target_shape[-1]
            )

        mvn = MultivariateNormal(new_mean, new_variance)

        train_train_covar = self.prediction_strategy.lik_train_train_covar.detach()

        # return a specialized Posterior to allow for sampling
        # cloning the full covar allows backpropagation through it
        posterior = HigherOrderGPPosterior(
            mvn=mvn,
            train_targets=self.train_targets.unsqueeze(-1),
            train_train_covar=train_train_covar,
            test_train_covar=test_train_covar,
            joint_covariance_matrix=full_covar.clone(),
            output_shape=Size((*X.shape[:-1], *self.target_shape)),
            num_outputs=self._num_outputs,
        )

        if hasattr(self, "outcome_transform"):
            posterior = self.outcome_transform.untransform_posterior(posterior)
        return posterior
def test_expected_improvement_batch(self):
    for dtype in (torch.float, torch.double):
        mean = torch.tensor(
            [-0.5, 0.0, 0.5], device=self.device, dtype=dtype
        ).view(3, 1, 1)
        variance = torch.ones(3, 1, 1, device=self.device, dtype=dtype)
        mm = MockModel(MockPosterior(mean=mean, variance=variance))
        module = ExpectedImprovement(model=mm, best_f=0.0)
        X = torch.empty(3, 1, 1, device=self.device, dtype=dtype)  # dummy
        ei = module(X)
        ei_expected = torch.tensor(
            [0.19780, 0.39894, 0.69780], device=self.device, dtype=dtype
        )
        self.assertTrue(torch.allclose(ei, ei_expected, atol=1e-4))
        # check for proper error if multi-output model
        mean2 = torch.rand(3, 1, 2, device=self.device, dtype=dtype)
        variance2 = torch.rand(3, 1, 2, device=self.device, dtype=dtype)
        mm2 = MockModel(MockPosterior(mean=mean2, variance=variance2))
        with self.assertRaises(UnsupportedError):
            ExpectedImprovement(model=mm2, best_f=0.0)

        # test objective (single-output)
        mean = torch.tensor([[[0.5]], [[0.25]]], device=self.device, dtype=dtype)
        covar = torch.tensor(
            [[[[0.16]]], [[[0.125]]]], device=self.device, dtype=dtype
        )
        mvn = MultivariateNormal(mean, covar)
        p = GPyTorchPosterior(mvn)
        mm = MockModel(p)
        weights = torch.tensor([0.5], device=self.device, dtype=dtype)
        obj = ScalarizedObjective(weights)
        ei = ExpectedImprovement(model=mm, best_f=0.0, objective=obj)
        X = torch.rand(2, 1, 2, device=self.device, dtype=dtype)
        ei_expected = torch.tensor(
            [[0.2601], [0.1500]], device=self.device, dtype=dtype
        )
        self.assertTrue(torch.allclose(ei(X), ei_expected, atol=1e-4))

        # test objective (multi-output)
        mean = torch.tensor(
            [[[-0.25, 0.5]], [[0.2, -0.1]]], device=self.device, dtype=dtype
        )
        covar = torch.tensor(
            [[[0.5, 0.125], [0.125, 0.5]], [[0.25, -0.1], [-0.1, 0.25]]],
            device=self.device,
            dtype=dtype,
        )
        mvn = MultitaskMultivariateNormal(mean, covar)
        p = GPyTorchPosterior(mvn)
        mm = MockModel(p)
        weights = torch.tensor([2.0, 1.0], device=self.device, dtype=dtype)
        obj = ScalarizedObjective(weights)
        ei = ExpectedImprovement(model=mm, best_f=0.0, objective=obj)
        X = torch.rand(2, 1, 2, device=self.device, dtype=dtype)
        ei_expected = torch.tensor(
            [0.6910, 0.5371], device=self.device, dtype=dtype
        )
        self.assertTrue(torch.allclose(ei(X), ei_expected, atol=1e-4))

        # test bad objective class
        with self.assertRaises(UnsupportedError):
            ExpectedImprovement(
                model=mm, best_f=0.0, objective=IdentityMCObjective()
            )
def _get_test_posterior_batched(device, dtype=torch.float):
    mean = torch.zeros(3, 2, device=device, dtype=dtype)
    cov = torch.eye(2, device=device, dtype=dtype).repeat(3, 1, 1)
    mvn = MultivariateNormal(mean, cov)
    return GPyTorchPosterior(mvn)
def forward(self, x):
    return MultivariateNormal(self.mean_module(x), self.covar_module(x))
def scalarize_posterior(
    posterior: GPyTorchPosterior, weights: Tensor, offset: float = 0.0
) -> GPyTorchPosterior:
    r"""Affine transformation of a multi-output posterior.

    Args:
        posterior: The posterior over `m` outcomes to be scalarized.
            Supports `t`-batching.
        weights: A tensor of weights of size `m`.
        offset: The offset of the affine transformation.

    Returns:
        The transformed (single-output) posterior. If the input posterior has
        mean `mu` and covariance matrix `Sigma`, this posterior has mean
        `weights^T * mu` and variance `weights^T Sigma weights`.

    Example:
        Example for a model with two outcomes:

        >>> X = torch.rand(1, 2)
        >>> posterior = model.posterior(X)
        >>> weights = torch.tensor([0.5, 0.25])
        >>> new_posterior = scalarize_posterior(posterior, weights=weights)
    """
    if weights.ndim > 1:
        raise BotorchTensorDimensionError("`weights` must be one-dimensional")
    mean = posterior.mean
    q, m = mean.shape[-2:]
    batch_shape = mean.shape[:-2]
    if m != weights.size(0):
        raise RuntimeError("Output shape not equal to that of weights")
    mvn = posterior.mvn
    cov = mvn.lazy_covariance_matrix if mvn.islazy else mvn.covariance_matrix

    if m == 1:  # just scaling, no scalarization necessary
        new_mean = offset + (weights[0] * mean).view(*batch_shape, q)
        new_cov = weights[0] ** 2 * cov
        new_mvn = MultivariateNormal(new_mean, new_cov)
        return GPyTorchPosterior(new_mvn)

    new_mean = offset + (mean @ weights).view(*batch_shape, q)

    if q == 1:
        new_cov = weights.unsqueeze(-2) @ (cov @ weights.unsqueeze(-1))
    else:
        # we need to handle potentially different representations of the
        # multi-task mvn
        if mvn._interleaved:
            w_cov = weights.repeat(q).unsqueeze(0)
            sum_shape = batch_shape + torch.Size([q, m, q, m])
            sum_dims = (-1, -2)
        else:
            # special-case the independent setting
            if isinstance(cov, BlockDiagLazyTensor):
                new_cov = SumLazyTensor(
                    *[
                        cov.base_lazy_tensor[..., i, :, :] * weights[i].pow(2)
                        for i in range(cov.base_lazy_tensor.size(-3))
                    ]
                )
                new_mvn = MultivariateNormal(new_mean, new_cov)
                return GPyTorchPosterior(new_mvn)

            w_cov = torch.repeat_interleave(weights, q).unsqueeze(0)
            sum_shape = batch_shape + torch.Size([m, q, m, q])
            sum_dims = (-2, -3)

        cov_scaled = w_cov * cov * w_cov.transpose(-1, -2)
        # TODO: Do not instantiate full covariance for lazy tensors (ideally we
        # simplify this in GPyTorch:
        # https://github.com/cornellius-gp/gpytorch/issues/1055)
        if isinstance(cov_scaled, LazyTensor):
            cov_scaled = cov_scaled.evaluate()
        new_cov = cov_scaled.view(sum_shape).sum(dim=sum_dims[0]).sum(dim=sum_dims[1])

    new_mvn = MultivariateNormal(new_mean, new_cov)
    return GPyTorchPosterior(new_mvn)
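# A quick sanity check for `scalarize_posterior` above: compare the scalarized
# mean and variance against a direct computation of `weights^T mu` and
# `weights^T Sigma weights` on a small, hand-built two-outcome posterior. This
# is an illustrative sketch only; it assumes the GPyTorch/BoTorch classes used
# in this module and that `scalarize_posterior` is importable from
# `botorch.posteriors.gpytorch`.
import torch
from gpytorch.distributions import MultitaskMultivariateNormal
from botorch.posteriors.gpytorch import GPyTorchPosterior, scalarize_posterior

mean = torch.tensor([[0.5, -0.25]])                  # q=1 point, m=2 outcomes
covar = torch.tensor([[0.04, 0.01], [0.01, 0.09]])   # (q*m) x (q*m), PSD
posterior = GPyTorchPosterior(MultitaskMultivariateNormal(mean, covar))

weights = torch.tensor([2.0, 1.0])
scalarized = scalarize_posterior(posterior, weights=weights)

expected_mean = weights @ mean[0]                    # weights^T mu
expected_var = weights @ covar @ weights             # weights^T Sigma weights
assert torch.allclose(scalarized.mean.squeeze(), expected_mean)
assert torch.allclose(scalarized.variance.squeeze(), expected_var)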
def test_construct_base_samples_from_posterior(self, cuda=False):
    device = torch.device("cuda") if cuda else torch.device("cpu")
    for dtype in (torch.float, torch.double):
        # single-output
        mean = torch.zeros(2, device=device, dtype=dtype)
        cov = torch.eye(2, device=device, dtype=dtype)
        mvn = MultivariateNormal(mean=mean, covariance_matrix=cov)
        posterior = GPyTorchPosterior(mvn=mvn)
        for sample_shape in (torch.Size([5]), torch.Size([5, 3])):
            for qmc in (False, True):
                for seed in (None, 1234):
                    expected_shape = sample_shape + torch.Size([2, 1])
                    samples = construct_base_samples_from_posterior(
                        posterior=posterior,
                        sample_shape=sample_shape,
                        qmc=qmc,
                        seed=seed,
                    )
                    self.assertEqual(samples.shape, expected_shape)
                    self.assertEqual(samples.device.type, device.type)
                    self.assertEqual(samples.dtype, dtype)
        # single-output, batch mode
        mean = torch.zeros(2, 2, device=device, dtype=dtype)
        cov = torch.eye(2, device=device, dtype=dtype).expand(2, 2, 2)
        mvn = MultivariateNormal(mean=mean, covariance_matrix=cov)
        posterior = GPyTorchPosterior(mvn=mvn)
        for sample_shape in (torch.Size([5]), torch.Size([5, 3])):
            for qmc in (False, True):
                for seed in (None, 1234):
                    for collapse_batch_dims in (False, True):
                        if collapse_batch_dims:
                            expected_shape = sample_shape + torch.Size([1, 2, 1])
                        else:
                            expected_shape = sample_shape + torch.Size([2, 2, 1])
                        samples = construct_base_samples_from_posterior(
                            posterior=posterior,
                            sample_shape=sample_shape,
                            qmc=qmc,
                            collapse_batch_dims=collapse_batch_dims,
                            seed=seed,
                        )
                        self.assertEqual(samples.shape, expected_shape)
                        self.assertEqual(samples.device.type, device.type)
                        self.assertEqual(samples.dtype, dtype)
        # multi-output
        mean = torch.zeros(2, 2, device=device, dtype=dtype)
        cov = torch.eye(4, device=device, dtype=dtype)
        mtmvn = MultitaskMultivariateNormal(mean=mean, covariance_matrix=cov)
        posterior = GPyTorchPosterior(mvn=mtmvn)
        for sample_shape in (torch.Size([5]), torch.Size([5, 3])):
            for qmc in (False, True):
                for seed in (None, 1234):
                    expected_shape = sample_shape + torch.Size([2, 2])
                    samples = construct_base_samples_from_posterior(
                        posterior=posterior,
                        sample_shape=sample_shape,
                        qmc=qmc,
                        seed=seed,
                    )
                    self.assertEqual(samples.shape, expected_shape)
                    self.assertEqual(samples.device.type, device.type)
                    self.assertEqual(samples.dtype, dtype)
        # multi-output, batch mode
        mean = torch.zeros(2, 2, 2, device=device, dtype=dtype)
        cov = torch.eye(4, device=device, dtype=dtype).expand(2, 4, 4)
        mtmvn = MultitaskMultivariateNormal(mean=mean, covariance_matrix=cov)
        posterior = GPyTorchPosterior(mvn=mtmvn)
        for sample_shape in (torch.Size([5]), torch.Size([5, 3])):
            for qmc in (False, True):
                for seed in (None, 1234):
                    for collapse_batch_dims in (False, True):
                        if collapse_batch_dims:
                            expected_shape = sample_shape + torch.Size([1, 2, 2])
                        else:
                            expected_shape = sample_shape + torch.Size([2, 2, 2])
                        samples = construct_base_samples_from_posterior(
                            posterior=posterior,
                            sample_shape=sample_shape,
                            qmc=qmc,
                            collapse_batch_dims=collapse_batch_dims,
                            seed=seed,
                        )
                        self.assertEqual(samples.shape, expected_shape)
                        self.assertEqual(samples.device.type, device.type)
                        self.assertEqual(samples.dtype, dtype)
def forward(self, x):
    mean = torch.zeros(torch.Size([x.size(0)]), dtype=x.dtype, device=x.device)
    return MultivariateNormal(mean, gpytorch.lazy.RootLazyTensor(x))
def test_degenerate_GPyTorchPosterior_Multitask(self):
    for dtype in (torch.float, torch.double):
        # singular covariance matrix
        degenerate_covar = torch.tensor(
            [[1, 1, 0], [1, 1, 0], [0, 0, 2]], dtype=dtype, device=self.device
        )
        mean = torch.rand(3, dtype=dtype, device=self.device)
        mvn = MultivariateNormal(mean, lazify(degenerate_covar))
        mvn = MultitaskMultivariateNormal.from_independent_mvns([mvn, mvn])
        posterior = GPyTorchPosterior(mvn=mvn)
        # basics
        self.assertEqual(posterior.device.type, self.device.type)
        self.assertTrue(posterior.dtype == dtype)
        self.assertEqual(posterior.event_shape, torch.Size([3, 2]))
        mean_exp = mean.unsqueeze(-1).repeat(1, 2)
        self.assertTrue(torch.equal(posterior.mean, mean_exp))
        variance_exp = degenerate_covar.diag().unsqueeze(-1).repeat(1, 2)
        self.assertTrue(torch.equal(posterior.variance, variance_exp))
        # rsample
        with warnings.catch_warnings(record=True) as w:
            # we check that the p.d. warning is emitted - this only
            # happens once per posterior, so we need to check only once
            samples = posterior.rsample(sample_shape=torch.Size([4]))
            self.assertEqual(len(w), 1)
            self.assertTrue(issubclass(w[-1].category, RuntimeWarning))
            self.assertTrue("not p.d." in str(w[-1].message))
        self.assertEqual(samples.shape, torch.Size([4, 3, 2]))
        samples2 = posterior.rsample(sample_shape=torch.Size([4, 2]))
        self.assertEqual(samples2.shape, torch.Size([4, 2, 3, 2]))
        # rsample w/ base samples
        base_samples = torch.randn(4, 3, 2, device=self.device, dtype=dtype)
        samples_b1 = posterior.rsample(
            sample_shape=torch.Size([4]), base_samples=base_samples
        )
        samples_b2 = posterior.rsample(
            sample_shape=torch.Size([4]), base_samples=base_samples
        )
        self.assertTrue(torch.allclose(samples_b1, samples_b2))
        base_samples2 = torch.randn(4, 2, 3, 2, device=self.device, dtype=dtype)
        samples2_b1 = posterior.rsample(
            sample_shape=torch.Size([4, 2]), base_samples=base_samples2
        )
        samples2_b2 = posterior.rsample(
            sample_shape=torch.Size([4, 2]), base_samples=base_samples2
        )
        self.assertTrue(torch.allclose(samples2_b1, samples2_b2))
        # collapse_batch_dims
        b_mean = torch.rand(2, 3, dtype=dtype, device=self.device)
        b_degenerate_covar = degenerate_covar.expand(2, *degenerate_covar.shape)
        b_mvn = MultivariateNormal(b_mean, lazify(b_degenerate_covar))
        b_mvn = MultitaskMultivariateNormal.from_independent_mvns([b_mvn, b_mvn])
        b_posterior = GPyTorchPosterior(mvn=b_mvn)
        b_base_samples = torch.randn(4, 1, 3, 2, device=self.device, dtype=dtype)
        with warnings.catch_warnings(record=True) as w:
            b_samples = b_posterior.rsample(
                sample_shape=torch.Size([4]), base_samples=b_base_samples
            )
            self.assertEqual(len(w), 1)
            self.assertTrue(issubclass(w[-1].category, RuntimeWarning))
            self.assertTrue("not p.d." in str(w[-1].message))
        self.assertEqual(b_samples.shape, torch.Size([4, 2, 3, 2]))
def posterior(
    self,
    X: Tensor,
    output_indices: Optional[List[int]] = None,
    observation_noise: Union[bool, Tensor] = False,
    **kwargs: Any,
) -> GPyTorchPosterior:
    self.eval()  # make sure we're calling a posterior
    # input transforms are applied at `posterior` in `eval` mode, and at
    # `model.forward()` at training time
    X = self.transform_inputs(X)
    no_pred_variance = skip_posterior_variances._state

    with ExitStack() as es:
        es.enter_context(gpt_posterior_settings())
        es.enter_context(fast_pred_var(True))
        # we need to skip posterior variances here
        es.enter_context(skip_posterior_variances(True))
        mvn = self(X)
        if observation_noise is not False:
            # TODO: ensure that this still works for structured noise solves.
            mvn = self.likelihood(mvn, X)

        # lazy covariance matrix includes the interpolated version of the full
        # covariance matrix so we can actually grab that instead.
        if X.ndimension() > self.train_inputs[0].ndimension():
            X_batch_shape = X.shape[:-2]
            train_inputs = self.train_inputs[0].reshape(
                *[1] * len(X_batch_shape), *self.train_inputs[0].shape
            )
            train_inputs = train_inputs.repeat(
                *X_batch_shape, *[1] * self.train_inputs[0].ndimension()
            )
        else:
            train_inputs = self.train_inputs[0]

        # we now compute the data covariances for the training data, the testing
        # data, the joint covariances, and the test train cross-covariance
        train_train_covar = self.prediction_strategy.lik_train_train_covar.detach()
        base_train_train_covar = train_train_covar.lazy_tensor

        data_train_covar = base_train_train_covar.lazy_tensors[0]
        data_covar = self.covar_modules[0]
        data_train_test_covar = data_covar(X, train_inputs)
        data_test_test_covar = data_covar(X)
        data_joint_covar = data_train_covar.cat_rows(
            cross_mat=data_train_test_covar,
            new_mat=data_test_test_covar,
        )

        # we detach the latents so that they don't cause gradient errors
        # TODO: Can we enable backprop through the latent covariances?
        batch_shape = data_train_test_covar.batch_shape
        latent_covar_list = []
        for latent_covar in base_train_train_covar.lazy_tensors[1:]:
            if latent_covar.batch_shape != batch_shape:
                latent_covar = BatchRepeatLazyTensor(latent_covar, batch_shape)
            latent_covar_list.append(latent_covar.detach())

        joint_covar = KroneckerProductLazyTensor(data_joint_covar, *latent_covar_list)
        test_train_covar = KroneckerProductLazyTensor(
            data_train_test_covar, *latent_covar_list
        )

        # compute the posterior variance if necessary
        if no_pred_variance:
            pred_variance = mvn.variance
        else:
            pred_variance = self.make_posterior_variances(joint_covar)

        # mean and variance get reshaped into the target shape
        new_mean = mvn.mean.reshape(*X.shape[:-1], *self.target_shape)
        if not no_pred_variance:
            new_variance = pred_variance.reshape(*X.shape[:-1], *self.target_shape)
            new_variance = DiagLazyTensor(new_variance)
        else:
            new_variance = ZeroLazyTensor(
                *X.shape[:-1], *self.target_shape, self.target_shape[-1]
            )

        mvn = MultivariateNormal(new_mean, new_variance)

        # return a specialized Posterior to allow for sampling
        # cloning the full covar allows backpropagation through it
        posterior = HigherOrderGPPosterior(
            mvn=mvn,
            train_targets=self.train_targets.unsqueeze(-1),
            train_train_covar=train_train_covar,
            test_train_covar=test_train_covar,
            joint_covariance_matrix=joint_covar.clone(),
            output_shape=X.shape[:-1] + self.target_shape,
            num_outputs=self._num_outputs,
        )

        if hasattr(self, "outcome_transform"):
            posterior = self.outcome_transform.untransform_posterior(posterior)
        return posterior
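# Illustrative usage of the `posterior` method above (not part of the original
# source): `model` is assumed to be an already-fitted HigherOrderGP-style model
# with training inputs of shape `n x d` and targets of shape `n x *target_shape`.
test_X = torch.rand(5, model.train_inputs[0].shape[-1])
with torch.no_grad():
    post = model.posterior(test_X)
    print(post.mean.shape)                       # 5 x *target_shape
    samples = post.rsample(torch.Size([16]))     # 16 x 5 x *target_shape
    print(samples.shape)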
def forward(self, state_input: Tensor) -> MultivariateNormal:
    """Forward call of GP class."""
    mean_x = self.mean_module(state_input)
    covar_x = self.covar_module(state_input)
    return MultivariateNormal(mean_x, covar_x)
def forward(self, x: torch.Tensor) -> MultivariateNormal:
    mean_x = self.mean_module(x)
    covar_x = self.covar_module(x)
    return MultivariateNormal(mean_x, covar_x)
def test_added_diag_lt(self, N=10000, p=20, use_cuda=False, seed=1):
    torch.manual_seed(seed)
    if torch.cuda.is_available() and use_cuda:
        print("Using cuda")
        device = torch.device("cuda")
        torch.cuda.manual_seed_all(seed)
    else:
        device = torch.device("cpu")

    D = torch.randn(N, p, device=device)
    A = torch.randn(N, device=device).abs() * 1e-3 + 0.1

    # this is a lazy tensor for DD'
    D_lt = RootLazyTensor(D)
    # this is a lazy tensor for diag(A)
    diag_term = DiagLazyTensor(A)
    # DD' + diag(A)
    lowrank_pdiag_lt = AddedDiagLazyTensor(diag_term, D_lt)

    # z ~ N(0, I), mean = 1
    z = torch.randn(N, device=device)
    mean = torch.ones(N, device=device)
    diff = mean - z

    print(lowrank_pdiag_lt.log_det())
    logdet = lowrank_pdiag_lt.log_det()

    inv_matmul = lowrank_pdiag_lt.inv_matmul(diff.unsqueeze(1)).squeeze(1)
    inv_matmul_quad = torch.dot(diff, inv_matmul)

    # inv_matmul_quad_qld, logdet_qld = lowrank_pdiag_lt.inv_quad_log_det(
    #     inv_quad_rhs=diff.unsqueeze(1), log_det=True)

    # from gpytorch.functions._inv_quad_log_det import InvQuadLogDet
    # iqld_construct = InvQuadLogDet(
    #     gpytorch.lazy.lazy_tensor_representation_tree.LazyTensorRepresentationTree(
    #         lowrank_pdiag_lt),
    #     matrix_shape=lowrank_pdiag_lt.matrix_shape,
    #     dtype=lowrank_pdiag_lt.dtype,
    #     device=lowrank_pdiag_lt.device,
    #     inv_quad=True,
    #     log_det=True,
    #     preconditioner=lowrank_pdiag_lt._preconditioner()[0],
    #     log_det_correction=lowrank_pdiag_lt._preconditioner()[1])
    # inv_matmul_quad_qld, logdet_qld = iqld_construct(diff.unsqueeze(1))

    num_random_probes = gpytorch.settings.num_trace_samples.value()
    probe_vectors = torch.empty(
        lowrank_pdiag_lt.matrix_shape[-1],
        num_random_probes,
        dtype=lowrank_pdiag_lt.dtype,
        device=lowrank_pdiag_lt.device,
    )
    probe_vectors.bernoulli_().mul_(2).add_(-1)
    probe_vector_norms = torch.norm(probe_vectors, 2, dim=-2, keepdim=True)
    probe_vectors = probe_vectors.div(probe_vector_norms)

    # diff_norm = diff.norm()
    # diff = diff / diff_norm
    rhs = torch.cat([diff.unsqueeze(1), probe_vectors], dim=1)
    solves, t_mat = gpytorch.utils.linear_cg(
        lowrank_pdiag_lt.matmul,
        rhs,
        n_tridiag=num_random_probes,
        max_iter=gpytorch.settings.max_cg_iterations.value(),
        max_tridiag_iter=gpytorch.settings.max_lanczos_quadrature_iterations.value(),
        preconditioner=lowrank_pdiag_lt._preconditioner()[0],
    )
    # print(solves)
    inv_matmul_qld = solves[:, 0]  # * diff_norm

    diff_solve = gpytorch.utils.linear_cg(
        lowrank_pdiag_lt.matmul,
        diff.unsqueeze(1),
        max_iter=gpytorch.settings.max_cg_iterations.value(),
        preconditioner=lowrank_pdiag_lt._preconditioner()[0],
    )
    print("diff_solve_norm: ", diff_solve.norm())
    print(
        "diff between multiple linear_cg: ",
        (inv_matmul_qld.unsqueeze(1) - diff_solve).norm() / diff_solve.norm(),
    )

    eigenvalues, eigenvectors = gpytorch.utils.lanczos.lanczos_tridiag_to_diag(t_mat)
    slq = gpytorch.utils.StochasticLQ()
    (log_det_term,) = slq.evaluate(
        lowrank_pdiag_lt.matrix_shape,
        eigenvalues,
        eigenvectors,
        [lambda x: x.log()],
    )
    logdet_qld = log_det_term + lowrank_pdiag_lt._preconditioner()[1]

    print("Log det difference: ", (logdet - logdet_qld).norm() / logdet.norm())
    print(
        "inv matmul difference: ",
        (inv_matmul - inv_matmul_qld).norm() / inv_matmul_quad.norm(),
    )

    # N(1, DD' + diag(A))
    lazydist = MultivariateNormal(mean, lowrank_pdiag_lt)
    lazy_lprob = lazydist.log_prob(z)
    # exact log probability with Cholesky decomposition
    exact_dist = torch.distributions.MultivariateNormal(
        mean, lowrank_pdiag_lt.evaluate().float()
    )
    exact_lprob = exact_dist.log_prob(z)

    print(lazy_lprob, exact_lprob)
    rel_error = torch.norm(lazy_lprob - exact_lprob) / exact_lprob.norm()
    self.assertLess(rel_error.cpu().item(), 0.01)
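# The core comparison in the test above -- the log-probability of a Gaussian
# with lazily represented covariance DD' + diag(A) versus an exact dense
# computation -- can be reproduced in a few lines. Minimal sketch under the
# same (older) GPyTorch lazy-tensor API the test assumes, with a small N so
# the dense path is cheap.
import torch
from gpytorch.distributions import MultivariateNormal
from gpytorch.lazy import AddedDiagLazyTensor, DiagLazyTensor, RootLazyTensor

torch.manual_seed(0)
N, p = 200, 5
D = torch.randn(N, p)
A = torch.randn(N).abs() * 1e-3 + 0.1
lowrank_pdiag = AddedDiagLazyTensor(DiagLazyTensor(A), RootLazyTensor(D))  # DD' + diag(A)

mean = torch.ones(N)
z = torch.randn(N)
lazy_lprob = MultivariateNormal(mean, lowrank_pdiag).log_prob(z)
exact_lprob = torch.distributions.MultivariateNormal(
    mean, lowrank_pdiag.evaluate()
).log_prob(z)
print((lazy_lprob - exact_lprob).abs() / exact_lprob.abs())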
def forward(self, *inputs: Tensor, **kwargs
            ) -> Tuple[List[MultivariateNormal], Tensor]:
    """Forward propagate the model.

    Parameters
    ----------
    inputs: Tensor.
        output_sequence: Tensor.
            Tensor of output data [batch_size x sequence_length x dim_outputs].
        input_sequence: Tensor.
            Tensor of input data [batch_size x sequence_length x dim_inputs].

    Returns
    -------
    output_distribution: List[MultivariateNormal].
        List of length sequence_length of distributions of size
        [batch_size x dim_outputs x num_particles].
    loss: Tensor.
        Scalar training loss selected by `self.loss_key`.
    """
    output_sequence, input_sequence = inputs
    num_particles = self.num_particles
    # dim_states = self.dim_states
    batch_size, sequence_length, dim_inputs = input_sequence.shape
    _, _, dim_outputs = output_sequence.shape

    ############################################################################
    # SAMPLE GP                                                                #
    ############################################################################
    self.forward_model.resample()
    self.backward_model.resample()

    ############################################################################
    # PERFORM Backward Pass                                                    #
    ############################################################################
    if self.training:
        output_distribution = self.backward(output_sequence, input_sequence)

    ############################################################################
    # Initial State                                                            #
    ############################################################################
    state = self.recognition(
        output_sequence[:, :self.recognition.length],
        input_sequence[:, :self.recognition.length],
        num_particles=num_particles,
    )

    ############################################################################
    # PREDICT Outputs                                                          #
    ############################################################################
    outputs = []
    y_pred = self.emissions(state)
    outputs.append(
        MultivariateNormal(y_pred.loc.detach(), y_pred.covariance_matrix.detach())
    )

    ############################################################################
    # INITIALIZE losses                                                        #
    ############################################################################
    # entropy = torch.tensor(0.)
    if self.training:
        output_distribution.pop(0)
        # entropy += y_tilde.entropy().mean() / sequence_length

    y = output_sequence[:, 0].expand(
        num_particles, batch_size, dim_outputs).permute(1, 2, 0)
    log_lik = y_pred.log_prob(y).sum(dim=1).mean()  # type: torch.Tensor
    l2 = ((y_pred.loc - y) ** 2).sum(dim=1).mean()  # type: torch.Tensor
    kl_cond = torch.tensor(0.)

    for t in range(sequence_length - 1):
        ########################################################################
        # PREDICT Next State                                                   #
        ########################################################################
        u = input_sequence[:, t].expand(num_particles, batch_size, dim_inputs)
        u = u.permute(1, 2, 0)  # Move last component to end.
        state_samples = state.rsample()
        state_input = torch.cat((state_samples, u), dim=1)
        next_f = self.forward_model(state_input)
        next_state = self.transitions(next_f)
        next_state.loc += state_samples
        if self.independent_particles:
            next_state = diagonal_covariance(next_state)

        ########################################################################
        # CONDITION Next State                                                 #
        ########################################################################
        if self.training:
            y_tilde = output_distribution.pop(0)
            p_next_state = next_state
            next_state = self._condition(next_state, y_tilde)
            kl_cond += kl_divergence(next_state, p_next_state).mean()

        ########################################################################
        # RESAMPLE State                                                       #
        ########################################################################
        state = next_state

        ########################################################################
        # PREDICT Outputs                                                      #
        ########################################################################
        y_pred = self.emissions(state)
        outputs.append(y_pred)

        ########################################################################
        # COMPUTE Losses                                                       #
        ########################################################################
        y = output_sequence[:, t + 1].expand(
            num_particles, batch_size, dim_outputs).permute(1, 2, 0)
        log_lik += y_pred.log_prob(y).sum(dim=1).mean()
        l2 += ((y_pred.loc - y) ** 2).sum(dim=1).mean()
        # entropy += y_tilde.entropy().mean() / sequence_length

    assert len(outputs) == sequence_length
    # if self.training:
    #     del output_distribution

    ############################################################################
    # Compute model KL Divergences                                             #
    ############################################################################
    factor = 1  # batch_size / self.dataset_size
    kl_uf = self.forward_model.kl_divergence()
    kl_ub = self.backward_model.kl_divergence()
    if self.forward_model.independent:
        kl_uf *= sequence_length
    if self.backward_model.independent:
        kl_ub *= sequence_length

    kl_cond = kl_cond * self.loss_factors['kl_conditioning'] * factor
    kl_ub = kl_ub * self.loss_factors['kl_u'] * factor
    kl_uf = kl_uf * self.loss_factors['kl_u'] * factor

    if self.loss_key.lower() == 'loglik':
        loss = -log_lik
    elif self.loss_key.lower() == 'elbo':
        loss = -(log_lik - kl_uf - kl_ub - kl_cond)
        if kwargs.get('print', False):
            str_ = 'elbo: {}, log_lik: {}, kluf: {}, klub: {}, klcond: {}'
            print(str_.format(loss.item(), log_lik.item(), kl_uf.item(),
                              kl_ub.item(), kl_cond.item()))
    elif self.loss_key.lower() == 'l2':
        loss = l2
    elif self.loss_key.lower() == 'rmse':
        loss = torch.sqrt(l2)
    else:
        raise NotImplementedError("Key {} not implemented".format(self.loss_key))

    return outputs, loss
def forward(self, input):
    mean = self.mean_module(input)
    covar = self.covar_module(input)
    return MultivariateNormal(mean, covar)
def test_GPyTorchPosterior(self):
    for dtype in (torch.float, torch.double):
        n = 3
        mean = torch.rand(n, dtype=dtype, device=self.device)
        variance = 1 + torch.rand(n, dtype=dtype, device=self.device)
        covar = variance.diag()
        mvn = MultivariateNormal(mean, lazify(covar))
        posterior = GPyTorchPosterior(mvn=mvn)
        # basics
        self.assertEqual(posterior.device.type, self.device.type)
        self.assertTrue(posterior.dtype == dtype)
        self.assertEqual(posterior.event_shape, torch.Size([n, 1]))
        self.assertTrue(torch.equal(posterior.mean, mean.unsqueeze(-1)))
        self.assertTrue(torch.equal(posterior.variance, variance.unsqueeze(-1)))
        # rsample
        samples = posterior.rsample()
        self.assertEqual(samples.shape, torch.Size([1, n, 1]))
        for sample_shape in ([4], [4, 2]):
            samples = posterior.rsample(sample_shape=torch.Size(sample_shape))
            self.assertEqual(samples.shape, torch.Size(sample_shape + [n, 1]))
        # check enabling of approximate root decomposition
        with ExitStack() as es:
            mock_func = es.enter_context(
                mock.patch(
                    ROOT_DECOMP_PATH, return_value=torch.linalg.cholesky(covar)
                )
            )
            es.enter_context(gpt_settings.max_cholesky_size(0))
            es.enter_context(
                gpt_settings.fast_computations(covar_root_decomposition=True)
            )
            # need to clear cache, cannot re-use previous objects
            mvn = MultivariateNormal(mean, lazify(covar))
            posterior = GPyTorchPosterior(mvn=mvn)
            posterior.rsample(sample_shape=torch.Size([4]))
            mock_func.assert_called_once()
        # rsample w/ base samples
        base_samples = torch.randn(4, 3, 1, device=self.device, dtype=dtype)
        # incompatible shapes
        with self.assertRaises(RuntimeError):
            posterior.rsample(
                sample_shape=torch.Size([3]), base_samples=base_samples
            )
        # ensure consistent result
        for sample_shape in ([4], [4, 2]):
            base_samples = torch.randn(
                *sample_shape, 3, 1, device=self.device, dtype=dtype
            )
            samples = [
                posterior.rsample(
                    sample_shape=torch.Size(sample_shape), base_samples=base_samples
                )
                for _ in range(2)
            ]
            self.assertTrue(torch.allclose(*samples))
        # collapse_batch_dims
        b_mean = torch.rand(2, 3, dtype=dtype, device=self.device)
        b_variance = 1 + torch.rand(2, 3, dtype=dtype, device=self.device)
        b_covar = torch.diag_embed(b_variance)
        b_mvn = MultivariateNormal(b_mean, lazify(b_covar))
        b_posterior = GPyTorchPosterior(mvn=b_mvn)
        b_base_samples = torch.randn(4, 1, 3, 1, device=self.device, dtype=dtype)
        b_samples = b_posterior.rsample(
            sample_shape=torch.Size([4]), base_samples=b_base_samples
        )
        self.assertEqual(b_samples.shape, torch.Size([4, 2, 3, 1]))
def forward(self, input):
    mean = self.mean_module(input)
    covar = self.covar_module(input)
    return MultitaskMultivariateNormal.from_batch_mvn(
        MultivariateNormal(mean, covar)
    )
def forward(self, x):
    mean = self.mean(x)
    covar = self.covariance(x)
    return MultivariateNormal(mean, covar)
def forward(self, x):
    x_mean = self.mean(x)
    x_covar = self.covar(x)
    return MultivariateNormal(x_mean, x_covar)
def gpnet(args, dataloader, test_x, prior_gp):
    N = len(dataloader.dataset)
    x_dim = 1
    prior_gp.train()

    if args.net == 'tangent':
        kernel = prior_gp.covar_module
        bnn_prev = FirstOrder([x_dim] + [args.n_hidden] * args.n_layer, mvn=False)
        bnn = FirstOrder([x_dim] + [args.n_hidden] * args.n_layer, mvn=True)
    elif args.net == 'deep':
        kernel = prior_gp.covar_module
        bnn_prev = DeepKernel([x_dim] + [args.n_hidden] * args.n_layer, mvn=False)
        bnn = DeepKernel([x_dim] + [args.n_hidden] * args.n_layer, mvn=True)
    elif args.net == 'rf':
        kernel = ScaleKernel(RBFKernel())
        kernel_prev = ScaleKernel(RBFKernel())
        bnn_prev = RFExpansion(x_dim, args.n_hidden, kernel_prev, mvn=False,
                               fix_ls=args.fix_rf_ls, residual=args.residual)
        bnn = RFExpansion(x_dim, args.n_hidden, kernel,
                          fix_ls=args.fix_rf_ls, residual=args.residual)
        bnn_prev.load_state_dict(bnn.state_dict())
    else:
        raise NotImplementedError('Unknown inference net')

    bnn = bnn.to(args.device)
    bnn_prev = bnn_prev.to(args.device)
    prior_gp = prior_gp.to(args.device)

    infer_gpnet_optimizer = optim.Adam(bnn.parameters(), lr=args.learning_rate)
    hyper_opt_optimizer = optim.Adam(prior_gp.parameters(), lr=args.hyper_rate)

    x_min, x_max = dataloader.dataset.range

    bnn.train()
    bnn_prev.train()
    prior_gp.train()

    mb = master_bar(range(1, args.n_iters + 1))
    for t in mb:
        # Hyperparameter selection
        beta = args.beta0 * 1. / (1. + args.gamma * math.sqrt(t - 1))
        dl_bar = progress_bar(dataloader, parent=mb)
        for x, y in dl_bar:
            observed_size = x.size(0)
            x, y = x.to(args.device), y.to(args.device)
            x_star = torch.Tensor(args.measurement_size,
                                  x_dim).uniform_(x_min, x_max).to(args.device)
            # [Batch + Measurement Points x x_dims]
            xx = torch.cat([x, x_star], 0)

            infer_gpnet_optimizer.zero_grad()
            hyper_opt_optimizer.zero_grad()

            # inference net
            # Eq.(6) Prior p(f): \mu_1 = 0, \Sigma_1
            mean_prior = torch.zeros(observed_size).to(args.device)
            K_prior = kernel(xx, xx).add_jitter(1e-6)

            # q_{\gamma_t}(f_M, f_n) = Normal(\mu_2, \Sigma_2 | x_n, x_m)
            qff_mean_prev, K_prox = bnn_prev(xx)

            # Eq.(8) adapt prior: p(f)^\beta x q(f)^{1 - \beta}
            mean_adapt, K_adapt = product_gaussians(
                mu1=mean_prior, sigma1=K_prior,
                mu2=qff_mean_prev, sigma2=K_prox, beta=beta)

            # Eq.(8)
            (mean_n, mean_m), (Knn, Knm, Kmm) = split_gaussian(
                mean_adapt, K_adapt, observed_size)

            # Eq.(2) K_{D,D} + noise / (N \beta_t)
            Ky = Knn + torch.eye(observed_size).to(args.device) * \
                prior_gp.likelihood.noise / (N / observed_size * beta)
            Ky_tril = torch.cholesky(Ky)

            # Eq.(2)
            mean_target = Knm.t().mm(cholesky_solve(y - mean_n, Ky_tril)) + mean_m
            mean_target = mean_target.squeeze(-1)
            K_target = gpytorch.add_jitter(
                Kmm - Knm.t().mm(cholesky_solve(Knm, Ky_tril)), 1e-6)

            # \hat{q}_{t+1}(f_M)
            target_pf_star = MultivariateNormal(mean_target, K_target)
            # q_\gamma(f_M)
            qf_star = bnn(x_star)

            # Eq.(11)
            kl_obj = kl_div(qf_star, target_pf_star).sum()
            kl_obj.backward(retain_graph=True)
            infer_gpnet_optimizer.step()

            # Hyperparameter update
            (mean_n_prior, _), (Kn_prior, _, _) = split_gaussian(
                mean_prior, K_prior, observed_size)
            pf = MultivariateNormal(mean_n_prior, Kn_prior)
            (qf_prev_mean, _), (Kn_prox, _, _) = split_gaussian(
                qff_mean_prev, K_prox, observed_size)
            qf_prev = MultivariateNormal(qf_prev_mean, Kn_prox)
            hyper_obj = -(prior_gp.likelihood.expected_log_prob(
                y.squeeze(-1), qf_prev) - kl_div(qf_prev, pf))
            hyper_obj.backward(retain_graph=True)
            hyper_opt_optimizer.step()

            mb.child.comment = "kl_obj = {:.3f}, obs_var={:.3f}".format(
                kl_obj.item(), prior_gp.likelihood.noise.item())

        # update q_{\gamma_t} to q_{\gamma_{t+1}}
        bnn_prev.load_state_dict(bnn.state_dict())
        if args.net == 'rf':
            kernel_prev.load_state_dict(kernel.state_dict())

        if t % 50 == 0:
            mb.write("Iter {}/{}, kl_obj = {:.4f}, noise = {:.4f}".format(
                t, args.n_iters, kl_obj.item(), prior_gp.likelihood.noise.item()))

    test_x = test_x.to(args.device)
    test_stats = evaluate(bnn, prior_gp.likelihood, test_x, args.net == 'tangent')
    return test_stats
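# For reference, the `Ky` / `mean_target` / `K_target` computation in the loop
# above (the Eq.(2) target) is an ordinary Gaussian conditional with the
# likelihood noise scaled by N / batch_size * beta. A dense-tensor sketch of
# that step, with purely illustrative names and toy shapes:
import torch

def conditional_target(Knn, Knm, Kmm, mean_n, mean_m, y, noise, nbeta):
    """Gaussian conditional with noise scaled by nbeta = N / batch_size * beta."""
    Ky = Knn + torch.eye(Knn.size(0)) * noise / nbeta
    L = torch.linalg.cholesky(Ky)
    mean_t = Knm.t() @ torch.cholesky_solve(y - mean_n, L) + mean_m
    K_t = Kmm - Knm.t() @ torch.cholesky_solve(Knm, L)
    return mean_t.squeeze(-1), K_t

# toy check: 3 observed points, 2 measurement points
mean_t, K_t = conditional_target(
    Knn=torch.eye(3) * 0.5,
    Knm=torch.rand(3, 2) * 0.1,
    Kmm=torch.eye(2),
    mean_n=torch.zeros(3, 1),
    mean_m=torch.zeros(2, 1),
    y=torch.randn(3, 1),
    noise=0.1,
    nbeta=10.0,
)
print(mean_t.shape, K_t.shape)   # torch.Size([2]) torch.Size([2, 2])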
def _get_test_posterior(device, dtype=torch.float):
    mean = torch.zeros(2, device=device, dtype=dtype)
    cov = torch.eye(2, device=device, dtype=dtype)
    mvn = MultivariateNormal(mean, cov)
    return GPyTorchPosterior(mvn)
def gpnet_nonconj(args, dataloader, test_x, prior_gp):
    N = len(dataloader.dataset)
    x_dim = 1
    prior_gp.train()

    if args.net == 'tangent':
        kernel = prior_gp.covar_module
        bnn_prev = FirstOrder([x_dim] + [args.n_hidden] * args.n_layer, mvn=False)
        bnn = FirstOrder([x_dim] + [args.n_hidden] * args.n_layer, mvn=True)
    elif args.net == 'deep':
        kernel = prior_gp.covar_module
        bnn_prev = DeepKernel([x_dim] + [args.n_hidden] * args.n_layer, mvn=False)
        bnn = DeepKernel([x_dim] + [args.n_hidden] * args.n_layer, mvn=True)
    elif args.net == 'rf':
        kernel = ScaleKernel(RBFKernel())
        kernel_prev = ScaleKernel(RBFKernel())
        bnn_prev = RFExpansion(x_dim, args.n_hidden, kernel_prev, mvn=False,
                               fix_ls=args.fix_rf_ls, residual=args.residual)
        bnn = RFExpansion(x_dim, args.n_hidden, kernel,
                          fix_ls=args.fix_rf_ls, residual=args.residual)
        bnn_prev.load_state_dict(bnn.state_dict())
    else:
        raise NotImplementedError('Unknown inference net')

    infer_gpnet_optimizer = optim.Adam(bnn.parameters(), lr=args.learning_rate)
    hyper_opt_optimizer = optim.Adam(prior_gp.parameters(), lr=args.hyper_rate)

    x_min, x_max = dataloader.dataset.range
    n = dataloader.batch_size

    bnn.train()
    bnn_prev.train()
    prior_gp.train()

    mb = master_bar(range(1, args.n_iters + 1))
    for t in mb:
        beta = args.beta0 * 1. / (1. + args.gamma * math.sqrt(t - 1))
        dl_bar = progress_bar(dataloader, parent=mb)
        for x, y in dl_bar:
            n = x.size(0)
            x_star = torch.Tensor(args.measurement_size,
                                  x_dim).uniform_(x_min, x_max)
            xx = torch.cat([x, x_star], 0)

            # inference net
            infer_gpnet_optimizer.zero_grad()
            hyper_opt_optimizer.zero_grad()

            qff = bnn(xx)
            qff_mean_prev, K_prox = bnn_prev(xx)
            qf_mean, qf_var = bnn(x, full_cov=False)

            # Eq.(8)
            K_prior = kernel(xx, xx).add_jitter(1e-6)
            pff = MultivariateNormal(torch.zeros(xx.size(0)), K_prior)

            f_term = torch.sum(
                expected_log_prob(prior_gp.likelihood, qf_mean, qf_var,
                                  y.squeeze(-1)))
            f_term *= N / x.size(0) * beta
            prior_term = -beta * cross_entropy(qff, pff)

            qff_prev = MultivariateNormal(qff_mean_prev, K_prox)
            prox_term = -(1 - beta) * cross_entropy(qff, qff_prev)
            entropy_term = entropy(qff)

            lower_bound = f_term + prior_term + prox_term + entropy_term
            loss = -lower_bound / n
            loss.backward(retain_graph=True)
            infer_gpnet_optimizer.step()

            # Hyper-parameter update
            Kn_prior = K_prior[:n, :n]
            pf = MultivariateNormal(torch.zeros(n), Kn_prior)
            Kn_prox = K_prox[:n, :n]
            qf_prev_mean = qff_mean_prev[:n]
            qf_prev_var = torch.diagonal(Kn_prox)
            qf_prev = MultivariateNormal(qf_prev_mean, Kn_prior)
            hyper_obj = expected_log_prob(
                prior_gp.likelihood, qf_prev_mean, qf_prev_var,
                y.squeeze(-1)).sum() - kl_div(qf_prev, pf)
            hyper_obj = -hyper_obj
            hyper_obj.backward()
            hyper_opt_optimizer.step()

        bnn_prev.load_state_dict(bnn.state_dict())
        if args.net == 'rf':
            kernel_prev.load_state_dict(kernel.state_dict())

        if t % 50 == 0:
            mb.write("Iter {}/{}, kl_obj = {:.4f}, noise = {:.4f}".format(
                t, args.n_iters, lower_bound.item(),
                prior_gp.likelihood.noise.item()))

    test_x = test_x.to(args.device)
    test_stats = evaluate(bnn, prior_gp.likelihood, test_x, args.net == 'tangent')
    return test_stats
def forward(self, x):
    features = self.feature_extractor(x)
    mean_x = self.mean_module(features)
    covar_x = self.covar_module(features)
    return MultivariateNormal(mean_x, covar_x)
def posterior(
    self,
    X: Tensor,
    output_indices: Optional[List[int]] = None,
    observation_noise: Union[bool, Tensor] = False,
    **kwargs: Any,
) -> GPyTorchPosterior:
    r"""Computes the posterior over model outputs at the provided points.

    Args:
        X: A `(batch_shape) x q x d`-dim Tensor, where `d` is the dimension
            of the feature space and `q` is the number of points considered
            jointly.
        output_indices: A list of indices, corresponding to the outputs over
            which to compute the posterior (if the model is multi-output).
            Can be used to speed up computation if only a subset of the
            model's outputs are required for optimization. If omitted,
            computes the posterior over all model outputs.
        observation_noise: If True, add the observation noise from the
            likelihood to the posterior. If a Tensor, use it directly as the
            observation noise (must be of shape `(batch_shape) x q x m`).

    Returns:
        A `GPyTorchPosterior` object, representing `batch_shape` joint
        distributions over `q` points and the outputs selected by
        `output_indices` each. Includes observation noise if specified.
    """
    self.eval()  # make sure model is in eval mode
    with gpt_posterior_settings():
        # insert a dimension for the output dimension
        if self._num_outputs > 1:
            X, output_dim_idx = add_output_dim(
                X=X, original_batch_shape=self._input_batch_shape
            )
        mvn = self(X)
        if observation_noise is not False:
            if torch.is_tensor(observation_noise):
                # TODO: Validate noise shape
                # make observation_noise `batch_shape x q x n`
                obs_noise = observation_noise.transpose(-1, -2)
                mvn = self.likelihood(mvn, X, noise=obs_noise)
            elif isinstance(self.likelihood, FixedNoiseGaussianLikelihood):
                # Use the mean of the previous noise values (TODO: be smarter here).
                noise = self.likelihood.noise.mean().expand(X.shape[:-1])
                mvn = self.likelihood(mvn, X, noise=noise)
            else:
                mvn = self.likelihood(mvn, X)
        if self._num_outputs > 1:
            mean_x = mvn.mean
            covar_x = mvn.covariance_matrix
            output_indices = output_indices or range(self._num_outputs)
            mvns = [
                MultivariateNormal(
                    mean_x.select(dim=output_dim_idx, index=t),
                    lazify(covar_x.select(dim=output_dim_idx, index=t)),
                )
                for t in output_indices
            ]
            mvn = MultitaskMultivariateNormal.from_independent_mvns(mvns=mvns)

    posterior = GPyTorchPosterior(mvn=mvn)
    if hasattr(self, "outcome_transform"):
        posterior = self.outcome_transform.untransform_posterior(posterior)
    return posterior
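# Usage illustration for the `posterior` interface documented above (not part
# of the original file). A `SingleTaskGP` is used purely to exercise the API;
# the hyperparameters are left untrained, which is fine for a shape check.
import torch
from botorch.models import SingleTaskGP

train_X = torch.rand(20, 3, dtype=torch.double)
train_Y = torch.sin(train_X).sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)

test_X = torch.rand(4, 5, 3, dtype=torch.double)   # batch_shape=4, q=5, d=3
post = model.posterior(test_X)                     # noiseless posterior
post_noisy = model.posterior(test_X, observation_noise=True)
print(post.mean.shape, post.variance.shape)        # both 4 x 5 x 1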
def _initialize_latents(
    self,
    latent_init: str,
    num_latent_dims: List[int],
    learn_latent_pars: bool,
    device: torch.device,
    dtype: torch.dtype,
):
    self.latent_parameters = ParameterList()
    if latent_init == "default":
        for dim_num in range(len(self.covar_modules) - 1):
            self.latent_parameters.append(
                Parameter(
                    torch.rand(
                        *self._aug_batch_shape,
                        self.target_shape[dim_num],
                        num_latent_dims[dim_num],
                        device=device,
                        dtype=dtype,
                    ),
                    requires_grad=learn_latent_pars,
                )
            )
    elif latent_init == "gp":
        for dim_num, covar in enumerate(self.covar_modules[1:]):
            latent_covar = covar(
                torch.linspace(
                    0.0,
                    1.0,
                    self.target_shape[dim_num],
                    device=device,
                    dtype=dtype,
                )
            ).add_jitter(1e-4)
            latent_dist = MultivariateNormal(
                torch.zeros(self.target_shape[dim_num], device=device, dtype=dtype),
                latent_covar,
            )
            sample_shape = torch.Size(
                (*self._aug_batch_shape, num_latent_dims[dim_num])
            )
            latent_sample = latent_dist.sample(sample_shape=sample_shape)
            latent_sample = latent_sample.reshape(
                *self._aug_batch_shape,
                self.target_shape[dim_num],
                num_latent_dims[dim_num],
            )
            self.latent_parameters.append(
                Parameter(latent_sample, requires_grad=learn_latent_pars)
            )
            self.register_prior(
                "latent_parameters_" + str(dim_num),
                MultivariateNormalPrior(
                    latent_dist.loc,
                    latent_dist.covariance_matrix.detach().clone(),
                ),
                lambda module, dim_num=dim_num: self.latent_parameters[dim_num],
            )
def forward(self, x):
    x = map_box_ball(x, self.dim)
    mean_x = self.mean_module(x)
    covar_x = self.covar_module(x)
    return MultivariateNormal(mean_x, covar_x)
def _create_marginal_input(self, batch_shape=torch.Size()):
    mat = torch.randn(*batch_shape, 5, 5)
    eye = torch.diag_embed(torch.ones(*batch_shape, 5))
    return MultivariateNormal(
        torch.randn(*batch_shape, 5), mat @ mat.transpose(-1, -2) + eye
    )
def forward(self, x): """ApproximateGPModelのforwardメソッド """ mean_x = self.mean_module(x) covar_x = self.covar_module(x) return MultivariateNormal(mean_x, covar_x)
def forward(self, x):
    mean_x = self.mean_module(x)
    covar_x = self.covar_module(x)
    return MultivariateNormal(mean_x, covar_x)
def _create_marginal_input(self, batch_shape=torch.Size([])):
    mat = torch.randn(*batch_shape, 6, 5, 5)
    return MultivariateNormal(
        torch.randn(*batch_shape, 6, 5), mat @ mat.transpose(-1, -2)
    )