def forward(self, x1, x2=None, diag=False, last_dim_is_batch=False, **params):
    # TODO: figure out how to have multiple outputs - I believe the relevant
    # method is num_outputs_per_input, but this also applies to the kernel
    # model itself.
    if len(x1.shape) == 3:
        x1 = x1[0]
    if x2 is not None:
        if len(x2.shape) == 3:
            x2 = x2[0]
    if len(x1.shape) == 2:
        x1 = x1.view(-1, *self.shape)

    # TODO: figure out how to have batched dimensional outputs
    if x2 is None:
        kernel = lazify(self.model(x1, diag=diag))
    else:
        x2 = x2.view(-1, *self.shape)
        kernel = lazify(self.model(x1, x2, diag=diag))

    res = BatchRepeatLazyTensor(kernel, batch_repeat=self.batch_shape)
    if last_dim_is_batch:
        res = res.permute(1, 2, 0)
    return res
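# Hedged usage sketch (not part of the module above, sizes are arbitrary
# assumptions): it illustrates how BatchRepeatLazyTensor repeats a single
# n x n lazy covariance across a batch shape, which is what the forward above
# relies on to emit a batched kernel from an unbatched model output.
import torch
from gpytorch.kernels import RBFKernel
from gpytorch.lazy import BatchRepeatLazyTensor, lazify

x = torch.randn(5, 3)                     # 5 points, 3 dims (assumed sizes)
base = lazify(RBFKernel()(x).evaluate())  # plain 5 x 5 covariance
batched = BatchRepeatLazyTensor(base, batch_repeat=torch.Size((4,)))
assert batched.shape == torch.Size((4, 5, 5))
# every batch entry is the same underlying matrix
assert torch.allclose(batched.evaluate()[0], base.evaluate())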
def create_lazy_tensor(self):
    toeplitz_column = torch.tensor(
        [[4, 0, 0, 1], [3, 0, -0.5, -1]], dtype=torch.float
    )
    toeplitz_column.detach_()
    return BatchRepeatLazyTensor(ToeplitzLazyTensor(toeplitz_column), torch.Size((3,)))
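# Hedged illustration for the fixtures in this file: ToeplitzLazyTensor(c)
# represents the symmetric Toeplitz matrix T with T[i, j] = c[|i - j|], so only
# the first column c is stored; BatchRepeatLazyTensor then repeats that matrix
# over the requested batch shape. The column below is taken from the fixture
# above.
import torch
from gpytorch.lazy import ToeplitzLazyTensor

c = torch.tensor([4.0, 0.0, 0.0, 1.0])
T = ToeplitzLazyTensor(c).evaluate()
expected = torch.tensor(
    [
        [4.0, 0.0, 0.0, 1.0],
        [0.0, 4.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0],
        [1.0, 0.0, 0.0, 4.0],
    ]
)
assert torch.allclose(T, expected)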
def forward(self, X: Tensor) -> MultivariateNormal:
    X = self.transform_inputs(X)

    covariance_list = []
    covariance_list.append(self.covar_modules[0](X))

    for cm, param in zip(self.covar_modules[1:], self.latent_parameters):
        covariance_list.append(cm(param))

    # check batch_shapes
    if covariance_list[0].batch_shape != covariance_list[1].batch_shape:
        for i in range(1, len(covariance_list)):
            cm = covariance_list[i]
            covariance_list[i] = BatchRepeatLazyTensor(
                cm, covariance_list[0].batch_shape
            )
    kronecker_covariance = KroneckerProductLazyTensor(*covariance_list)

    # TODO: expand options for the mean module via batch shaping?
    mean = torch.zeros(
        *covariance_list[0].batch_shape,
        kronecker_covariance.shape[-1],
        device=kronecker_covariance.device,
        dtype=kronecker_covariance.dtype,
    )
    return MultivariateNormal(mean, kronecker_covariance)
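# Hedged sketch (assumed sizes, not the model's actual covar_modules): the
# forward above builds its prior covariance as a Kronecker product of one
# covariance per output dimension, so the joint matrix has size
# prod(n_i) x prod(n_i) without ever being formed densely.
import torch
from gpytorch.lazy import KroneckerProductLazyTensor, lazify

data_covar = lazify(torch.eye(4) + 0.1)    # 4 x 4, e.g. over the inputs X
latent_covar = lazify(torch.eye(3) + 0.2)  # 3 x 3, e.g. over a latent parameter
kron = KroneckerProductLazyTensor(data_covar, latent_covar)
assert kron.shape == torch.Size((12, 12))  # 4 * 3
# matches the dense Kronecker product
assert torch.allclose(
    kron.evaluate(), torch.kron(data_covar.evaluate(), latent_covar.evaluate())
)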
def posterior(
    self,
    X: Tensor,
    output_indices: Optional[List[int]] = None,
    observation_noise: Union[bool, Tensor] = False,
    posterior_transform: Optional[PosteriorTransform] = None,
    **kwargs: Any,
) -> MultitaskGPPosterior:
    self.eval()

    if posterior_transform is not None:
        # this could be very costly, disallow for now
        raise NotImplementedError(
            "Posterior transforms currently not supported for "
            f"{self.__class__.__name__}"
        )

    X = self.transform_inputs(X)
    train_x = self.transform_inputs(self.train_inputs[0])

    # construct Ktt
    task_covar = self._task_covar_matrix
    task_rootlt = self._task_covar_matrix.root_decomposition(
        method="diagonalization"
    )
    task_root = task_rootlt.root
    if task_covar.batch_shape != X.shape[:-2]:
        task_covar = BatchRepeatLazyTensor(task_covar, batch_repeat=X.shape[:-2])
        task_root = BatchRepeatLazyTensor(
            lazify(task_root), batch_repeat=X.shape[:-2]
        )
    task_covar_rootlt = RootLazyTensor(task_root)

    # construct RR' \approx Kxx
    data_data_covar = self.train_full_covar.lazy_tensors[0]
    # populate the diagonalization caches for the root and inverse root
    # decomposition
    data_data_evals, data_data_evecs = data_data_covar.diagonalization()
    # pad the eigenvalue and eigenvectors with zeros if we are using lanczos
    if data_data_evecs.shape[-1] < data_data_evecs.shape[-2]:
        cols_to_add = data_data_evecs.shape[-2] - data_data_evecs.shape[-1]
        zero_evecs = torch.zeros(
            *data_data_evecs.shape[:-1],
            cols_to_add,
            dtype=data_data_evals.dtype,
            device=data_data_evals.device,
        )
        zero_evals = torch.zeros(
            *data_data_evecs.shape[:-2],
            cols_to_add,
            dtype=data_data_evals.dtype,
            device=data_data_evals.device,
        )
        data_data_evecs = CatLazyTensor(
            data_data_evecs,
            lazify(zero_evecs),
            dim=-1,
            output_device=data_data_evals.device,
        )
        data_data_evals = torch.cat((data_data_evals, zero_evals), dim=-1)

    # construct K_{xt, x}
    test_data_covar = self.covar_module.data_covar_module(X, train_x)
    # construct K_{xt, xt}
    test_test_covar = self.covar_module.data_covar_module(X)

    # now update root so that \tilde{R}\tilde{R}' \approx K_{(x,xt), (x,xt)}
    # cloning preserves the gradient history
    updated_lazy_tensor = data_data_covar.cat_rows(
        cross_mat=test_data_covar.clone(),
        new_mat=test_test_covar,
        method="diagonalization",
    )
    updated_root = updated_lazy_tensor.root_decomposition().root
    # occasionally, there's device errors so enforce this comes out right
    updated_root = updated_root.to(data_data_covar.device)

    # build a root decomposition of the joint train/test covariance matrix
    # construct (\tilde{R} \otimes M)(\tilde{R} \otimes M)' \approx
    # (K_{(x,xt), (x,xt)} \otimes Ktt)
    joint_covar = RootLazyTensor(
        KroneckerProductLazyTensor(updated_root, task_covar_rootlt.root.detach())
    )

    # construct K_{xt, x} \otimes Ktt
    test_obs_kernel = KroneckerProductLazyTensor(test_data_covar, task_covar)

    # collect y - \mu(x) and \mu(X)
    train_diff = self.train_targets - self.mean_module(train_x)
    if detach_test_caches.on():
        train_diff = train_diff.detach()
    test_mean = self.mean_module(X)

    train_noise = self.likelihood._shaped_noise_covar(train_x.shape)
    diagonal_noise = isinstance(train_noise, DiagLazyTensor)
    if detach_test_caches.on():
        train_noise = train_noise.detach()
    test_noise = (
        self.likelihood._shaped_noise_covar(X.shape) if observation_noise else None
    )

    # predictive mean and variance for the mvn
    # first the predictive mean
    pred_mean = (
        test_obs_kernel.matmul(self.predictive_mean_cache).reshape_as(test_mean)
        + test_mean
    )
    # next the predictive variance, assume diagonal noise
    test_var_term = KroneckerProductLazyTensor(test_test_covar, task_covar).diag()

    if diagonal_noise:
        task_evals, task_evecs = self._task_covar_matrix.diagonalization()
        # TODO: make this be the default KPMatmulLT diagonal method in gpytorch
        full_data_inv_evals = (
            KroneckerProductDiagLazyTensor(
                DiagLazyTensor(data_data_evals), DiagLazyTensor(task_evals)
            )
            + train_noise
        ).inverse()

        test_train_hadamard = KroneckerProductLazyTensor(
            test_data_covar.matmul(data_data_evecs).evaluate() ** 2,
            task_covar.matmul(task_evecs).evaluate() ** 2,
        )
        data_var_term = test_train_hadamard.matmul(full_data_inv_evals).sum(dim=-1)
    else:
        # if non-diagonal noise (but still kronecker structured), we have to pull
        # across the noise because the inverse is not closed form
        # should be a kronecker lt, R = \Sigma_X^{-1/2} \kron \Sigma_T^{-1/2}
        # TODO: enforce the diagonalization to return a KPLT for all shapes in
        # gpytorch or dense linear algebra for small shapes
        data_noise, task_noise = train_noise.lazy_tensors
        data_noise_root = data_noise.root_inv_decomposition(
            method="diagonalization"
        )
        task_noise_root = task_noise.root_inv_decomposition(
            method="diagonalization"
        )

        # ultimately we need to compute the diagonal of
        # (K_{x* X} \kron K_T)(K_{XX} \kron K_T + \Sigma_X \kron \Sigma_T)^{-1}
        #                       (K_{x* X} \kron K_T)^T
        # = (K_{x* X} \Sigma_X^{-1/2} Q_R)(\Lambda_R + I)^{-1}
        #                       (K_{x* X} \Sigma_X^{-1/2} Q_R)^T
        # where R = (\Sigma_X^{-1/2T} K_{XX} \Sigma_X^{-1/2} \kron
        #               \Sigma_T^{-1/2T} K_T \Sigma_T^{-1/2})
        # first we construct the components of R's eigen-decomposition
        # TODO: make this be the default KPMatmulLT diagonal method in gpytorch
        whitened_data_covar = (
            data_noise_root.transpose(-1, -2)
            .matmul(data_data_covar)
            .matmul(data_noise_root)
        )
        w_data_evals, w_data_evecs = whitened_data_covar.diagonalization()
        whitened_task_covar = (
            task_noise_root.transpose(-1, -2)
            .matmul(self._task_covar_matrix)
            .matmul(task_noise_root)
        )
        w_task_evals, w_task_evecs = whitened_task_covar.diagonalization()

        # we add one to the eigenvalues as above (not just for stability)
        full_data_inv_evals = (
            KroneckerProductDiagLazyTensor(
                DiagLazyTensor(w_data_evals), DiagLazyTensor(w_task_evals)
            )
            .add_jitter(1.0)
            .inverse()
        )

        test_data_comp = (
            test_data_covar.matmul(data_noise_root).matmul(w_data_evecs).evaluate()
            ** 2
        )
        task_comp = (
            task_covar.matmul(task_noise_root).matmul(w_task_evecs).evaluate() ** 2
        )

        test_train_hadamard = KroneckerProductLazyTensor(test_data_comp, task_comp)
        data_var_term = test_train_hadamard.matmul(full_data_inv_evals).sum(dim=-1)

    pred_variance = test_var_term - data_var_term
    specialized_mvn = MultitaskMultivariateNormal(
        pred_mean, DiagLazyTensor(pred_variance)
    )
    if observation_noise:
        specialized_mvn = self.likelihood(specialized_mvn)

    posterior = MultitaskGPPosterior(
        mvn=specialized_mvn,
        joint_covariance_matrix=joint_covar,
        test_train_covar=test_obs_kernel,
        train_diff=train_diff,
        test_mean=test_mean,
        train_train_covar=self.train_full_covar,
        train_noise=train_noise,
        test_noise=test_noise,
    )
    if hasattr(self, "outcome_transform"):
        posterior = self.outcome_transform.untransform_posterior(posterior)
    return posterior
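# Hedged sketch of the Hadamard trick used for data_var_term above (sizes are
# arbitrary assumptions): for a diagonal D = diag(d), the diagonal of B D B^T
# equals (B * B) @ d, so the predictive variances can be read off column-wise
# without ever forming B D B^T densely.
import torch

B = torch.randn(6, 4)
d = torch.rand(4)
dense_diag = torch.diag(B @ torch.diag(d) @ B.t())
hadamard_diag = (B ** 2) @ d
assert torch.allclose(dense_diag, hadamard_diag, atol=1e-6)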
def posterior(
    self,
    X: Tensor,
    output_indices: Optional[List[int]] = None,
    observation_noise: Union[bool, Tensor] = False,
    **kwargs: Any,
) -> GPyTorchPosterior:
    self.eval()  # make sure we're calling a posterior

    no_pred_variance = skip_posterior_variances._state

    with ExitStack() as es:
        es.enter_context(gpt_posterior_settings())
        es.enter_context(fast_pred_var(True))
        # we need to skip posterior variances here
        es.enter_context(skip_posterior_variances(True))
        mvn = self(X)
        if observation_noise is not False:
            # TODO: implement Kronecker + diagonal solves so that this is possible.
            # if torch.is_tensor(observation_noise):
            #     # TODO: Validate noise shape
            #     # make observation_noise `batch_shape x q x n`
            #     obs_noise = observation_noise.transpose(-1, -2)
            #     mvn = self.likelihood(mvn, X, noise=obs_noise)
            # elif isinstance(self.likelihood, FixedNoiseGaussianLikelihood):
            #     noise = self.likelihood.noise.mean().expand(X.shape[:-1])
            #     mvn = self.likelihood(mvn, X, noise=noise)
            # else:
            mvn = self.likelihood(mvn, X)

        # lazy covariance matrix includes the interpolated version of the full
        # covariance matrix so we can actually grab that instead.
        if X.ndimension() > self.train_inputs[0].ndimension():
            X_batch_shape = X.shape[:-2]
            train_inputs = self.train_inputs[0].reshape(
                *[1] * len(X_batch_shape), *self.train_inputs[0].shape
            )
            train_inputs = train_inputs.repeat(
                *X_batch_shape, *[1] * self.train_inputs[0].ndimension()
            )
        else:
            train_inputs = self.train_inputs[0]
        full_covar = self.covar_modules[0](torch.cat((train_inputs, X), dim=-2))

        if no_pred_variance:
            pred_variance = mvn.variance
        else:
            joint_covar = self._get_joint_covariance([X])
            pred_variance = self.make_posterior_variances(joint_covar)

            full_covar = KroneckerProductLazyTensor(
                full_covar, *joint_covar.lazy_tensors[1:]
            )

        joint_covar_list = [self.covar_modules[0](X, train_inputs)]
        batch_shape = joint_covar_list[0].batch_shape
        for cm, param in zip(self.covar_modules[1:], self.latent_parameters):
            covar = cm(param)
            if covar.batch_shape != batch_shape:
                covar = BatchRepeatLazyTensor(covar, batch_shape)
            joint_covar_list.append(covar)

        test_train_covar = KroneckerProductLazyTensor(*joint_covar_list)

        # mean and variance get reshaped into the target shape
        new_mean = mvn.mean.reshape(*X.shape[:-1], *self.target_shape)
        if not no_pred_variance:
            new_variance = pred_variance.reshape(*X.shape[:-1], *self.target_shape)
            new_variance = DiagLazyTensor(new_variance)
        else:
            new_variance = ZeroLazyTensor(
                *X.shape[:-1], *self.target_shape, self.target_shape[-1]
            )

        mvn = MultivariateNormal(new_mean, new_variance)

        # return a specialized Posterior to allow for sampling
        posterior = HigherOrderGPPosterior(
            mvn=mvn,
            train_targets=self.train_targets.unsqueeze(-1),
            train_train_covar=self.prediction_strategy.lik_train_train_covar,
            test_train_covar=test_train_covar,
            joint_covariance_matrix=full_covar,
            output_shape=Size((*X.shape[:-1], *self.target_shape)),
            num_outputs=self._num_outputs,
        )

        if hasattr(self, "outcome_transform"):
            posterior = self.outcome_transform.untransform_posterior(posterior)
        return posterior
def forward(self, x):
    """Forward propagate the module.

    This method determines how to marginalize out the inducing function
    values. Specifically, forward defines how to transform a variational
    distribution over the inducing point values, q(u), into a variational
    distribution over the function values at specified locations x, q(f|x),
    by integrating p(f|x, u)q(u)du.

    Parameters
    ----------
    x (torch.tensor):
        Locations x at which to evaluate the variational posterior over the
        function values.

    Returns
    -------
    The distribution q(f|x)
    """
    variational_dist = self.variational_distribution.approx_variational_distribution
    inducing_points = self.inducing_points
    inducing_batch_shape = inducing_points.shape[:-2]
    if inducing_batch_shape < x.shape[:-2] or len(inducing_batch_shape) < len(
        x.shape[:-2]
    ):
        batch_shape = _mul_broadcast_shape(inducing_points.shape[:-2], x.shape[:-2])
        inducing_points = inducing_points.expand(
            *batch_shape, *inducing_points.shape[-2:]
        )
        x = x.expand(*batch_shape, *x.shape[-2:])
        variational_dist = variational_dist.expand(batch_shape)

    # If our points equal the inducing points, we're done
    if torch.equal(x, inducing_points):
        return variational_dist

    # Otherwise, we have to marginalize
    else:
        num_induc = inducing_points.size(-2)
        full_inputs = torch.cat([inducing_points, x], dim=-2)
        full_output = self.model.forward(full_inputs)
        full_mean, full_covar = full_output.mean, full_output.lazy_covariance_matrix

        # Mean terms
        test_mean = full_mean[..., num_induc:]
        induc_mean = full_mean[..., :num_induc]
        mean_diff = (variational_dist.mean - induc_mean).unsqueeze(-1)

        # Covariance terms
        induc_induc_covar = full_covar[..., :num_induc, :num_induc].add_jitter()
        induc_data_covar = full_covar[..., :num_induc, num_induc:].evaluate()
        data_data_covar = full_covar[..., num_induc:, num_induc:]

        aux = variational_dist.lazy_covariance_matrix.root_decomposition()
        root_variational_covar = aux.root.evaluate()

        # If we had to expand the inducing points,
        # shrink the inducing mean and induc_induc_covar dimension
        # This makes everything more computationally efficient
        if len(inducing_batch_shape) < len(induc_induc_covar.batch_shape):
            index = tuple(
                0
                for _ in range(
                    len(induc_induc_covar.batch_shape) - len(inducing_batch_shape)
                )
            )
            repeat_size = torch.Size(
                tuple(induc_induc_covar.batch_shape[: len(index)])
                + tuple(1 for _ in induc_induc_covar.batch_shape[len(index):])
            )
            induc_induc_covar = BatchRepeatLazyTensor(
                induc_induc_covar.__getitem__(index), repeat_size
            )

        # If we're less than a certain size, we'll compute the Cholesky
        # decomposition of induc_induc_covar
        cholesky = False
        if settings.fast_computations.log_prob.off() or (
            num_induc <= settings.max_cholesky_size.value()
        ):
            induc_induc_covar = CholLazyTensor(induc_induc_covar.cholesky())
            cholesky = True

        # If we are making predictions and don't need variances, we can do
        # things very quickly.
        if not self.training and settings.skip_posterior_variances.on():
            if not hasattr(self, "_mean_cache"):
                self._mean_cache = induc_induc_covar.inv_matmul(mean_diff).detach()

            predictive_mean = torch.add(
                test_mean,
                induc_data_covar.transpose(-2, -1)
                .matmul(self._mean_cache)
                .squeeze(-1),
            )
            predictive_covar = ZeroLazyTensor(
                test_mean.size(-1), test_mean.size(-1)
            )
            return MultivariateNormal(predictive_mean, predictive_covar)

        # Cache the CG results
        # For now: run variational inference without a preconditioner
        # The preconditioner screws things up for some reason
        with settings.max_preconditioner_size(0):
            # Cache the CG results
            left_tensors = torch.cat([mean_diff, root_variational_covar], -1)
            with torch.no_grad():
                eager_rhs = torch.cat([left_tensors, induc_data_covar], -1)
                solve, probe_vecs, probe_vec_norms, probe_vec_solves, tmats = (
                    CachedCGLazyTensor.precompute_terms(
                        induc_induc_covar,
                        eager_rhs.detach(),
                        logdet_terms=(not cholesky),
                        include_tmats=(
                            not settings.skip_logdet_forward.on() and not cholesky
                        ),
                    )
                )
                eager_rhss = [
                    eager_rhs.detach(),
                    eager_rhs[..., left_tensors.size(-1):].detach(),
                    eager_rhs[..., :left_tensors.size(-1)].detach(),
                ]
                solves = [
                    solve.detach(),
                    solve[..., left_tensors.size(-1):].detach(),
                    solve[..., :left_tensors.size(-1)].detach(),
                ]
                if settings.skip_logdet_forward.on():
                    eager_rhss.append(torch.cat([probe_vecs, left_tensors], -1))
                    solves.append(
                        torch.cat(
                            [probe_vec_solves, solve[..., :left_tensors.size(-1)]],
                            -1,
                        )
                    )

            induc_induc_covar = CachedCGLazyTensor(
                induc_induc_covar,
                eager_rhss=eager_rhss,
                solves=solves,
                probe_vectors=probe_vecs,
                probe_vector_norms=probe_vec_norms,
                probe_vector_solves=probe_vec_solves,
                probe_vector_tmats=tmats,
            )

        if self.training:
            self._memoize_cache["prior_distribution_memo"] = MultivariateNormal(
                induc_mean, induc_induc_covar
            )

        # Compute predictive mean/covariance
        inv_products = induc_induc_covar.inv_matmul(
            induc_data_covar, left_tensors.transpose(-1, -2)
        )
        predictive_mean = torch.add(test_mean, inv_products[..., 0, :])
        predictive_covar = RootLazyTensor(
            inv_products[..., 1:, :].transpose(-1, -2)
        )

        if self.training:
            interp_data_data_var, _ = induc_induc_covar.inv_quad_logdet(
                induc_data_covar, logdet=False, reduce_inv_quad=False
            )
            data_covariance = DiagLazyTensor(
                (data_data_covar.diag() - interp_data_data_var).clamp(0, math.inf)
            )
        else:
            neg_induc_data_data_covar = torch.matmul(
                induc_data_covar.transpose(-1, -2).mul(-1),
                induc_induc_covar.inv_matmul(induc_data_covar),
            )
            data_covariance = data_data_covar + neg_induc_data_data_covar
        predictive_covar = PsdSumLazyTensor(predictive_covar, data_covariance)

        return MultivariateNormal(predictive_mean, predictive_covar)
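# Hedged dense reference for the marginalization the strategy above performs
# lazily (all sizes and matrices here are arbitrary assumptions, not the
# model's): with q(u) = N(m, S) and prior means mu_u, mu_x,
#   q(f|x) = N( mu_x + K_xu K_uu^{-1} (m - mu_u),
#               K_xx - K_xu K_uu^{-1} K_ux + K_xu K_uu^{-1} S K_uu^{-1} K_ux ).
# The code above obtains the same quantities via Cholesky/CG solves on lazy
# tensors instead of explicit inverses.
import torch

n, num_induc = 7, 4
K_uu = torch.eye(num_induc) * 1.5
K_ux = torch.randn(num_induc, n) * 0.1
K_xx = torch.eye(n)
mu_u = torch.zeros(num_induc)    # prior mean at the inducing points
mu_x = torch.zeros(n)            # prior mean at the test points
m = torch.randn(num_induc)       # variational mean over u
S = torch.eye(num_induc) * 0.5   # variational covariance over u

K_uu_inv = torch.linalg.inv(K_uu)
pred_mean = mu_x + K_ux.t() @ K_uu_inv @ (m - mu_u)
pred_covar = (
    K_xx
    - K_ux.t() @ K_uu_inv @ K_ux
    + K_ux.t() @ K_uu_inv @ S @ K_uu_inv @ K_ux
)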
def create_lazy_tensor(self):
    toeplitz_column = torch.tensor(
        [[4, 0, 0, 1], [3, 0, -0.5, -1]], dtype=torch.float, requires_grad=True
    )
    return BatchRepeatLazyTensor(ToeplitzLazyTensor(toeplitz_column), torch.Size((3,)))
def create_lazy_tensor(self):
    toeplitz_column = torch.tensor(
        [4, 0.1, 0.05, 0.01, 0.0], dtype=torch.float, requires_grad=True
    )
    return BatchRepeatLazyTensor(ToeplitzLazyTensor(toeplitz_column), torch.Size((3,)))
def create_lazy_tensor(self):
    toeplitz_column = torch.tensor([4, 0.1, 0.05, 0.01, 0.0], dtype=torch.float)
    toeplitz_column.detach_()
    return BatchRepeatLazyTensor(ToeplitzLazyTensor(toeplitz_column), torch.Size((3,)))
def posterior(
    self,
    X: Tensor,
    output_indices: Optional[List[int]] = None,
    observation_noise: Union[bool, Tensor] = False,
    **kwargs: Any,
) -> GPyTorchPosterior:
    self.eval()  # make sure we're calling a posterior
    # input transforms are applied at `posterior` in `eval` mode, and at
    # `model.forward()` at the training time
    X = self.transform_inputs(X)
    no_pred_variance = skip_posterior_variances._state

    with ExitStack() as es:
        es.enter_context(gpt_posterior_settings())
        es.enter_context(fast_pred_var(True))
        # we need to skip posterior variances here
        es.enter_context(skip_posterior_variances(True))
        mvn = self(X)
        if observation_noise is not False:
            # TODO: ensure that this still works for structured noise solves.
            mvn = self.likelihood(mvn, X)

        # lazy covariance matrix includes the interpolated version of the full
        # covariance matrix so we can actually grab that instead.
        if X.ndimension() > self.train_inputs[0].ndimension():
            X_batch_shape = X.shape[:-2]
            train_inputs = self.train_inputs[0].reshape(
                *[1] * len(X_batch_shape), *self.train_inputs[0].shape
            )
            train_inputs = train_inputs.repeat(
                *X_batch_shape, *[1] * self.train_inputs[0].ndimension()
            )
        else:
            train_inputs = self.train_inputs[0]

        # we now compute the data covariances for the training data, the testing
        # data, the joint covariances, and the test train cross-covariance
        train_train_covar = self.prediction_strategy.lik_train_train_covar.detach()
        base_train_train_covar = train_train_covar.lazy_tensor

        data_train_covar = base_train_train_covar.lazy_tensors[0]
        data_covar = self.covar_modules[0]
        data_train_test_covar = data_covar(X, train_inputs)
        data_test_test_covar = data_covar(X)
        data_joint_covar = data_train_covar.cat_rows(
            cross_mat=data_train_test_covar,
            new_mat=data_test_test_covar,
        )

        # we detach the latents so that they don't cause gradient errors
        # TODO: Can we enable backprop through the latent covariances?
        batch_shape = data_train_test_covar.batch_shape
        latent_covar_list = []
        for latent_covar in base_train_train_covar.lazy_tensors[1:]:
            if latent_covar.batch_shape != batch_shape:
                latent_covar = BatchRepeatLazyTensor(latent_covar, batch_shape)
            latent_covar_list.append(latent_covar.detach())

        joint_covar = KroneckerProductLazyTensor(
            data_joint_covar, *latent_covar_list
        )
        test_train_covar = KroneckerProductLazyTensor(
            data_train_test_covar, *latent_covar_list
        )

        # compute the posterior variance if necessary
        if no_pred_variance:
            pred_variance = mvn.variance
        else:
            pred_variance = self.make_posterior_variances(joint_covar)

        # mean and variance get reshaped into the target shape
        new_mean = mvn.mean.reshape(*X.shape[:-1], *self.target_shape)
        if not no_pred_variance:
            new_variance = pred_variance.reshape(*X.shape[:-1], *self.target_shape)
            new_variance = DiagLazyTensor(new_variance)
        else:
            new_variance = ZeroLazyTensor(
                *X.shape[:-1], *self.target_shape, self.target_shape[-1]
            )

        mvn = MultivariateNormal(new_mean, new_variance)

        # return a specialized Posterior to allow for sampling
        # cloning the full covar allows backpropagation through it
        posterior = HigherOrderGPPosterior(
            mvn=mvn,
            train_targets=self.train_targets.unsqueeze(-1),
            train_train_covar=train_train_covar,
            test_train_covar=test_train_covar,
            joint_covariance_matrix=joint_covar.clone(),
            output_shape=X.shape[:-1] + self.target_shape,
            num_outputs=self._num_outputs,
        )

        if hasattr(self, "outcome_transform"):
            posterior = self.outcome_transform.untransform_posterior(posterior)
        return posterior
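# Hedged dense illustration of the structure data_joint_covar represents above
# (sizes and the stand-in RBF covariance are assumptions): cat_rows augments
# the train-train block with the test rows/columns, i.e. it builds the block
# matrix [[K_train_train, K_train_test], [K_test_train, K_test_test]] while the
# model code keeps everything lazy.
import torch

n_train, n_test = 5, 2
x = torch.randn(n_train + n_test, 1)
dist = torch.cdist(x, x)
K = torch.exp(-0.5 * dist ** 2)  # stand-in RBF covariance over train and test

K_train_train = K[:n_train, :n_train]
K_test_train = K[n_train:, :n_train]
K_test_test = K[n_train:, n_train:]
joint = torch.cat(
    (
        torch.cat((K_train_train, K_test_train.t()), dim=-1),
        torch.cat((K_test_train, K_test_test), dim=-1),
    ),
    dim=-2,
)
assert torch.allclose(joint, K)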
def create_lazy_tensor(self):
    rand_mat = torch.randn(25, 12, dtype=torch.float)
    rand_mat.detach_()
    return BatchRepeatLazyTensor(lazify(rand_mat), torch.Size((10,)))