def _get_covariance(self, x1, x2):
    k_ux1 = delazify(self.base_kernel(x1, self.inducing_points))
    if torch.equal(x1, x2):
        covar = RootLazyTensor(k_ux1.matmul(self._inducing_inv_root))

        # Diagonal correction for predictive posterior
        correction = (self.base_kernel(x1, x2, diag=True) - covar.diag()).clamp(0, math.inf)
        covar = PsdSumLazyTensor(covar, DiagLazyTensor(correction))
    else:
        k_ux2 = delazify(self.base_kernel(x2, self.inducing_points))
        covar = MatmulLazyTensor(
            k_ux1.matmul(self._inducing_inv_root),
            k_ux2.matmul(self._inducing_inv_root).transpose(-1, -2),
        )

    return covar
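# Illustrative sketch (hypothetical helper, separate from this class): the symmetric branch above
# builds the Nystrom approximation K_xx ~= (K_xu K_uu^{-1/2})(K_xu K_uu^{-1/2})' as a RootLazyTensor,
# plus a clamped diagonal correction so the approximate prior variances match the exact ones.
# A dense version with plain tensors might look like this:
import math
import torch

def nystrom_with_diag_correction(k_xx_diag, k_xu, k_uu, jitter=1e-6):
    # L L' = K_uu (jitter added for numerical stability)
    L = torch.cholesky(k_uu + jitter * torch.eye(k_uu.size(-1), dtype=k_uu.dtype))
    # root = K_xu L^{-T}, so root @ root' = K_xu K_uu^{-1} K_ux
    root = torch.triangular_solve(k_xu.transpose(-1, -2), L, upper=False).solution.transpose(-1, -2)
    q_xx = root @ root.transpose(-1, -2)
    # clamp the diagonal correction at zero, mirroring the .clamp(0, math.inf) above
    correction = (k_xx_diag - q_xx.diagonal(dim1=-2, dim2=-1)).clamp(0, math.inf)
    return q_xx + torch.diag_embed(correction)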
def _covar_diag(self, inputs):
    if inputs.ndimension() == 1:
        inputs = inputs.unsqueeze(1)

    # Get diagonal of covar
    covar_diag = delazify(self.base_kernel(inputs, diag=True))
    return DiagLazyTensor(covar_diag)
def _inducing_mat(self):
    if not self.training and hasattr(self, "_cached_kernel_mat"):
        return self._cached_kernel_mat
    else:
        res = delazify(self.base_kernel(self.inducing_points, self.inducing_points))
        if not self.training:
            self._cached_kernel_mat = res
        return res
def forward(self, x1, x2, last_dim_is_batch=False, diag=False, **params):
    orig_output = self.base_kernel.forward(x1, x2, diag=diag, last_dim_is_batch=last_dim_is_batch, **params)
    outputscales = self.outputscale
    if last_dim_is_batch:
        outputscales = outputscales.unsqueeze(-1)
    if diag:
        outputscales = outputscales.unsqueeze(-1)
        return delazify(orig_output) * outputscales
    else:
        outputscales = outputscales.view(*outputscales.shape, 1, 1)
        return orig_output.mul(outputscales)
def forward(self, x1, x2, diag=False, **params):
    x1_eq_x2 = torch.equal(x1, x2)

    if not x1_eq_x2:
        # If x1 != x2, then we can't make a MulLazyTensor because the kernel won't necessarily be square/symmetric
        res = delazify(self.kernels[0](x1, x2, diag=diag, **params))
    else:
        res = self.kernels[0](x1, x2, diag=diag, **params)
        if not diag:
            res = lazify(res)

    for kern in self.kernels[1:]:
        next_term = kern(x1, x2, diag=diag, **params)
        if not x1_eq_x2:
            # Again delazify if x1 != x2
            res = res * delazify(next_term)
        else:
            if not diag:
                res = res * lazify(next_term)
            else:
                res = res * next_term
    return res
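# Tiny illustration (hypothetical values, outside this class): a product of kernels evaluates to
# the elementwise (Hadamard) product of the individual covariance matrices, which is what the
# chain of `res * next_term` above assembles lazily.
import torch

k1 = torch.tensor([[1.0, 0.5], [0.5, 1.0]])
k2 = torch.tensor([[1.0, 0.2], [0.2, 1.0]])
product_covar = k1 * k2  # tensor([[1.0, 0.1], [0.1, 1.0]])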
def forward(self, x1, x2, diag=False, last_dim_is_batch=False, **params):
    """Forward proceeds by Newton-Girard formulae"""
    if last_dim_is_batch:
        raise RuntimeError("NewtonGirardAdditiveKernel does not accept the last_dim_is_batch argument.")

    # NOTE: comments about shape are only correct for the single-batch cases.
    # kern_values is just the order-1 terms
    # kern_values = D x n x n unless diag=True
    kern_values = delazify(self.base_kernel(x1, x2, diag=diag, last_dim_is_batch=True, **params))
    # last dim is batch, which gets moved up to pos. 1

    kernel_dim = -3 if not diag else -2

    shape = [1 for _ in range(len(kern_values.shape) + 1)]
    shape[kernel_dim - 1] = -1
    kvals = torch.arange(1, self.max_degree + 1, device=kern_values.device).reshape(*shape)
    # kvals = R x 1 x 1 x 1 (these are indexes only)

    # e_n = torch.ones(self.max_degree+1, *kern_values.shape[1:], device=kern_values.device)  # includes 0
    # e_n: elementary symmetric polynomial of degree n (e.g. z1 z2 + z1 z3 + z2 z3)
    # e_n is R x n x n, and the array is properly 0-indexed.
    shape = [d_ for d_ in kern_values.shape]
    shape[kernel_dim] = self.max_degree + 1
    e_n = torch.empty(*shape, device=kern_values.device)
    if kernel_dim == -3:
        e_n[..., 0, :, :] = 1.0
    else:
        e_n[..., 0, :] = 1.0

    # power sums s_k (e.g. sum_i^num_dims z_i^k)
    # s_k is R x n x n
    s_k = kern_values.unsqueeze(kernel_dim - 1).pow(kvals).sum(dim=kernel_dim)

    # just the constant -1
    m1 = torch.tensor([-1], dtype=torch.float, device=kern_values.device)

    shape = [1 for _ in range(len(kern_values.shape))]
    shape[kernel_dim] = -1
    for deg in range(1, self.max_degree + 1):  # deg goes from 1 to R (it's 1-indexed!)
        # we average over k in [1, ..., deg]: (-1)^(k-1) e_{deg-k} s_k
        ks = torch.arange(1, deg + 1, device=kern_values.device, dtype=torch.float).reshape(*shape)  # use for pow
        kslong = torch.arange(1, deg + 1, device=kern_values.device, dtype=torch.long)  # use for indexing

        # note that s_k is 0-indexed, so we must subtract 1 from kslong
        sum_ = (
            m1.pow(ks - 1) * e_n.index_select(kernel_dim, deg - kslong) * s_k.index_select(kernel_dim, kslong - 1)
        ).sum(dim=kernel_dim) / deg
        if kernel_dim == -3:
            e_n[..., deg, :, :] = sum_
        else:
            e_n[..., deg, :] = sum_

    if kernel_dim == -3:
        return (self.outputscale.unsqueeze(-1).unsqueeze(-1) * e_n.narrow(kernel_dim, 1, self.max_degree)).sum(
            dim=kernel_dim
        )
    else:
        return (self.outputscale.unsqueeze(-1) * e_n.narrow(kernel_dim, 1, self.max_degree)).sum(dim=kernel_dim)
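# Sanity-check sketch (hypothetical helper, not part of the kernel): the loop above implements the
# Newton-Girard recurrence e_k = (1/k) * sum_{i=1}^{k} (-1)^(i-1) e_{k-i} s_i, which recovers the
# elementary symmetric polynomials e_k of the per-dimension kernel values from their power sums s_i.
# The brute-force comparison below uses itertools.combinations on a few scalars.
import itertools
import torch

def newton_girard_check(z, max_degree):
    s = [None] + [(z ** k).sum().item() for k in range(1, max_degree + 1)]  # power sums s_1..s_R
    e = [1.0] + [0.0] * max_degree  # e_0 = 1
    for deg in range(1, max_degree + 1):
        e[deg] = sum((-1) ** (k - 1) * e[deg - k] * s[k] for k in range(1, deg + 1)) / deg
    brute = [
        sum(torch.prod(torch.tensor(c)).item() for c in itertools.combinations(z.tolist(), deg))
        for deg in range(1, max_degree + 1)
    ]
    return e[1:], brute

# e.g. newton_girard_check(torch.tensor([0.5, 2.0, 3.0]), 3) -> both give [5.5, 8.5, 3.0]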
def forward(self, x1, x2, diag=False, last_dim_is_batch=False, **params):
    if last_dim_is_batch and not self.interpolation_mode:
        raise ValueError("last_dim_is_batch is only valid with interpolation model")

    grid = self.grid
    if self.is_ragged:
        # Pad the grid - so that grid is the same size for each dimension
        max_grid_size = max(proj.size(-1) for proj in grid)
        padded_grid = []
        for proj in grid:
            padding_size = max_grid_size - proj.size(-1)
            if padding_size > 0:
                dtype = proj.dtype
                device = proj.device
                padded_grid.append(
                    torch.cat([proj, torch.zeros(*proj.shape[:-1], padding_size, dtype=dtype, device=device)])
                )
            else:
                padded_grid.append(proj)
    else:
        padded_grid = grid

    if not self.interpolation_mode:
        if len(x1.shape[:-2]):
            full_grid = self.full_grid.expand(*x1.shape[:-2], *self.full_grid.shape[-2:])
        else:
            full_grid = self.full_grid

    if self.interpolation_mode or (torch.equal(x1, full_grid) and torch.equal(x2, full_grid)):
        if not self.training and hasattr(self, "_cached_kernel_mat"):
            return self._cached_kernel_mat
        # Can exploit Toeplitz structure if grid points in each dimension are equally
        # spaced and using a translation-invariant kernel
        if settings.use_toeplitz.on():
            # Use padded grid for batch mode
            first_grid_point = torch.stack([proj[0].unsqueeze(0) for proj in grid], dim=-1)
            full_grid = torch.stack(padded_grid, dim=-1)
            covars = delazify(self.base_kernel(first_grid_point, full_grid, last_dim_is_batch=True, **params))

            if last_dim_is_batch:
                # Toeplitz expects batches of columns so we concatenate the
                # 1 x grid_size[i] tensors together
                # Note that this requires all the dimensions to have the same number of grid points
                covar = ToeplitzLazyTensor(covars.squeeze(-2))
            else:
                # Non-batched ToeplitzLazyTensor expects a 1D tensor, so we squeeze out the row dimension
                covars = covars.squeeze(-2)  # Get rid of the dimension corresponding to the first point
                # Un-pad the grid
                covars = [ToeplitzLazyTensor(covars[..., i, : proj.size(-1)]) for i, proj in enumerate(grid)]
                # Due to legacy reasons, KroneckerProductLazyTensor(A, B, C) is actually (C Kron B Kron A)
                covar = KroneckerProductLazyTensor(*covars[::-1])
        else:
            full_grid = torch.stack(padded_grid, dim=-1)
            covars = delazify(self.base_kernel(full_grid, full_grid, last_dim_is_batch=True, **params))
            if last_dim_is_batch:
                # Note that this requires all the dimensions to have the same number of grid points
                covar = covars
            else:
                covars = [covars[..., i, : proj.size(-1), : proj.size(-1)] for i, proj in enumerate(self.grid)]
                covar = KroneckerProductLazyTensor(*covars[::-1])

        if not self.training:
            self._cached_kernel_mat = covar

        return covar
    else:
        return self.base_kernel.forward(x1, x2, diag=diag, last_dim_is_batch=last_dim_is_batch, **params)
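# Illustrative sketch (hypothetical helper, separate from this class): with equally spaced grid
# points and a stationary kernel, K[i, j] depends only on |i - j|, so each per-dimension covariance
# matrix is Toeplitz and fully determined by its first column - which is exactly the structure the
# ToeplitzLazyTensor construction above exploits.
import torch

def toeplitz_from_first_column(first_col):
    n = first_col.size(-1)
    idx = torch.arange(n)
    # indexing by |i - j| reproduces the full symmetric Toeplitz matrix
    return first_col[(idx.unsqueeze(-1) - idx.unsqueeze(-2)).abs()]

grid_1d = torch.linspace(0.0, 1.0, 5)
first_col = torch.exp(-0.5 * (grid_1d - grid_1d[0]).pow(2))  # RBF-like covariances against the first point
full = torch.exp(-0.5 * (grid_1d.unsqueeze(-1) - grid_1d.unsqueeze(-2)).pow(2))
assert torch.allclose(toeplitz_from_first_column(first_col), full)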
def _cholesky_factor(self, induc_induc_covar):
    L = psd_safe_cholesky(delazify(induc_induc_covar).double())
    return L
def exact_predictive_covar(self, test_test_covar, test_train_covar):
    """
    Computes the posterior predictive covariance of a GP

    Args:
        test_train_covar (:obj:`gpytorch.lazy.LazyTensor`): Covariance matrix between test and train inputs
        test_test_covar (:obj:`gpytorch.lazy.LazyTensor`): Covariance matrix between test inputs

    Returns:
        :obj:`gpytorch.lazy.LazyTensor`: A LazyTensor representing the predictive posterior covariance of the
        test points
    """
    if settings.fast_pred_var.on():
        self._last_test_train_covar = test_train_covar

    if settings.skip_posterior_variances.on():
        return ZeroLazyTensor(*test_test_covar.size())

    if settings.fast_pred_var.off():
        dist = self.train_prior_dist.__class__(
            torch.zeros_like(self.train_prior_dist.mean), self.train_prior_dist.lazy_covariance_matrix
        )
        if settings.detach_test_caches.on():
            train_train_covar = self.likelihood(dist, self.train_inputs).lazy_covariance_matrix.detach()
        else:
            train_train_covar = self.likelihood(dist, self.train_inputs).lazy_covariance_matrix

        test_train_covar = delazify(test_train_covar)
        train_test_covar = test_train_covar.transpose(-1, -2)

        covar_correction_rhs = train_train_covar.inv_matmul(train_test_covar)
        # For efficiency
        if torch.is_tensor(test_test_covar):
            # We can use addmm in the 2d case
            if test_test_covar.dim() == 2:
                return lazify(
                    torch.addmm(test_test_covar, test_train_covar, covar_correction_rhs, beta=1, alpha=-1)
                )
            else:
                return lazify(test_test_covar + test_train_covar @ covar_correction_rhs.mul(-1))
        # In other cases - we'll use the standard infrastructure
        else:
            return test_test_covar + MatmulLazyTensor(test_train_covar, covar_correction_rhs.mul(-1))

    precomputed_cache = self.covar_cache
    covar_inv_quad_form_root = self._exact_predictive_covar_inv_quad_form_root(precomputed_cache, test_train_covar)
    if torch.is_tensor(test_test_covar):
        return lazify(
            torch.add(
                test_test_covar, covar_inv_quad_form_root @ covar_inv_quad_form_root.transpose(-1, -2), alpha=-1
            )
        )
    else:
        return test_test_covar + MatmulLazyTensor(
            covar_inv_quad_form_root, covar_inv_quad_form_root.transpose(-1, -2).mul(-1)
        )
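# Dense reference sketch (hypothetical helper with plain tensors, no LazyTensors): the slow branch
# above computes the textbook GP predictive covariance
#     Sigma_* = K_** - K_*X (K_XX + sigma^2 I)^{-1} K_X*
# which is what the addmm / MatmulLazyTensor expressions encode.
import torch

def exact_predictive_covar_dense(test_test_covar, test_train_covar, train_train_covar, noise_var):
    n = train_train_covar.size(-1)
    lik_train_train = train_train_covar + noise_var * torch.eye(n, dtype=train_train_covar.dtype)
    chol = torch.cholesky(lik_train_train)
    # (K_XX + sigma^2 I)^{-1} K_X*
    covar_correction_rhs = torch.cholesky_solve(test_train_covar.transpose(-1, -2), chol)
    return test_test_covar - test_train_covar @ covar_correction_rhs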
def covar_cache(self):
    train_train_covar = self.lik_train_train_covar
    train_train_covar_inv_root = delazify(train_train_covar.root_inv_decomposition().root)
    return self._exact_predictive_covar_inv_quad_form_cache(train_train_covar_inv_root, self._last_test_train_covar)
def get_fantasy_strategy(self, inputs, targets, full_inputs, full_targets, full_output, **kwargs):
    """
    Returns a new PredictionStrategy that incorporates the specified inputs and targets as new training data.

    This method is primarily responsible for updating the mean and covariance caches. To add fantasy data to a
    GP model, use the :meth:`~gpytorch.models.ExactGP.get_fantasy_model` method.

    Args:
        - :attr:`inputs` (Tensor `b1 x ... x bk x m x d` or `f x b1 x ... x bk x m x d`): Locations of fantasy
          observations.
        - :attr:`targets` (Tensor `b1 x ... x bk x m` or `f x b1 x ... x bk x m`): Labels of fantasy observations.
        - :attr:`full_inputs` (Tensor `b1 x ... x bk x n+m x d` or `f x b1 x ... x bk x n+m x d`): Training data
          concatenated with fantasy inputs
        - :attr:`full_targets` (Tensor `b1 x ... x bk x n+m` or `f x b1 x ... x bk x n+m`): Training labels
          concatenated with fantasy labels.
        - :attr:`full_output` (:class:`gpytorch.distributions.MultivariateNormal`): Prior called on full_inputs

    Returns:
        - :class:`DefaultPredictionStrategy`
            A `DefaultPredictionStrategy` model with `n + m` training examples, where the `m` fantasy examples
            have been added and all test-time caches have been updated.
    """
    full_mean, full_covar = full_output.mean, full_output.lazy_covariance_matrix

    batch_shape = full_inputs[0].shape[:-2]

    full_mean = full_mean.view(*batch_shape, -1)
    num_train = self.num_train

    # Evaluate fant x train and fant x fant covariance matrices, leave train x train unevaluated.
    fant_fant_covar = full_covar[..., num_train:, num_train:]
    fant_mean = full_mean[..., num_train:]
    mvn = self.train_prior_dist.__class__(fant_mean, fant_fant_covar)
    fant_likelihood = self.likelihood.get_fantasy_likelihood(**kwargs)
    mvn_obs = fant_likelihood(mvn, inputs, **kwargs)

    fant_fant_covar = mvn_obs.covariance_matrix
    fant_train_covar = delazify(full_covar[..., num_train:, :num_train])

    self.fantasy_inputs = inputs
    self.fantasy_targets = targets

    r"""
    Compute a new mean cache given the old mean cache.

    We have \alpha = K^{-1}y, and we want to solve [K U; U' S][a; b] = [y; y_f], where U' is fant_train_covar,
    S is fant_fant_covar, and y_f is (targets - fant_mean)

    To do this, we solve the bordered linear system of equations for [a; b]:
        AQ = U  # Q = fant_solve
        [S - U'Q]b = y_f - U'\alpha ==> b = [S - U'Q]^{-1}(y_f - U'\alpha)
        a = \alpha - Qb
    """
    # Get cached K inverse decomp. (or compute if we somehow don't already have the covariance cache)
    K_inverse = self.lik_train_train_covar.root_inv_decomposition()
    fant_solve = K_inverse.matmul(fant_train_covar.transpose(-2, -1))

    # Solve for "b", the lower portion of the *new* \alpha corresponding to the fantasy points.
    schur_complement = fant_fant_covar - fant_train_covar.matmul(fant_solve)

    # we'd like to use a less hacky approach for the following, but einsum can be much faster
    # than unsqueezing/squeezing here (esp. in backward passes); unfortunately it currently has some
    # issues with broadcasting: https://github.com/pytorch/pytorch/issues/15671
    prefix = string.ascii_lowercase[: max(fant_train_covar.dim() - self.mean_cache.dim() - 1, 0)]
    ftcm = torch.einsum(prefix + "...yz,...z->" + prefix + "...y", [fant_train_covar, self.mean_cache])

    small_system_rhs = targets - fant_mean - ftcm
    small_system_rhs = small_system_rhs.unsqueeze(-1)
    # Schur complement of a spd matrix is guaranteed to be positive definite
    schur_cholesky = psd_safe_cholesky(schur_complement, jitter=settings.cholesky_jitter.value())
    fant_cache_lower = torch.cholesky_solve(small_system_rhs, schur_cholesky)

    # Get "a", the new upper portion of the cache corresponding to the old training points.
    fant_cache_upper = self.mean_cache.unsqueeze(-1) - fant_solve.matmul(fant_cache_lower)

    fant_cache_upper = fant_cache_upper.squeeze(-1)
    fant_cache_lower = fant_cache_lower.squeeze(-1)

    # New mean cache.
    fant_mean_cache = torch.cat((fant_cache_upper, fant_cache_lower), dim=-1)

    r"""
    Compute a new covariance cache given the old covariance cache.

    We have access to K \approx LL' and K^{-1} \approx R^{-1}R^{-T}, where L and R are low rank matrices
    resulting from Lanczos (see the LOVE paper).

    To update R^{-1}, we first update L:
        [K U; U' S] = [L 0; A B][L' A'; 0 B']
    Solving this matrix equation, we get:
        K = LL' ==>       L = L
        U = LA' ==>       A = UR^{-1}
        S = AA' + BB' ==> B = cholesky(S - AA')

    Once we've computed Z = [L 0; A B], we have that the new kernel matrix [K U; U' S] \approx ZZ'. Therefore,
    we can form a pseudo-inverse of Z directly to approximate [K U; U' S]^{-1/2}.
    """
    # [K U; U' S] = [L 0; lower_left schur_root]
    batch_shape = fant_train_covar.shape[:-2]

    L_inverse = self.covar_cache
    L = self.lik_train_train_covar.root_decomposition().root
    m, n = L.shape[-2:]

    lower_left = fant_train_covar.matmul(L_inverse)
    schur = fant_fant_covar - lower_left.matmul(lower_left.transpose(-2, -1))
    schur_root = psd_safe_cholesky(schur, jitter=settings.cholesky_jitter.value())

    # Form new root Z = [L 0; lower_left schur_root]
    # TODO: Special case triangular case once #1102 goes in
    # if isinstance(L, TriangularLazyTensor):
    #     # The whole thing is triangular, we can just do two triangular solves
    #     ...
    # else:
    L = delazify(L)
    num_fant = schur_root.size(-2)
    new_root = torch.zeros(*batch_shape, m + num_fant, n + num_fant, device=L.device, dtype=L.dtype)
    new_root[..., :m, :n] = L
    new_root[..., m:, :n] = lower_left
    new_root[..., m:, n:] = schur_root

    # Use pseudo-inverse of Z as new inv root
    if new_root.shape[-1] <= 2048:
        # Dispatch to CPU so long as pytorch/pytorch#22573 is not fixed
        device = new_root.device
        Q, R = torch.qr(new_root.cpu())
        Q = Q.to(device)
        R = R.to(device)
    else:
        Q, R = torch.qr(new_root)

    Rdiag = torch.diagonal(R, dim1=-2, dim2=-1)
    # if R is almost singular, add jitter
    zeroish = Rdiag.abs() < 1e-6
    if torch.any(zeroish):
        # can't use in-place operation here b/c it would mess up backward pass
        # haven't found a more elegant way to add a jitter diagonal yet...
        jitter_diag = 1e-6 * torch.sign(Rdiag) * zeroish.to(Rdiag)
        R = R + torch.diag_embed(jitter_diag)
    new_covar_cache = torch.triangular_solve(Q.transpose(-2, -1), R)[0].transpose(-2, -1)

    # Expand inputs accordingly if necessary (for fantasies at the same points)
    if full_inputs[0].dim() <= full_targets.dim():
        fant_batch_shape = full_targets.shape[:1]
        n_batch = len(full_mean.shape[:-1])
        repeat_shape = fant_batch_shape + torch.Size([1] * n_batch)
        full_inputs = [fi.expand(fant_batch_shape + fi.shape) for fi in full_inputs]
        full_mean = full_mean.expand(fant_batch_shape + full_mean.shape)
        full_covar = BatchRepeatLazyTensor(full_covar, repeat_shape)
        new_root = BatchRepeatLazyTensor(NonLazyTensor(new_root), repeat_shape)
        # no need to repeat the covar cache, broadcasting will do the right thing

    # Create new DefaultPredictionStrategy object
    fant_strat = self.__class__(
        train_inputs=full_inputs,
        train_prior_dist=self.train_prior_dist.__class__(full_mean, full_covar),
        train_labels=full_targets,
        likelihood=fant_likelihood,
        root=new_root,
        inv_root=new_covar_cache,
    )
    add_to_cache(fant_strat, "mean_cache", fant_mean_cache)
    add_to_cache(fant_strat, "covar_cache", new_covar_cache)

    return fant_strat
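# Numerical sanity sketch (hypothetical small matrices, outside this class): the block root
# Z = [L 0; A B] built above satisfies Z Z' = [K U; U' S] when A is the train/fantasy cross-covariance
# whitened by L and B = cholesky(S - A A'), which is why a pseudo-inverse of Z can serve as the
# updated inverse-root cache.
import torch

torch.manual_seed(0)
n, m = 4, 2
X = torch.randn(n + m, 3)
full = X @ X.t() + 1e-2 * torch.eye(n + m)  # joint covariance [K U; U' S]
K, U, S = full[:n, :n], full[:n, n:], full[n:, n:]

L = torch.cholesky(K)
A = torch.triangular_solve(U, L, upper=False).solution.t()  # A = U' L^{-T}
B = torch.cholesky(S - A @ A.t())                           # Cholesky of the Schur complement
Z = torch.zeros(n + m, n + m)
Z[:n, :n], Z[n:, :n], Z[n:, n:] = L, A, B
assert torch.allclose(Z @ Z.t(), full, atol=1e-5)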
def _cholesky_factor(self, induc_induc_covar):
    # Maybe used - if we're not using CG
    L = psd_safe_cholesky(delazify(induc_induc_covar))
    return L