    def _get_covariance(self, x1, x2):
        k_ux1 = delazify(self.base_kernel(x1, self.inducing_points))
        if torch.equal(x1, x2):
            covar = RootLazyTensor(k_ux1.matmul(self._inducing_inv_root))

            # Diagonal correction for predictive posterior
            correction = (self.base_kernel(x1, x2, diag=True) -
                          covar.diag()).clamp(0, math.inf)
            covar = PsdSumLazyTensor(covar, DiagLazyTensor(correction))
        else:
            k_ux2 = delazify(self.base_kernel(x2, self.inducing_points))
            covar = MatmulLazyTensor(
                k_ux1.matmul(self._inducing_inv_root),
                k_ux2.matmul(self._inducing_inv_root).transpose(-1, -2))

        return covar
    def _covar_diag(self, inputs):
        if inputs.ndimension() == 1:
            inputs = inputs.unsqueeze(1)

        # Get diagonal of covar
        covar_diag = delazify(self.base_kernel(inputs, diag=True))
        return DiagLazyTensor(covar_diag)
    def _inducing_mat(self):
        if not self.training and hasattr(self, "_cached_kernel_mat"):
            return self._cached_kernel_mat
        else:
            res = delazify(
                self.base_kernel(self.inducing_points, self.inducing_points))
            if not self.training:
                self._cached_kernel_mat = res
            return res
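
For orientation, the covariance that _get_covariance builds is the low-rank Nystrom/SGPR approximation K_xu K_uu^{-1} K_ux with a clamped diagonal correction. Below is a minimal plain-torch sketch of that computation; the rbf helper and all shapes are illustrative assumptions, not GPyTorch code.

import torch

def rbf(a, b, lengthscale=1.0):
    # Simple RBF kernel, for illustration only
    return torch.exp(-0.5 * torch.cdist(a, b).pow(2) / lengthscale ** 2)

x = torch.randn(50, 2)
inducing = torch.randn(10, 2)

k_uu = rbf(inducing, inducing) + 1e-6 * torch.eye(10)    # jitter for stability
k_xu = rbf(x, inducing)

# inv_root plays the role of self._inducing_inv_root: an inverse root of K_uu
inv_root = torch.linalg.inv(torch.linalg.cholesky(k_uu)).transpose(-1, -2)
root = k_xu @ inv_root                                    # the "RootLazyTensor" factor
covar = root @ root.transpose(-1, -2)                     # K_xu K_uu^{-1} K_ux

# Diagonal correction, clamped at zero as in the snippet above
correction = (rbf(x, x).diagonal() - covar.diagonal()).clamp(min=0.0)
covar = covar + torch.diag_embed(correction)
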
Example #4
    def forward(self, x1, x2, last_dim_is_batch=False, diag=False, **params):
        orig_output = self.base_kernel.forward(
            x1, x2, diag=diag, last_dim_is_batch=last_dim_is_batch, **params)
        outputscales = self.outputscale
        if last_dim_is_batch:
            outputscales = outputscales.unsqueeze(-1)
        if diag:
            outputscales = outputscales.unsqueeze(-1)
            return delazify(orig_output) * outputscales
        else:
            outputscales = outputscales.view(*outputscales.shape, 1, 1)
            return orig_output.mul(outputscales)
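
For context, the scaling above is just an elementwise multiply by an outputscale that is broadcast over the covariance dimensions. A minimal plain-torch sketch of the two branches; all names and shapes here are illustrative, not the library's API.

import torch

covar = torch.randn(3, 5, 5)                 # batch of base-kernel matrices
covar = covar @ covar.transpose(-1, -2)      # make them PSD for the example
outputscale = torch.tensor([0.5, 1.0, 2.0])  # one scale per batch element

# Mirror of the non-diag branch: append two singleton dims so the scale
# broadcasts over the n x n covariance block of each batch element
scaled = covar * outputscale.view(*outputscale.shape, 1, 1)

# Mirror of the diag branch: only one singleton dim is needed for an n-vector
diag = covar.diagonal(dim1=-2, dim2=-1) * outputscale.unsqueeze(-1)
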
    def forward(self, x1, x2, diag=False, **params):
        x1_eq_x2 = torch.equal(x1, x2)

        if not x1_eq_x2:
            # If x1 != x2, then we can't make a MulLazyTensor because the kernel won't necessarily be square/symmetric
            res = delazify(self.kernels[0](x1, x2, diag=diag, **params))
        else:
            res = self.kernels[0](x1, x2, diag=diag, **params)

            if not diag:
                res = lazify(res)

        for kern in self.kernels[1:]:
            next_term = kern(x1, x2, diag=diag, **params)
            if not x1_eq_x2:
                # Again delazify if x1 != x2
                res = res * delazify(next_term)
            else:
                if not diag:
                    res = res * lazify(next_term)
                else:
                    res = res * next_term

        return res
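
Once every factor is evaluated, the loop above reduces to an elementwise (Hadamard) product of the individual kernel matrices. A small self-contained sketch, with stand-in kernel functions rather than GPyTorch kernels:

import torch

def rbf(a, b):
    return torch.exp(-0.5 * torch.cdist(a, b).pow(2))

def linear(a, b):
    return a @ b.transpose(-1, -2)

x1 = torch.randn(6, 2)
x2 = torch.randn(4, 2)

# Dense equivalent of a product kernel over [rbf, linear] when x1 != x2:
res = rbf(x1, x2) * linear(x1, x2)   # elementwise product, shape 6 x 4
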
    def forward(self, x1, x2, diag=False, last_dim_is_batch=False, **params):
        """Forward proceeds by Newton-Girard formulae"""
        if last_dim_is_batch:
            raise RuntimeError(
                "NewtonGirardAdditiveKernel does not accept the last_dim_is_batch argument."
            )

        # NOTE: comments about shape are only correct for the single-batch cases.
        # kern_values is just the order-1 terms
        # kern_values = D x n x n unless diag=True
        kern_values = delazify(
            self.base_kernel(x1,
                             x2,
                             diag=diag,
                             last_dim_is_batch=True,
                             **params))
        # last dim is batch, which gets moved up to pos. 1

        kernel_dim = -3 if not diag else -2

        shape = [1 for _ in range(len(kern_values.shape) + 1)]
        shape[kernel_dim - 1] = -1
        kvals = torch.arange(1, self.max_degree + 1,
                             device=kern_values.device).reshape(*shape)
        # kvals = R x 1 x 1 x 1 (these are indexes only)

        # e_n = torch.ones(self.max_degree+1, *kern_values.shape[1:], device=kern_values.device)  # includes 0
        # e_n: elementary symmetric polynomial of degree n (e.g. z1 z2 + z1 z3 + z2 z3)
        # e_n is R x n x n, and the array is properly 0 indexed.
        shape = [d_ for d_ in kern_values.shape]
        shape[kernel_dim] = self.max_degree + 1
        e_n = torch.empty(*shape, device=kern_values.device)
        if kernel_dim == -3:
            e_n[..., 0, :, :] = 1.0
        else:
            e_n[..., 0, :] = 1.0

        # power sums s_k (e.g. sum_i^num_dims z_i^k)
        # s_k is R x n x n
        s_k = kern_values.unsqueeze(kernel_dim - 1).pow(kvals).sum(dim=kernel_dim)

        # just the constant -1
        m1 = torch.tensor([-1], dtype=torch.float, device=kern_values.device)

        shape = [1 for _ in range(len(kern_values.shape))]
        shape[kernel_dim] = -1
        for deg in range(1, self.max_degree + 1):  # deg goes from 1 to R (it's 1-indexed!)
            # e_deg = (1/deg) * sum_{k=1}^{deg} (-1)^(k-1) e_{deg-k} s_k

            ks = torch.arange(1,
                              deg + 1,
                              device=kern_values.device,
                              dtype=torch.float).reshape(*shape)  # use for pow
            kslong = torch.arange(1,
                                  deg + 1,
                                  device=kern_values.device,
                                  dtype=torch.long)  # use for indexing

            # note that s_k is 0-indexed, so we must subtract 1 from kslong
            sum_ = (m1.pow(ks - 1) * e_n.index_select(kernel_dim, deg - kslong)
                    * s_k.index_select(kernel_dim, kslong - 1)).sum(
                        dim=kernel_dim) / deg
            if kernel_dim == -3:
                e_n[..., deg, :, :] = sum_
            else:
                e_n[..., deg, :] = sum_

        if kernel_dim == -3:
            return (self.outputscale.unsqueeze(-1).unsqueeze(-1) *
                    e_n.narrow(kernel_dim, 1, self.max_degree)).sum(
                        dim=kernel_dim)
        else:
            return (self.outputscale.unsqueeze(-1) *
                    e_n.narrow(kernel_dim, 1, self.max_degree)).sum(
                        dim=kernel_dim)
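
To make the Newton-Girard recursion above concrete: each elementary symmetric polynomial e_deg is rebuilt from the power sums s_k via e_deg = (1/deg) * sum_{k=1}^{deg} (-1)^(k-1) e_{deg-k} s_k. The following self-contained check runs that recursion on plain scalars and compares it with a brute-force enumeration (illustrative code, not part of the kernel):

import itertools
import torch

z = torch.tensor([0.7, 1.3, 2.0, 0.4])   # stand-ins for the per-dimension kernel values
max_degree = 3

# Power sums s_k = sum_i z_i^k (index 0 unused so that s[k] means s_k)
s = [None] + [z.pow(k).sum() for k in range(1, max_degree + 1)]

# Newton-Girard recursion for the elementary symmetric polynomials e_k
e = [torch.tensor(1.0)]                   # e_0 = 1
for deg in range(1, max_degree + 1):
    total = sum((-1) ** (k - 1) * e[deg - k] * s[k] for k in range(1, deg + 1))
    e.append(total / deg)

# Brute-force check: e_k is the sum over all k-subsets of the product of their entries
for k in range(1, max_degree + 1):
    brute = sum(torch.prod(torch.stack(c)) for c in itertools.combinations(z, k))
    assert torch.allclose(e[k], brute)
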
    def forward(self, x1, x2, diag=False, last_dim_is_batch=False, **params):
        if last_dim_is_batch and not self.interpolation_mode:
            raise ValueError("last_dim_is_batch is only valid with interpolation mode")

        grid = self.grid
        if self.is_ragged:
            # Pad the grid - so that grid is the same size for each dimension
            max_grid_size = max(proj.size(-1) for proj in grid)
            padded_grid = []
            for proj in grid:
                padding_size = max_grid_size - proj.size(-1)
                if padding_size > 0:
                    dtype = proj.dtype
                    device = proj.device
                    padded_grid.append(
                        torch.cat(
                            [proj, torch.zeros(*proj.shape[:-1], padding_size, dtype=dtype, device=device)], dim=-1
                        )
                    )
                else:
                    padded_grid.append(proj)
        else:
            padded_grid = grid

        if not self.interpolation_mode:
            if len(x1.shape[:-2]):
                full_grid = self.full_grid.expand(*x1.shape[:-2], *self.full_grid.shape[-2:])
            else:
                full_grid = self.full_grid

        if self.interpolation_mode or (torch.equal(x1, full_grid) and torch.equal(x2, full_grid)):
            if not self.training and hasattr(self, "_cached_kernel_mat"):
                return self._cached_kernel_mat
            # Can exploit Toeplitz structure if grid points in each dimension are equally
            # spaced and using a translation-invariant kernel
            if settings.use_toeplitz.on():
                # Use padded grid for batch mode
                first_grid_point = torch.stack([proj[0].unsqueeze(0) for proj in grid], dim=-1)
                full_grid = torch.stack(padded_grid, dim=-1)
                covars = delazify(self.base_kernel(first_grid_point, full_grid, last_dim_is_batch=True, **params))

                if last_dim_is_batch:
                    # Toeplitz expects batches of columns so we concatenate the
                    # 1 x grid_size[i] tensors together
                    # Note that this requires all the dimensions to have the same number of grid points
                    covar = ToeplitzLazyTensor(covars.squeeze(-2))
                else:
                    # Non-batched ToeplitzLazyTensor expects a 1D tensor, so we squeeze out the row dimension
                    covars = covars.squeeze(-2)  # Get rid of the dimension corresponding to the first point
                    # Un-pad the grid
                    covars = [ToeplitzLazyTensor(covars[..., i, : proj.size(-1)]) for i, proj in enumerate(grid)]
                    # Due to legacy reasons, KroneckerProductLazyTensor(A, B, C) is actually (C Kron B Kron A)
                    covar = KroneckerProductLazyTensor(*covars[::-1])
            else:
                full_grid = torch.stack(padded_grid, dim=-1)
                covars = delazify(self.base_kernel(full_grid, full_grid, last_dim_is_batch=True, **params))
                if last_dim_is_batch:
                    # Note that this requires all the dimensions to have the same number of grid points
                    covar = covars
                else:
                    covars = [covars[..., i, : proj.size(-1), : proj.size(-1)] for i, proj in enumerate(self.grid)]
                    covar = KroneckerProductLazyTensor(*covars[::-1])

            if not self.training:
                self._cached_kernel_mat = covar

            return covar
        else:
            return self.base_kernel.forward(x1, x2, diag=diag, last_dim_is_batch=last_dim_is_batch, **params)
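
The structure exploited in the grid branch above can be seen with plain torch: a stationary kernel on an equally spaced 1-D grid produces a Toeplitz matrix (so only its first column is needed), and the kernel over the full product grid factors as a Kronecker product of per-dimension kernels. A minimal sketch, assuming an RBF kernel defined inline:

import torch

def rbf(a, b):
    return torch.exp(-0.5 * (a.unsqueeze(-1) - b.unsqueeze(-2)).pow(2))

g1 = torch.linspace(0.0, 1.0, 4)     # equally spaced grid for dimension 1
g2 = torch.linspace(0.0, 1.0, 3)     # equally spaced grid for dimension 2

k1, k2 = rbf(g1, g1), rbf(g2, g2)

# Toeplitz structure: every diagonal of k1 is constant, so k1 is determined
# by its first column (the quantity a ToeplitzLazyTensor stores)
assert torch.allclose(k1[1:, 1:], k1[:-1, :-1])

# Kronecker structure: the RBF kernel factors across dimensions, so the kernel
# over the 2-D product grid equals k2 kron k1 (dimension 2 varies slowest here)
grid = torch.cartesian_prod(g2, g1)                    # shape (12, 2)
full = torch.exp(-0.5 * torch.cdist(grid, grid).pow(2))
assert torch.allclose(full, torch.kron(k2, k1), atol=1e-6)
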
    def _cholesky_factor(self, induc_induc_covar):
        L = psd_safe_cholesky(delazify(induc_induc_covar).double())
        return L
Example #9
    def exact_predictive_covar(self, test_test_covar, test_train_covar):
        """
        Computes the posterior predictive covariance of a GP

        Args:
            test_train_covar (:obj:`gpytorch.lazy.LazyTensor`): Covariance matrix between test and train inputs
            test_test_covar (:obj:`gpytorch.lazy.LazyTensor`): Covariance matrix between test inputs

        Returns:
            :obj:`gpytorch.lazy.LazyTensor`: A LazyTensor representing the predictive posterior covariance of the
                                               test points
        """
        if settings.fast_pred_var.on():
            self._last_test_train_covar = test_train_covar

        if settings.skip_posterior_variances.on():
            return ZeroLazyTensor(*test_test_covar.size())

        if settings.fast_pred_var.off():
            dist = self.train_prior_dist.__class__(
                torch.zeros_like(self.train_prior_dist.mean),
                self.train_prior_dist.lazy_covariance_matrix)
            if settings.detach_test_caches.on():
                train_train_covar = self.likelihood(
                    dist, self.train_inputs).lazy_covariance_matrix.detach()
            else:
                train_train_covar = self.likelihood(
                    dist, self.train_inputs).lazy_covariance_matrix

            test_train_covar = delazify(test_train_covar)
            train_test_covar = test_train_covar.transpose(-1, -2)
            covar_correction_rhs = train_train_covar.inv_matmul(
                train_test_covar)
            # For efficiency
            if torch.is_tensor(test_test_covar):
                # We can use addmm in the 2d case
                if test_test_covar.dim() == 2:
                    return lazify(
                        torch.addmm(test_test_covar,
                                    test_train_covar,
                                    covar_correction_rhs,
                                    beta=1,
                                    alpha=-1))
                else:
                    return lazify(
                        test_test_covar +
                        test_train_covar @ covar_correction_rhs.mul(-1))
            # In other cases - we'll use the standard infrastructure
            else:
                return test_test_covar + MatmulLazyTensor(
                    test_train_covar, covar_correction_rhs.mul(-1))

        precomputed_cache = self.covar_cache
        covar_inv_quad_form_root = self._exact_predictive_covar_inv_quad_form_root(
            precomputed_cache, test_train_covar)
        if torch.is_tensor(test_test_covar):
            return lazify(
                torch.add(test_test_covar,
                          covar_inv_quad_form_root
                          @ covar_inv_quad_form_root.transpose(-1, -2),
                          alpha=-1))
        else:
            return test_test_covar + MatmulLazyTensor(
                covar_inv_quad_form_root,
                covar_inv_quad_form_root.transpose(-1, -2).mul(-1))
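
For reference, the branch taken when fast predictive variances are off computes the standard GP posterior covariance K_** - K_*X (K_XX + noise I)^{-1} K_X*. A dense plain-torch sketch of that formula; the kernel, noise level, and shapes are illustrative assumptions.

import torch

def rbf(a, b):
    return torch.exp(-0.5 * torch.cdist(a, b).pow(2))

train_x = torch.randn(20, 2)
test_x = torch.randn(5, 2)
noise = 0.1

train_train = rbf(train_x, train_x) + noise * torch.eye(20)   # likelihood-adjusted K_XX
test_train = rbf(test_x, train_x)                              # K_*X
test_test = rbf(test_x, test_x)                                # K_**

# covar_correction_rhs = (K_XX + noise I)^{-1} K_X*
chol = torch.linalg.cholesky(train_train)
covar_correction_rhs = torch.cholesky_solve(test_train.transpose(-1, -2), chol)

# Same quantity the addmm call builds: K_** - K_*X (K_XX + noise I)^{-1} K_X*
predictive_covar = torch.addmm(test_test, test_train, covar_correction_rhs, beta=1, alpha=-1)
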
Example #10
    def covar_cache(self):
        train_train_covar = self.lik_train_train_covar
        train_train_covar_inv_root = delazify(
            train_train_covar.root_inv_decomposition().root)
        return self._exact_predictive_covar_inv_quad_form_cache(
            train_train_covar_inv_root, self._last_test_train_covar)
Example #11
    def get_fantasy_strategy(self, inputs, targets, full_inputs, full_targets,
                             full_output, **kwargs):
        """
        Returns a new PredictionStrategy that incorporates the specified inputs and targets as new training data.

        This method is primarily responsible for updating the mean and covariance caches. To add fantasy data to a
        GP model, use the :meth:`~gpytorch.models.ExactGP.get_fantasy_model` method.

        Args:
            - :attr:`inputs` (Tensor `b1 x ... x bk x m x d` or `f x b1 x ... x bk x m x d`): Locations of fantasy
                observations.
            - :attr:`targets` (Tensor `b1 x ... x bk x m` or `f x b1 x ... x bk x m`): Labels of fantasy observations.
            - :attr:`full_inputs` (Tensor `b1 x ... x bk x n+m x d` or `f x b1 x ... x bk x n+m x d`): Training data
                concatenated with fantasy inputs
            - :attr:`full_targets` (Tensor `b1 x ... x bk x n+m` or `f x b1 x ... x bk x n+m`): Training labels
                concatenated with fantasy labels.
            - :attr:`full_output` (:class:`gpytorch.distributions.MultivariateNormal`): Prior called on full_inputs

        Returns:
            - :class:`DefaultPredictionStrategy`
                A `DefaultPredictionStrategy` model with `n + m` training examples, where the `m` fantasy examples have
                been added and all test-time caches have been updated.
        """
        full_mean, full_covar = full_output.mean, full_output.lazy_covariance_matrix

        batch_shape = full_inputs[0].shape[:-2]

        full_mean = full_mean.view(*batch_shape, -1)
        num_train = self.num_train

        # Evaluate fant x train and fant x fant covariance matrices, leave train x train unevaluated.
        fant_fant_covar = full_covar[..., num_train:, num_train:]
        fant_mean = full_mean[..., num_train:]
        mvn = self.train_prior_dist.__class__(fant_mean, fant_fant_covar)
        fant_likelihood = self.likelihood.get_fantasy_likelihood(**kwargs)
        mvn_obs = fant_likelihood(mvn, inputs, **kwargs)

        fant_fant_covar = mvn_obs.covariance_matrix
        fant_train_covar = delazify(full_covar[..., num_train:, :num_train])

        self.fantasy_inputs = inputs
        self.fantasy_targets = targets
        r"""
        Compute a new mean cache given the old mean cache.

        We have \alpha = K^{-1}y, and we want to solve [K U; U' S][a; b] = [y; y_f], where U' is fant_train_covar,
        S is fant_fant_covar, and y_f is (targets - fant_mean)

        To do this, we solve the bordered linear system of equations for [a; b]:
            KQ = U  # Q = fant_solve
            [S - U'Q]b = y_f - U'\alpha   ==> b = [S - U'Q]^{-1}(y_f - U'\alpha)
            a = \alpha - Qb
        """
        # Get cached K inverse decomp. (or compute if we somehow don't already have the covariance cache)
        K_inverse = self.lik_train_train_covar.root_inv_decomposition()
        fant_solve = K_inverse.matmul(fant_train_covar.transpose(-2, -1))

        # Solve for "b", the lower portion of the *new* \\alpha corresponding to the fantasy points.
        schur_complement = fant_fant_covar - fant_train_covar.matmul(
            fant_solve)

        # we'd like to use a less hacky approach for the following, but einsum can be much faster
        # than unsqueezing/squeezing here (esp. in backward passes); unfortunately it currently has some
        # issues with broadcasting: https://github.com/pytorch/pytorch/issues/15671
        prefix = string.ascii_lowercase[:max(
            fant_train_covar.dim() - self.mean_cache.dim() - 1, 0)]
        ftcm = torch.einsum(prefix + "...yz,...z->" + prefix + "...y",
                            [fant_train_covar, self.mean_cache])

        small_system_rhs = targets - fant_mean - ftcm
        small_system_rhs = small_system_rhs.unsqueeze(-1)
        # Schur complement of a spd matrix is guaranteed to be positive definite
        schur_cholesky = psd_safe_cholesky(
            schur_complement, jitter=settings.cholesky_jitter.value())
        fant_cache_lower = torch.cholesky_solve(small_system_rhs,
                                                schur_cholesky)

        # Get "a", the new upper portion of the cache corresponding to the old training points.
        fant_cache_upper = self.mean_cache.unsqueeze(-1) - fant_solve.matmul(
            fant_cache_lower)

        fant_cache_upper = fant_cache_upper.squeeze(-1)
        fant_cache_lower = fant_cache_lower.squeeze(-1)

        # New mean cache.
        fant_mean_cache = torch.cat((fant_cache_upper, fant_cache_lower),
                                    dim=-1)
        """
        Compute a new covariance cache given the old covariance cache.

        We have access to K \\approx LL' and K^{-1} \\approx R^{-1}R^{-T}, where L and R are low rank matrices
        resulting from Lanczos (see the LOVE paper).

        To update R^{-1}, we first update L:
            [K U; U' S] = [L 0; A B][L' A'; 0 B']
        Solving this matrix equation, we get:
            K = LL' ==>       L = L
            U = LA' ==>       A = UR^{-1}
            S = AA' + BB' ==> B = cholesky(S - AA')

        Once we've computed Z = [L 0; A B], we have that the new kernel matrix [K U; U' S] \\approx ZZ'. Therefore,
        we can form a pseudo-inverse of Z directly to approximate [K U; U' S]^{-1/2}.
        """
        # [K U; U' S] = [L 0; lower_left schur_root]
        batch_shape = fant_train_covar.shape[:-2]

        L_inverse = self.covar_cache
        L = self.lik_train_train_covar.root_decomposition().root
        m, n = L.shape[-2:]

        lower_left = fant_train_covar.matmul(L_inverse)
        schur = fant_fant_covar - lower_left.matmul(
            lower_left.transpose(-2, -1))
        schur_root = psd_safe_cholesky(schur,
                                       jitter=settings.cholesky_jitter.value())

        # Form new root Z = [L 0; lower_left schur_root]

        # # TODO: Special case triangular case once #1102 goes in
        # if isinstance(L, TriangularLazyTensor):
        #     # The whole thing is triangular, we can just do two triangular solves
        #     ...
        # else:

        L = delazify(L)
        num_fant = schur_root.size(-2)
        new_root = torch.zeros(*batch_shape,
                               m + num_fant,
                               n + num_fant,
                               device=L.device,
                               dtype=L.dtype)
        new_root[..., :m, :n] = L
        new_root[..., m:, :n] = lower_left
        new_root[..., m:, n:] = schur_root

        # Use pseudo-inverse of Z as new inv root

        if new_root.shape[-1] <= 2048:
            # Dispatch to CPU so long as pytorch/pytorch#22573 is not fixed
            device = new_root.device
            Q, R = torch.qr(new_root.cpu())
            Q = Q.to(device)
            R = R.to(device)
        else:
            Q, R = torch.qr(new_root)

        Rdiag = torch.diagonal(R, dim1=-2, dim2=-1)
        # if R is almost singular, add jitter
        zeroish = Rdiag.abs() < 1e-6
        if torch.any(zeroish):
            # can't use in-place operation here b/c it would mess up backward pass
            # haven't found a more elegant way to add a jitter diagonal yet...
            jitter_diag = 1e-6 * torch.sign(Rdiag) * zeroish.to(Rdiag)
            R = R + torch.diag_embed(jitter_diag)
        new_covar_cache = torch.triangular_solve(Q.transpose(-2, -1),
                                                 R)[0].transpose(-2, -1)

        # Expand inputs accordingly if necessary (for fantasies at the same points)
        if full_inputs[0].dim() <= full_targets.dim():
            fant_batch_shape = full_targets.shape[:1]
            n_batch = len(full_mean.shape[:-1])
            repeat_shape = fant_batch_shape + torch.Size([1] * n_batch)
            full_inputs = [
                fi.expand(fant_batch_shape + fi.shape) for fi in full_inputs
            ]
            full_mean = full_mean.expand(fant_batch_shape + full_mean.shape)
            full_covar = BatchRepeatLazyTensor(full_covar, repeat_shape)
            new_root = BatchRepeatLazyTensor(NonLazyTensor(new_root),
                                             repeat_shape)
            # no need to repeat the covar cache, broadcasting will do the right thing

        # Create new DefaultPredictionStrategy object
        fant_strat = self.__class__(
            train_inputs=full_inputs,
            train_prior_dist=self.train_prior_dist.__class__(
                full_mean, full_covar),
            train_labels=full_targets,
            likelihood=fant_likelihood,
            root=new_root,
            inv_root=new_covar_cache,
        )
        add_to_cache(fant_strat, "mean_cache", fant_mean_cache)
        add_to_cache(fant_strat, "covar_cache", new_covar_cache)
        return fant_strat
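
The mean-cache update above is a block (Schur-complement) solve. The self-contained check below runs the three-step recipe from the comment (Q = K^{-1} U, b from the Schur complement, a = alpha - Qb) on small random matrices and compares it with a direct solve of the bordered system; every name here is illustrative.

import torch

torch.manual_seed(0)
n, m = 8, 3                                   # n training points, m fantasy points

A = torch.randn(n + m, n + m)
full = A @ A.transpose(-1, -2) + (n + m) * torch.eye(n + m)   # SPD joint covariance
K, U, S = full[:n, :n], full[:n, n:], full[n:, n:]

y = torch.randn(n)
y_f = torch.randn(m)
alpha = torch.linalg.solve(K, y)              # old cache: alpha = K^{-1} y

# Bordered-system recipe from the comment above
Q = torch.linalg.solve(K, U)                  # KQ = U  (Q = fant_solve)
schur = S - U.transpose(-1, -2) @ Q           # Schur complement S - U'Q
b = torch.linalg.solve(schur, y_f - U.transpose(-1, -2) @ alpha)
a = alpha - Q @ b

# Direct solve of [K U; U' S][a; b] = [y; y_f] for comparison
direct = torch.linalg.solve(full, torch.cat([y, y_f]))
assert torch.allclose(torch.cat([a, b]), direct, atol=1e-4)
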
Example #12
    def _cholesky_factor(self, induc_induc_covar):
        # Maybe used - if we're not using CG
        L = psd_safe_cholesky(delazify(induc_induc_covar))
        return L
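
For context, psd_safe_cholesky is a GPyTorch utility that retries the factorization with growing diagonal jitter when a plain Cholesky fails on a nearly-singular matrix. A rough sketch of that idea in plain torch (not the library's actual implementation):

import torch

def cholesky_with_jitter(mat, max_tries=3, initial_jitter=1e-6):
    # Illustrative only: retry Cholesky, adding increasing diagonal jitter on failure
    eye = torch.eye(mat.size(-1), dtype=mat.dtype, device=mat.device)
    jitter = 0.0
    for i in range(max_tries + 1):
        try:
            return torch.linalg.cholesky(mat + jitter * eye)
        except RuntimeError:
            jitter = initial_jitter * (10 ** i)
    raise RuntimeError("matrix is not positive definite, even with jitter added")
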