Example #1
    def predict_f(self,
                  Xnew: InputData,
                  full_cov: bool = False,
                  full_output_cov: bool = False) -> MeanAndVariance:
        """
        Compute the mean and variance of the latent function at some new points.
        Note that this is very similar to the SGPR prediction, for which
        there are notes in the SGPR notebook.

        Note: This model does not allow full output covariances.

        :param Xnew: points at which to predict
        """
        if full_output_cov:
            raise NotImplementedError

        pX = DiagonalGaussian(self.X_data_mean, self.X_data_var)

        Y_data = self.data
        num_inducing = self.inducing_variable.num_inducing
        psi1 = expectation(pX, (self.kernel, self.inducing_variable))
        psi2 = tf.reduce_sum(
            expectation(pX, (self.kernel, self.inducing_variable),
                        (self.kernel, self.inducing_variable)),
            axis=0,
        )
        jitter = default_jitter()
        Kus = covariances.Kuf(self.inducing_variable, self.kernel, Xnew)
        sigma2 = self.likelihood.variance
        sigma = tf.sqrt(sigma2)
        L = tf.linalg.cholesky(
            covariances.Kuu(self.inducing_variable, self.kernel,
                            jitter=jitter))

        A = tf.linalg.triangular_solve(L, tf.transpose(psi1),
                                       lower=True) / sigma
        tmp = tf.linalg.triangular_solve(L, psi2, lower=True)
        AAT = tf.linalg.triangular_solve(L, tf.transpose(tmp),
                                         lower=True) / sigma2
        B = AAT + tf.eye(num_inducing, dtype=default_float())
        LB = tf.linalg.cholesky(B)
        c = tf.linalg.triangular_solve(
            LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma
        tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)
        tmp2 = tf.linalg.triangular_solve(LB, tmp1, lower=True)
        mean = tf.linalg.matmul(tmp2, c, transpose_a=True)
        if full_cov:
            var = (self.kernel(Xnew) +
                   tf.linalg.matmul(tmp2, tmp2, transpose_a=True) -
                   tf.linalg.matmul(tmp1, tmp1, transpose_a=True))
            shape = tf.stack([1, 1, tf.shape(Y_data)[1]])
            var = tf.tile(tf.expand_dims(var, 2), shape)
        else:
            var = (self.kernel(Xnew, full_cov=False) +
                   tf.reduce_sum(tf.square(tmp2), axis=0) -
                   tf.reduce_sum(tf.square(tmp1), axis=0))
            shape = tf.stack([1, tf.shape(Y_data)[1]])
            var = tf.tile(tf.expand_dims(var, 1), shape)
        return mean + self.mean_function(Xnew), var
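A minimal call sketch for this predict_f, assuming a stock gpflow.models.BayesianGPLVM built on random toy data; all sizes and hyperparameters below are illustrative and the model is left untrained:

import numpy as np
import gpflow

Y = np.random.randn(40, 6)                     # 40 observations, 6 output dimensions
Q = 2                                          # latent dimensionality
model = gpflow.models.BayesianGPLVM(
    Y,
    X_data_mean=0.1 * np.random.randn(40, Q),  # variational means of the latents
    X_data_var=np.ones((40, Q)),               # variational (diagonal) variances
    kernel=gpflow.kernels.SquaredExponential(lengthscales=np.ones(Q)),
    num_inducing_variables=16,
)

Xnew = np.random.randn(5, Q)                   # new latent locations
mean, var = model.predict_f(Xnew)              # mean: [5, 6], var: [5, 6]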
Example #2
 def build_cholesky_if_needed(self):
     # # make sure we only compute this once
     # if self.needs_build_cholesky:
     self.Ku = covs.Kuu(self.feature,
                        self.kern,
                        jitter=gpflow.default_jitter())
     self.Lu = tf.linalg.cholesky(self.Ku)
     self.Ku_tiled = tf.tile(self.Ku[None, :, :], [self.num_outputs, 1, 1])
     self.Lu_tiled = tf.tile(self.Lu[None, :, :], [self.num_outputs, 1, 1])
Example #3
    def __call__(self, Xnew, full_cov=False, full_output_cov=False):
        q_mu = self.q_mu  # M x K x O
        q_sqrt = self.q_sqrt  # K x O x M x M

        Kuu = covariances.Kuu(self.inducing_variables,
                              self.kernel,
                              jitter=default_jitter())  # K x M x M
        Kuf = covariances.Kuf(self.inducing_variables, self.kernel,
                              Xnew)  # K x M x N
        Knn = self.kernel.K(Xnew, full_output_cov=False)
Example #4
    def compute_qu(self, full_cov: bool = True) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Computes the mean and variance of q(u) = N(mu, cov), the variational distribution on
        inducing outputs. SVGP with this q(u) should predict identically to
        SGPR.
        The derivation is as follows:
        q(u) = N(u | m, S)
        with:
        S^{-1} = Kuu^{-1} + beta * Kuu^{-1} Kuf Kfu Kuu^{-1}
        m = beta * S Kuu^{-1} Kuf y

        where beta = sigma^{-2}
        
        :return: mu, cov
        """

        Y_data = self.data

        X_data_mean, X_data_var = self.encoder(Y_data)

        pX = DiagonalGaussian(X_data_mean, X_data_var)

        # num_inducing = self.inducing_variable.num_inducing

        #E_qx[Kfu]
        psi1 = expectation(pX, (self.kernel, self.inducing_variable))
        #E_qx[Kuf@Kfu]
        psi2 = tf.reduce_sum(
            expectation(pX, (self.kernel, self.inducing_variable),
                        (self.kernel, self.inducing_variable)),
            axis=0)

        kuu = covariances.Kuu(self.inducing_variable,
                              self.kernel,
                              jitter=default_jitter())
        kuf = tf.transpose(psi1)

        sig = kuu + psi2 * (self.likelihood.variance**-1)
        sig_sqrt = tf.linalg.cholesky(sig)

        sig_sqrt_kuu = tf.linalg.triangular_solve(sig_sqrt, kuu)
        # [M,M] -> [M(M +1)//2] =/= [M,D]

        cov = tf.linalg.matmul(sig_sqrt_kuu, sig_sqrt_kuu, transpose_a=True)

        err = Y_data - self.mean_function(X_data_mean)

        mu = (tf.linalg.matmul(sig_sqrt_kuu,
                               tf.linalg.triangular_solve(
                                   sig_sqrt, tf.linalg.matmul(kuf, err)),
                               transpose_a=True) / self.likelihood.variance)
        if not full_cov:
            return mu, cov
        else:
            return mu, tf.tile(cov[None, :, :], [mu.shape[-1], 1, 1])
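As a sanity check on the formulas in the docstring, the same q(u) can be written out with dense NumPy linear algebra; the toy data and hand-rolled RBF kernel below are purely illustrative:

import numpy as np

rng = np.random.default_rng(0)
M, N = 5, 20                                   # inducing points, data points
X = rng.normal(size=(N, 1))
Z = rng.normal(size=(M, 1))
y = rng.normal(size=(N, 1))
beta = 1.0 / 0.1                               # beta = sigma^{-2}

def rbf(A, B):
    d = A[:, None, :] - B[None, :, :]
    return np.exp(-0.5 * np.sum(d ** 2, axis=-1))

Kuu = rbf(Z, Z) + 1e-6 * np.eye(M)
Kuf = rbf(Z, X)

Sigma_inv = np.linalg.inv(Kuu + beta * Kuf @ Kuf.T)
cov = Kuu @ Sigma_inv @ Kuu                    # S = Kuu (Kuu + beta Kuf Kfu)^{-1} Kuu
mu = beta * Kuu @ Sigma_inv @ Kuf @ y          # m = beta S Kuu^{-1} Kuf y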
Example #5
def _test_cg_svgp(config: ConfigDense,
                  model: SVGP,
                  Xnew: tf.Tensor) -> tf.Tensor:
  """
  Sample generation subroutine common to each unit test
  """
  # Prepare preconditioner for CG
  Z = model.inducing_variable
  Kff = covariances.Kuu(Z, model.kernel, jitter=0)
  max_rank = config.num_cond//(2 if config.num_cond > 1 else 1)
  preconditioner = get_default_preconditioner(Kff,
                                              diag=default_jitter(),
                                              max_rank=max_rank)

  count = 0
  samples = []
  L_joint = None
  while count < config.num_samples:
    # Sample $u ~ N(q_mu, q_sqrt q_sqrt^{T})$
    size = min(config.shard_size, config.num_samples - count)
    shape = model.num_latent_gps, config.num_cond, size
    rvs = tf.random.normal(shape=shape, dtype=floatx())
    u = tf.transpose(model.q_sqrt @ rvs)

    # Generate draws from the joint distribution $p(f(X), g(Z))$
    (f, fnew), L_joint = common.sample_joint(model.kernel,
                                             Z,
                                             Xnew,
                                             num_samples=size,
                                             L=L_joint)

    # Solve for update functions
    update_fns = cg_update(model.kernel,
                           Z,
                           u,
                           f,
                           tol=1e-6,
                           max_iter=config.num_cond,
                           preconditioner=preconditioner)

    samples.append(fnew + update_fns(Xnew))
    count += size

  samples = tf.concat(samples, axis=0)
  if model.mean_function is not None:
    samples += model.mean_function(Xnew)
  return samples
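The loop above performs pathwise (Matheron-style) sampling: a joint prior draw (f, fnew) is corrected by an update of the form K(Xnew, Z) K(Z, Z)^{-1} (u - f(Z)), where cg_update is presumably solving the K(Z, Z) system with conjugate gradients. A dense NumPy sketch of the exact update it approximates, with purely illustrative draws:

import numpy as np

def rbf(A, B):
    d = A[:, None, :] - B[None, :, :]
    return np.exp(-0.5 * np.sum(d ** 2, axis=-1))

rng = np.random.default_rng(0)
M, Nnew = 8, 15
Z = rng.normal(size=(M, 1))                    # inducing inputs
Xnew = rng.normal(size=(Nnew, 1))

u = rng.normal(size=(M, 1))                    # draw from q(u) (illustrative values)
f_at_Z = rng.normal(size=(M, 1))               # prior draw evaluated at Z
f_at_Xnew = rng.normal(size=(Nnew, 1))         # the same prior draw evaluated at Xnew

Kzz = rbf(Z, Z) + 1e-6 * np.eye(M)
Ksz = rbf(Xnew, Z)
update = Ksz @ np.linalg.solve(Kzz, u - f_at_Z)   # exact version of the CG solve
sample = f_at_Xnew + update                       # pathwise sample at Xnew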
Example #6
    def _precompute(self):
        Kuu = cov.Kuu(self.inducing_variable, self.kernel)  # this is now a LinearOperator

        q_mu = self._q_dist.q_mu
        q_sqrt = self._q_dist.q_sqrt

        if self.whiten:
            raise NotImplementedError
        else:
            # alpha = Kuu⁻¹ q_mu
            alpha = Kuu.solve(q_mu)  # type: tf.Tensor

        if self.whiten:
            raise NotImplementedError
        else:
            # Qinv = Kuu⁻¹ - Kuu⁻¹ S Kuu⁻¹
            KuuInv_qsqrt = Kuu.solve(q_sqrt)
            KuuInv_covu_KuuInv = tf.matmul(KuuInv_qsqrt, KuuInv_qsqrt, transpose_b=True)

        Qinv = Kuu.inverse().to_dense() - KuuInv_covu_KuuInv

        return alpha, Qinv
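The cached quantities are exactly what the conditional needs at prediction time: the posterior mean at test points is Kfu @ alpha and the marginal variance is Knn - diag(Kfu @ Qinv @ Kuf). A small sketch of that downstream use on dense tensors; the shapes and names are illustrative:

import tensorflow as tf

def predict_from_cache(Kfu, Knn_diag, alpha, Qinv):
    # Kfu: [N, M], Knn_diag: [N], alpha: [M, L], Qinv: [M, M]
    mean = tf.matmul(Kfu, alpha)                               # [N, L]
    var = Knn_diag - tf.einsum("nm,mk,nk->n", Kfu, Qinv, Kfu)  # [N]
    return mean, var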
Example #7
    def custom_predict_f(self,
                         Xnew: InputData,
                         full_cov: bool = False,
                         full_output_cov: bool = False) -> MeanAndVariance:
        """
        Compute the mean and variance of the latent function at some new points.
        Note that this is very similar to the SGPR prediction, for which
        there are notes in the SGPR notebook.

        Note: This model does not allow full output covariances.

        :param Xnew: points at which to predict
        """
        if full_output_cov:
            raise NotImplementedError

        Y_data = self.data

        X_data_mean, X_data_var = self.encoder(Y_data)

        pX = DiagonalGaussian(X_data_mean, X_data_var)

        mu, cov = self.compute_qu()

        jitter = default_jitter()
        Kus = covariances.Kuf(self.inducing_variable, self.kernel, Xnew)
        L = tf.linalg.cholesky(
            covariances.Kuu(self.inducing_variable, self.kernel,
                            jitter=jitter))

        var = cov

        tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)  #L^{-1} K_{us}
        tmp2 = tf.linalg.triangular_solve(L, mu, lower=True)  # L^{-1} m

        mean = tf.linalg.matmul(
            tmp1, tmp2, transpose_a=True
        )  # K_{su} L^{-T} L^{-1} m = K_{su} K_{uu}^{-1} m
        return mean + self.mean_function(Xnew), var
Example #8
def uncertain_conditional_diag(
    Xnew_mu: tf.Tensor,
    Xnew_var: tf.Tensor,
    inducing_variable: InducingVariables,
    kernel: Kernel,
    q_mu,
    q_sqrt,
    *,
    mean_function=None,
    full_output_cov=False,
    full_cov=False,
    white=False,
):
    """
    Calculates the conditional for uncertain inputs Xnew, p(Xnew) = N(Xnew_mu, Xnew_var).
    See ``conditional`` documentation for further reference.
    :param Xnew_mu: mean of the inputs, size [N, Din]
    :param Xnew_var: diagonal covariance of the inputs, size [N, Din]
    :param inducing_variable: gpflow.InducingVariables object, only InducingPoints is supported
    :param kernel: gpflow kernel object.
    :param q_mu: mean of the inducing outputs, size [M, Dout]
    :param q_sqrt: cholesky of the covariance matrix of the inducing outputs, size [Dout, M, M]
    :param full_output_cov: boolean whether to compute the covariance between output dimensions.
                            Influences the shape of return value ``fvar``. Default is False
    :param white: boolean whether to use whitened representation. Default is False.
    :return fmean, fvar: mean and covariance of the conditional, size ``fmean`` is [N, Dout],
            size ``fvar`` depends on ``full_output_cov``: if True ``fvar`` is [N, Dout, Dout],
            if False then ``fvar`` is [N, Dout]
    """

    if not isinstance(inducing_variable, InducingPoints):
        raise NotImplementedError

    if full_cov:
        raise NotImplementedError(
            "uncertain_conditional() currently does not support full_cov=True")

    # pX = DiagonalGaussian(self.X_data_mean, self.X_data_var)

    # Y_data = self.data
    # mu, cov = self.compute_qu()

    # jitter = default_jitter()
    # Kus = covariances.Kuf(self.inducing_variable, self.kernel, Xnew)
    # L = tf.linalg.cholesky(covariances.Kuu(self.inducing_variable, self.kernel, jitter=jitter))

    # var = cov

    # tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True) #L^{-1} K_{us}
    # tmp2 = tf.linalg.triangular_solve(L, mu, lower=True)  # L^{-1} m

    # mean = tf.linalg.matmul(tmp1, tmp2, transpose_a=True) #K_{su} L^{-T} L^{-1} m = K_{su} K_{uu}^{-1} m #ook
    # return mean + self.mean_function(Xnew), var

    pXnew = DiagonalGaussian(Xnew_mu, Xnew_var)

    num_data = tf.shape(Xnew_mu)[0]  # number of new inputs (N)
    num_ind, num_func = tf.unstack(
        tf.shape(q_mu), num=2,
        axis=0)  # number of inducing points (M), output dimension (D)
    q_sqrt_r = tf.linalg.band_part(
        q_sqrt, -1, 0)  # [D, M, M] #taking the lower triangular part

    eKuf = tf.transpose(expectation(
        pXnew, (kernel, inducing_variable)))  # [M, N] (psi1)
    Kuu = covariances.Kuu(inducing_variable, kernel,
                          jitter=default_jitter())  # [M, M]
    Luu = tf.linalg.cholesky(Kuu)  # [M, M]

    if not white:
        q_mu = tf.linalg.triangular_solve(Luu, q_mu, lower=True)
        Luu_tiled = tf.tile(
            Luu[None, :, :],
            [num_func, 1, 1])  # remove line once issue 216 is fixed
        q_sqrt_r = tf.linalg.triangular_solve(Luu_tiled, q_sqrt_r, lower=True)

    Li_eKuf = tf.linalg.triangular_solve(Luu, eKuf, lower=True)  # [M, N]
    fmean = tf.linalg.matmul(Li_eKuf, q_mu, transpose_a=True)

    eKff = expectation(pXnew, kernel)  # N (psi0)
    eKuffu = expectation(pXnew, (kernel, inducing_variable),
                         (kernel, inducing_variable))  # [N, M, M] (psi2)
    Luu_tiled = tf.tile(
        Luu[None, :, :],
        [num_data, 1, 1])  # remove this line, once issue 216 is fixed
    Li_eKuffu = tf.linalg.triangular_solve(Luu_tiled, eKuffu, lower=True)
    Li_eKuffu_Lit = tf.linalg.triangular_solve(Luu_tiled,
                                               tf.linalg.adjoint(Li_eKuffu),
                                               lower=True)  # [N, M, M]
    cov = tf.linalg.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True)  # [D, M, M]

    if mean_function is None or isinstance(mean_function, mean_functions.Zero):
        e_related_to_mean = tf.zeros((num_data, num_func, num_func),
                                     dtype=default_float())
    else:
        # Update mean: \mu(x) + m(x)
        fmean = fmean + expectation(pXnew, mean_function)

        # Calculate: m(x) m(x)^T + m(x) \mu(x)^T + \mu(x) m(x)^T,
        # where m(x) is the mean_function and \mu(x) is fmean
        e_mean_mean = expectation(pXnew, mean_function,
                                  mean_function)  # [N, D, D]
        Lit_q_mu = tf.linalg.triangular_solve(Luu, q_mu, adjoint=True)
        e_mean_Kuf = expectation(pXnew, mean_function,
                                 (kernel, inducing_variable))  # [N, D, M]
        # einsum isn't able to infer the rank of e_mean_Kuf, hence we explicitly set the rank of the tensor:
        e_mean_Kuf = tf.reshape(e_mean_Kuf, [num_data, num_func, num_ind])
        e_fmean_mean = tf.einsum("nqm,mz->nqz", e_mean_Kuf,
                                 Lit_q_mu)  # [N, D, D]
        e_related_to_mean = e_fmean_mean + tf.linalg.adjoint(
            e_fmean_mean) + e_mean_mean

    if full_output_cov:
        fvar = (
            tf.linalg.diag(
                tf.tile((eKff - tf.linalg.trace(Li_eKuffu_Lit))[:, None],
                        [1, num_func])) +
            tf.linalg.diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)) +
            # tf.linalg.diag(tf.linalg.trace(tf.linalg.matmul(Li_eKuffu_Lit, cov))) +
            tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu) -
            # tf.linalg.matmul(q_mu, tf.linalg.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            fmean[:, :, None] * fmean[:, None, :] + e_related_to_mean)
    else:
        fvar = (
            (eKff - tf.linalg.trace(Li_eKuffu_Lit))[:, None] +
            tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)
            # tf.linalg.diag(tf.linalg.trace(tf.linalg.matmul(Li_eKuffu_Lit, cov))) +
            + tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu)

            # tf.linalg.matmul(q_mu, tf.linalg.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            - fmean**2 + tf.linalg.diag_part(e_related_to_mean))

    return fmean, fvar
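A hypothetical call sketch for uncertain_conditional_diag, assuming the module-level imports used above and a kernel with analytic expectations such as SquaredExponential; all shapes and values are illustrative:

import numpy as np
import gpflow
from gpflow.inducing_variables import InducingPoints

N, Din, Dout, M = 10, 2, 3, 5
Xnew_mu = np.random.randn(N, Din)
Xnew_var = np.full((N, Din), 0.01)             # diagonal input variances
q_mu = np.zeros((M, Dout))
q_sqrt = np.tile(np.eye(M)[None], (Dout, 1, 1))

fmean, fvar = uncertain_conditional_diag(
    Xnew_mu,
    Xnew_var,
    InducingPoints(np.random.randn(M, Din)),
    gpflow.kernels.SquaredExponential(lengthscales=np.ones(Din)),
    q_mu,
    q_sqrt,
    white=True,
)
# fmean: [N, Dout]; fvar: [N, Dout] since full_output_cov defaults to False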
Example #9
def conditional_vff(Xnew,
                    inducing_variable,
                    kernel,
                    f,
                    *,
                    full_cov=False,
                    full_output_cov=False,
                    q_sqrt=None,
                    white=False):
    """
     - Xnew are the points of the data or minibatch, size N x D (tf.array, 2d)
     - inducing_variable is an instance of InducingVariables that provides `Kuu` and `Kuf` methods
       for Fourier features; it contains the limits of the bounding box and the frequencies
     - f is the value (or mean value) of the features (i.e. the weights)
     - q_sqrt (default None) is the Cholesky factor of the uncertainty about f
       (to be propagated through the conditional as per the GPflow inducing-point implementation)
     - white (default False) specifies whether whitening has been applied

    Given the GP represented by the Fourier features specified in `inducing_variable`, produce the mean and
    (co-)variance of the GP at the points Xnew.

       Xnew :: N x D
       Kuu :: M x M
       Kuf :: M x N
       f :: M x K, K = 1
       q_sqrt :: K x M x M, with K = 1
    """
    if full_output_cov:
        raise NotImplementedError

    # num_data = tf.shape(Xnew)[0]  # M
    num_func = tf.shape(f)[1]  # K

    Kuu = cov.Kuu(inducing_variable, kernel)  # this is now a LinearOperator
    Kuf = cov.Kuf(inducing_variable, kernel, Xnew)  # still a Tensor

    KuuInv_Kuf = Kuu.solve(Kuf)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = kernel(Xnew) - tf.matmul(Kuf, KuuInv_Kuf, transpose_a=True)
        shape = (num_func, 1, 1)
    else:
        KufT_KuuInv_Kuf_diag = tf.reduce_sum(Kuf * KuuInv_Kuf, axis=-2)
        fvar = kernel(Xnew, full_cov=False) - KufT_KuuInv_Kuf_diag
        shape = (num_func, 1)
    fvar = tf.expand_dims(fvar, 0) * tf.ones(
        shape, dtype=gpflow.default_float())  # K x N x N or K x N

    # another backsubstitution in the unwhitened case
    if white:
        raise NotImplementedError

    A = KuuInv_Kuf

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            # LTA = A * tf.expand_dims(q_sqrt, 2)  # K x M x N
            # won't work  # make ticket for this?
            raise NotImplementedError
        elif q_sqrt.get_shape().ndims == 3:
            # L = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # K x M x M

            # K x M x N
            # A_tiled = tf.expand_dims(A.get(), 0) * tf.ones((num_func, 1, 1), dtype=float_type)

            # LTA = tf.matmul(L, A_tiled, transpose_a=True)  # K x M x N
            # TODO the following won't work for K > 1
            assert q_sqrt.shape[0] == 1
            # LTA = (A.T @ DenseMatrix(q_sqrt[:,:,0])).T.get()[None, :, :]
            ATL = tf.matmul(A, q_sqrt, transpose_a=True)
        else:
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            # fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x N x N
            fvar = fvar + tf.matmul(ATL, ATL, transpose_b=True)  # K x N x N
        else:
            # fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # K x N
            fvar = fvar + tf.reduce_sum(tf.square(ATL), 2)  # K x N
    fvar = tf.transpose(fvar)  # N x K or N x N x K

    return fmean, fvar
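conditional_vff, like _precompute and approx_conditional_ldf elsewhere in this listing, relies on Kuu being returned as a tf.linalg.LinearOperator, so that Kuu.solve and Kuu.cholesky exploit its structure instead of densifying it. A tiny illustration of that API on a diagonal operator, with arbitrary values:

import tensorflow as tf

diag = tf.constant([1.0, 2.0, 4.0], dtype=tf.float64)
Kuu = tf.linalg.LinearOperatorDiag(diag, is_self_adjoint=True, is_positive_definite=True)
Kuf = tf.constant([[1.0, 0.5], [0.2, 0.1], [0.3, 0.7]], dtype=tf.float64)  # [M, N]

KuuInv_Kuf = Kuu.solve(Kuf)              # Kuu^{-1} Kuf without forming a dense inverse
Luu_inv_Kuf = Kuu.cholesky().solve(Kuf)  # Luu^{-1} Kuf, as used in the whitened case
Kuu_dense = Kuu.to_dense()               # fall back to a dense [M, M] Tensor if needed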
Example #10
def prior_kl_vff(inducing_variable, kernel, q_mu, q_sqrt, whiten=False):
    if whiten:
        raise NotImplementedError
    K = cov.Kuu(inducing_variable, kernel)
    return gauss_kl_vff(q_mu, q_sqrt, K)
Example #11
    def _conditional_fused(self, Xnew, full_cov, full_output_cov):
        """
        Xnew is a tensor with the points of the data or minibatch, shape N x D
        """
        if full_output_cov:
            raise NotImplementedError

        f = self._q_dist.q_mu
        q_sqrt = self._q_dist.q_sqrt

        # num_data = tf.shape(Xnew)[0]  # M
        num_func = tf.shape(f)[1]  # K

        Kuu = cov.Kuu(self.X_data, self.kernel)  # this is now a LinearOperator
        Kuf = cov.Kuf(self.X_data, self.kernel, Xnew)  # still a Tensor

        KuuInv_Kuf = Kuu.solve(Kuf)

        # compute the covariance due to the conditioning
        if full_cov:
            fvar = self.kernel(Xnew) - tf.matmul(
                Kuf, KuuInv_Kuf, transpose_a=True)
            shape = (num_func, 1, 1)
        else:
            KufT_KuuInv_Kuf_diag = tf.reduce_sum(Kuf * KuuInv_Kuf, axis=-2)
            fvar = self.kernel(Xnew, full_cov=False) - KufT_KuuInv_Kuf_diag
            shape = (num_func, 1)
        fvar = tf.expand_dims(fvar, 0) * tf.ones(
            shape, dtype=gpflow.default_float())  # K x N x N or K x N

        if self.whiten:
            raise NotImplementedError

        A = KuuInv_Kuf

        # construct the conditional mean
        fmean = tf.matmul(A, f, transpose_a=True)

        if q_sqrt is not None:
            if q_sqrt.get_shape().ndims == 2:
                # LTA = A * tf.expand_dims(q_sqrt, 2)  # K x M x N
                # won't work  # make ticket for this?
                raise NotImplementedError
            elif q_sqrt.get_shape().ndims == 3:
                # L = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # K x M x M

                # K x M x N
                # A_tiled = tf.expand_dims(A.get(), 0) * tf.ones((num_func, 1, 1), dtype=float_type)

                # LTA = tf.matmul(L, A_tiled, transpose_a=True)  # K x M x N
                # TODO the following won't work for K > 1
                assert q_sqrt.shape[0] == 1
                # LTA = (A.T @ DenseMatrix(q_sqrt[:,:,0])).T.get()[None, :, :]
                ATL = tf.matmul(A, q_sqrt, transpose_a=True)
            else:
                raise ValueError("Bad dimension for q_sqrt: %s" %
                                 str(q_sqrt.get_shape().ndims))
            if full_cov:
                # fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x N x N
                fvar = fvar + tf.matmul(ATL, ATL,
                                        transpose_b=True)  # K x N x N
            else:
                # fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # K x N
                fvar = fvar + tf.reduce_sum(tf.square(ATL), 2)  # K x N
        fvar = tf.transpose(fvar)  # N x K or N x N x K

        return fmean, fvar
Example #12
    def elbo(self) -> tf.Tensor:
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood.
        """  

        # define a set of vectorized functions for use in `tf.vectorized_map`

        # take the outer product of a pair of rows
        @tf.function
        def row_outer_product(args):
            a, b = args
            a = tf.expand_dims(a, -1)
            b = tf.expand_dims(b, -1)
            return a @ tf.transpose(b)

        # repeat matrix A N times on a newly created first axis 
        # so the new shape is [N, A.shape] 
        @tf.function
        def repeat_N(A):
            return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

        @tf.function
        def triang_solve(args):
            L, rhs = args
            return tf.linalg.triangular_solve(L, rhs)

        @tf.function
        def triang_solve_transpose(args):
            L, rhs = args
            return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

        @tf.function
        def matmul_vectorized(args):
            A, B = args
            return tf.matmul(A, B)

        # [N, D, M, M] --> [N]
        # each term is sum_{d=1}^D Tr[M, M]
        # arg: [D, M, M], needs to be squared
        @tf.function
        def sum_d_trace(arg):
            trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), arg)
            return tf.reduce_sum(trace_D)

        # trace of a matrix
        @tf.function
        def trace_tf(A):
            return tf.reduce_sum(tf.linalg.diag_part(A))

        Y = self.data

        qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
        psi0s = expectation(qXs, self.kernel_s)
        psi1s = expectation(qXs, (self.kernel_s, self.Zs))
        psi2s = expectation(qXs, (self.kernel_s, self.Zs), (self.kernel_s, self.Zs))
        cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
        Ls = tf.linalg.cholesky(cov_uu_s)
        Ls = repeat_N(Ls) # [N x M x M]

        # loop over k, for each k use kernel_K[k] and qXp, compute psi0k, psi1k, psi2k, then store the psi statistics for all k together
        # for each k: psi0[:, k] = psi0k, psi1[:, :, k] = psi1k, psi2[:, :, :, k] = psi2k
        # psi0 is [N, K] so psi0[n, k] gives a real value
        # psi1 is [N, M, K], so psi1[n, :, k] gives us a M-vector
        # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives us a [M x M] matrix
        qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)
        psi0k = []
        psi1k = []
        psi2k = []
        psi2ks = []
        psi2sk = []
        for k, kernel_k in enumerate(self.kernel_K):
            psi0 = expectation(qXp, kernel_k)
            psi1 = expectation(qXp, (kernel_k, self.Zp))
            psi2 = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
            psi0k.append(psi0)            
            psi1k.append(psi1)
            psi2k.append(psi2)
            # add the cross-covariance terms, require computation separately for each n
            psi2sk.append(tf.vectorized_map(row_outer_product, (psi1s, psi1)))
            #psi2ks.append(tf.vectorized_map(row_outer_product, (psi1, psi1s)))
        psi0k = tf.stack(psi0k, axis=-1)
        psi1k = tf.stack(psi1k, axis=-1)
        psi2k = tf.stack(psi2k, axis=-1)
        psi2sk = tf.stack(psi2sk, axis=-1)
        #psi2ks = tf.stack(psi2ks, axis=-1)  

        # make K cov_uu_k using Zp and kernel_k
        # K cholesky, repeat N times for later use
        # L is [N x M x M x K]
        # these are the Kuu matrices
        Lk = []
        for k, kernel_k in enumerate(self.kernel_K):
            cov_uu_k = covariances.Kuu(self.Zp, kernel_k, jitter=default_jitter())
            Lk.append(tf.linalg.cholesky(cov_uu_k))
        Lk = tf.stack(Lk, axis=-1)
        Lk = repeat_N(Lk)
        
        sigma2 = self.likelihood.variance
        jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())

        tmp = tf.vectorized_map(triang_solve, (Ls, psi2s))
        As = tf.vectorized_map(triang_solve_transpose, (Ls, tmp)) # \inv{Kuu^s} * Psi2s: [N, M, M]

        LBs = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2s) # [N, M, M]  
        tmp1 = tf.vectorized_map(triang_solve, (Ls, LBs)) # [N, M, M]
        Cs = tf.vectorized_map(triang_solve_transpose, (Ls, tmp1)) # sqrt(\inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, M, M]
        Ds = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), Cs)) # sqrt(Ms^T * \inv{Kuu^s} * Psi2s * \inv{Kuu^s} * Ms): [N, D, M]

        Fs = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_sqrt_s, perm=[0, 2, 1])), Cs)) # sqrt(Ss * \inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, D, M, M]

        tmp2 = tf.vectorized_map(triang_solve, (Ls, repeat_N(self.q_mu_s)))
        Es = tf.vectorized_map(triang_solve_transpose, (Ls, tmp2)) # \inv{Kuu^s} * Ms: [N, M, D]
        tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1s)) # Y^T * Psi1: [N, D, M]
        Gs = tf.vectorized_map(matmul_vectorized, (tmp3, Es)) # Y^T * Psi1s * \inv{Kuu^s} * Ms: [N, D, D]

        Fq = []
        Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
        for k in range(self.K):
            tmp = tf.vectorized_map(triang_solve, (Lk[..., k], psi2k[..., k])) # [N, M, M]
            Ak = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp)) # \inv{Kuu^k} * Psi2k: [N, M, M]

            LBk = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2k[..., k]) # [N, M, M]  
            tmp1k = tf.vectorized_map(triang_solve, (Lk[..., k], LBk)) # [N, M, M]
            Ck = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp1k)) # sqrt(\inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, M, M]
            Dk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), Ck)) # sqrt(Mk^T * \inv{Kuu^k} * Psi2k * \inv{Kuu^k} * Mk): [N, D, M]

            # q_sqrt is already the cholesky
            Fk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), Ck)) # sqrt(Sk * \inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, D, M, M]

            tmp2 = tf.vectorized_map(triang_solve, (Lk[..., k], repeat_N(self.q_mu[k])))
            Ek = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp2)) # \inv{Kuu^k} * Mk: [N, M, D]
            tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1k[..., k])) # Y^T * Psi1k: [N, D, M]
            Gk = tf.vectorized_map(matmul_vectorized, (tmp3, Ek)) # Y^T * Psi1k * \inv{Kuu^k} * Mk: [N, D, D]

            # compute the cross terms 
            tmp1sk = tf.vectorized_map(triang_solve, (Ls, psi2sk[..., k]))
            tmp2sk = tf.vectorized_map(triang_solve_transpose, (Ls, tmp1sk)) # \inv{Kuu^s} * Psi2sk: [N, M, M]
            tmp3sk = tf.vectorized_map(matmul_vectorized, (tmp2sk, Ek)) # \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, M, D]
            Dsk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), tmp3sk)) # Ms^T * \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, D, D]

            # compute the lower bound
            # each term added here is length-N vector, each entry representing \sum_{d=1}^D Fdnk for a particular n, k
            Fnk = -0.5 * Yn2 / sigma2
            Fnk += tf.vectorized_map(trace_tf, Gs + Gk) / sigma2
            Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), Ds) / sigma2
            Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), Dk) / sigma2
            # the sum of trace of the 2 cross terms is 2 times the trace of one since they are transpose of one another
            Fnk += - tf.vectorized_map(trace_tf, Dsk) / sigma2 
            Fnk += 0.5 * self.D * tf.vectorized_map(trace_tf, As + Ak)  / sigma2 
            Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fs) / sigma2
            Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fk) / sigma2

            Fq.append(Fnk)

        Fq = tf.stack(Fq, axis=-1) # [N, K]
        # psi0 is already [N, K]
        Fq += -0.5 * self.D * (tf.repeat(tf.expand_dims(psi0s, -1), self.K, axis=1) + psi0k) / sigma2
        Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)

        # weight each entry by the mixture responsibility, then sum over N, K
        bound = tf.reduce_sum(Fq * self.pi)

        # compute KL 
        KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean, self.Xp_prior_var)
        KL_c = self.kl_categorical(self.pi, self.pi_prior)
        KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean, self.Xs_prior_var)
        
        prior_Kuu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
        KL_us = kullback_leiblers.gauss_kl(q_mu=self.q_mu_s, q_sqrt=self.q_sqrt_s, K=prior_Kuu_s)
        KL_uk = 0
        for k in range(self.K):
            prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k], jitter=default_jitter())
            KL_uk += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k], q_sqrt=self.q_sqrt[k], K=prior_Kuu_k)
        bound += - KL_s - KL_p - KL_us - KL_uk - KL_c

        return bound
Example #13
    def elbo(self) -> tf.Tensor:
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood.
        """  

        # define a set of vectorized functions for use in `tf.vectorized_map`

        # take the outer product of a pair of rows
        @tf.function
        def row_outer_product(args):
            a, b = args
            a = tf.expand_dims(a, -1)
            b = tf.expand_dims(b, -1)
            return a @ tf.transpose(b)

        # repeat matrix A N times on a newly created first axis 
        # so the new shape is [N, A.shape] 
        @tf.function
        def repeat_N(A):
            return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

        @tf.function
        def triang_solve(args):
            L, rhs = args
            return tf.linalg.triangular_solve(L, rhs)

        @tf.function
        def triang_solve_transpose(args):
            L, rhs = args
            return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

        @tf.function
        def matmul_vectorized(args):
            A, B = args
            return tf.matmul(A, B)

        # [N, D, M, M] --> [N]
        # each term is sum_{d=1}^D Tr[M, M]
        # arg: [D, M, M], needs to be squared
        @tf.function
        def sum_d_trace(arg):
            trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), arg)
            return tf.reduce_sum(trace_D)

        # trace of a matrix
        @tf.function
        def trace_tf(A):
            return tf.reduce_sum(tf.linalg.diag_part(A))


        Y = self.data

        # specify qXp, the variational distribution q(X): each x_n is independent w/ N(x_n | \mu_n, S_n)
        # \mu_n \in R^q given by each row of `X_data_mean`
        # S_n \in R^qxq diagonal, so equivalently given by each row of `X_data_var`
        qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)

        # if split space, specify qXs
        # compute psi statistics for the shared space, keep the original shape of psi statistics, use qXs and kernel_s
        # psi0s is N-vector
        # psi1s is [N, M]
        # psi2s is [N, M, M]
        # also compute the covariance matrix Kuu for the shared space
        if self.split_space:
            qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
            psi0s = expectation(qXs, self.kernel_s)
            psi1s = expectation(qXs, (self.kernel_s, self.Zs))
            psi2s = expectation(qXs, (self.kernel_s, self.Zs), (self.kernel_s, self.Zs))
            cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())


        # loop over k, for each k use kernel_K[k] and qXp, compute psi0k, psi1k, psi2k, then store the psi statistics for all k together
        # for each k: if no shared space, then psi0[:, k] = psi0k, psi1[:, :, k] = psi1k, psi2[:, :, :, k] = psi2k
        # if have shared space, then psi0[:, k] = psi0s + psi0k, psi1[:, :, k] = psi1s + psi1k
        # psi2[:, :, :, k] = psi2s + psi2k (the cross terms are added later)
        # then, for each n, psi2[n, :, :, k] = psi1s[n, :]^T dot psi1k[n, :] + psi1k[n, :]^T dot psi1s[n, :] (both are [M, M])
        # psi0 is [N, K] so psi0[n, k] gives a real value
        # psi1 is [N, M, K], so psi1[n, :, k] gives us a M-vector
        # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives us a [M x M] matrix
        psi0 = []
        psi1 = []
        psi2 = []
        for k, kernel_k in enumerate(self.kernel_K):
            psi0k = expectation(qXp, kernel_k)
            psi1k = expectation(qXp, (kernel_k, self.Zp))
            psi2k = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
            if self.split_space:
                psi0.append(psi0s + psi0k)            
                psi1.append(psi1s + psi1k)
                # add the cross-covariance terms, require computation separately for each n
                sxk = tf.vectorized_map(row_outer_product, (psi1s, psi1k))
                kxs = tf.vectorized_map(row_outer_product, (psi1k, psi1s))
                psi2.append(psi2s + psi2k + sxk + kxs)
            else:
                psi0.append(psi0k)
                psi1.append(psi1k)
                psi2.append(psi2k)
        psi0 = tf.stack(psi0, axis=-1)
        psi1 = tf.stack(psi1, axis=-1)
        psi2 = tf.stack(psi2, axis=-1)

        # make K cov_uu_k using Zp and kernel_k
        # K cholesky, repeat N times for later use
        # L is [N x M x M x K]
        # these are the Kuu matrices
        L = []
        for k, kernel_k in enumerate(self.kernel_K):
            cov_uu_k = covariances.Kuu(self.Zp, kernel_k, jitter=default_jitter())
            if self.split_space:
                L.append(tf.linalg.cholesky(cov_uu_s + cov_uu_k))
            else:
                L.append(tf.linalg.cholesky(cov_uu_k))
        L = tf.stack(L, axis=-1)
        L = repeat_N(L)
        sigma2 = self.likelihood.variance


        # self.pred_Y = []

        # use `tf.vectorized_map` to avoid writing a loop over N, but it requires every matrix to have N on axis 0
        # so we need to repeat certain matrices that are the same for all N (e.g. L)
        # note we can use `tf.vectorized_map` because the computations are decomposable for each n,
        # i.e. they can be computed in any order over n
        Fq = []
        Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
        for k in range(self.K):
            # compute intermediate matrices for easier computation involving \inv{Kuu}
            # A is the same as AAT in gplvm, transposing L is the correct thing to do
            # but the two end up being the same since we only care about the trace
            tmp = tf.vectorized_map(triang_solve, (L[..., k], psi2[..., k])) # [N, M, M]
            A = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp)) # \inv{Kuu} * Psi2: [N, M, M]

            #pos_def = tf.vectorized_map(lambda x: is_pos_def(x), psi2[..., k])
            #print(np.all(pos_def))
            # psi2 is not produced via `covariances.Kuu`, but it should still be PD;
            # we add jitter before taking the Cholesky
            #jitter_mtx = default_jitter() * tf.eye(self.M, dtype=default_float())
            jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())
            LB = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2[..., k]) # [N, M, M]  
            tmp1 = tf.vectorized_map(triang_solve, (L[..., k], LB)) # [N, M, M]
            C = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp1)) # sqrt(\inv{Kuu} * Psi2 * \inv{Kuu}): [N, M, M]

            D = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), C)) # sqrt(M^T * \inv{Kuu} * Psi2 * \inv{Kuu} * M): [N, D, M]

            tmp2 = tf.vectorized_map(triang_solve, (L[..., k], repeat_N(self.q_mu[k])))
            E = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp2)) # \inv{Kuu} * M: [N, M, D]

            # q_sqrt is already the cholesky
            F = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), C)) # sqrt(S * \inv{Kuu} * Psi2 * \inv{Kuu}): [N, D, M, M]

            tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1[..., k])) # Y^T * Psi1: [N, D, M]
            G = tf.vectorized_map(matmul_vectorized, (tmp3, E)) # Y^T * Psi1 * \inv{Kuu} * M: [N, D, D]

            # for debugging 
            # self.pred_Y.append(tf.reshape(tf.vectorized_map(matmul_vectorized, (tf.expand_dims(psi1[..., k], 1), E)), (self.N, self.D))) # Psi1 * \inv{Kuu} * M: [N, D]

            # compute the lower bound
            # each term added here is length-N vector, each entry representing \sum_{d=1}^D Fdnk for a particular n, k
            Fnk = -0.5 * Yn2 / sigma2
            Fnk += tf.vectorized_map(lambda x: trace_tf(x), G) / sigma2
            Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), D) / sigma2
            Fnk += 0.5 * self.D * tf.vectorized_map(lambda x: trace_tf(x), A)  / sigma2 
            Fnk += -0.5 * tf.vectorized_map(lambda x: sum_d_trace(x), F) / sigma2

            Fq.append(Fnk)

        Fq = tf.stack(Fq, axis=-1) # [N, K]
        # psi0 is already [N, K]
        Fq += -0.5 * self.D * psi0 / sigma2
        Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)

        # for debugging 
        #self.Fq = Fq
        # self.pred_Y = tf.stack(self.pred_Y, axis=-1) # [N, D, K]

        # weight each entry by the mixture responsibility, then sum over N, K
        bound = tf.reduce_sum(Fq * self.pi)

        # compute KL 
        KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean, self.Xp_prior_var)
        KL_c = self.kl_categorical(self.pi, self.pi_prior)
        KL_u = 0
        prior_Kuu = np.zeros((self.M, self.M))
        if self.split_space:
            KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean, self.Xs_prior_var)
            bound += - KL_s
            prior_Kuu += covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
        for k in range(self.K):
            prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k], jitter=default_jitter())
            KL_u += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k], q_sqrt=self.q_sqrt[k], K=prior_Kuu+prior_Kuu_k)
        bound += - KL_p - KL_u - KL_c

        return bound
Example #14
def approx_conditional_ldf(
    Xnew,
    inducing_variable,
    kernel,
    f,
    *,
    full_cov=False,
    full_output_cov=False,
    q_sqrt=None,
    white=True,
):
    """
     - Xnew are the points of the data or minibatch, size N x D (tf.array, 2d)
     - inducing_variable is an instance of inducing_variables.InducingVariable that provides
       `Kuu` and `Kuf` methods for Laplacian Dirichlet features, this contains the limits of
       the bounding box and the frequencies
     - inducing_variable.remainder is another instance of inducing_variables.InducingVariables that
       specifies the high-frequency components not selected in inducing_variable.
     - f is the value (or mean value) of the features (i.e. the weights)
     - q_sqrt (default None) is the Cholesky factor of the uncertainty about f
       (to be propagated through the conditional as per the GPflow inducing-point implementation)
     - white (default True) specifies whether whitening has been applied. LDF works much better with
       vanilla gradients when whitening has been applied, so it is the default option.

    Given the GP represented by the inducing points specified in `inducing_variable`, produce the mean
    and (co-)variance of the GP at the points Xnew.

       Xnew :: N x D
       Kuu :: M x M
       Kuf :: M x N
       f :: M x K, K = 1
       q_sqrt :: K x M x M, with K = 1
    """
    if full_output_cov:
        raise NotImplementedError

    # num_data = tf.shape(Xnew)[0]  # M
    num_func = tf.shape(f)[1]  # K

    Λ = cov.Kuu(inducing_variable, kernel)  # this is now a LinearOperator
    Φ = cov.Kuf(inducing_variable, kernel, Xnew)  # still a Tensor
    Λr = cov.Kuu(inducing_variable.remainder, kernel)
    Φr = cov.Kuf(inducing_variable.remainder, kernel, Xnew)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = tf.matmul(Φr, Λr.solve(Φr), transpose_a=True)
        shape = (num_func, 1, 1)
    else:
        fvar = tf.reduce_sum(Φr * Λr.solve(Φr), -2)
        shape = (num_func, 1)
    fvar = tf.expand_dims(fvar, 0) * tf.ones(
        shape, dtype=gpflow.default_float())  # K x N x N or K x N

    # another backsubstitution in the unwhitened case
    if white:
        A = Λ.cholesky().solve(Φ)
    else:
        A = Λ.solve(Φ)

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)

    if q_sqrt is not None:
        if q_sqrt.shape.ndims == 2:
            # case for q_diag = True
            LTA = Diag(q_sqrt) @ A  # K x M x N
        elif q_sqrt.shape.ndims == 3:
            LTA = tf.matmul(q_sqrt, A, transpose_a=True)
        else:
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # K x N
    fvar = tf.transpose(fvar)  # N x K or N x N x K

    return fmean, fvar
Example #15
def prior_kl_ldf(inducing_variable, kernel, q_mu, q_sqrt, whiten=False):
    if whiten:
        K = None
    else:
        K = cov.Kuu(inducing_variable, kernel)
    return gauss_kl_ldf(q_mu, q_sqrt, K)
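prior_kl_ldf ultimately evaluates the standard Gaussian KL between q(u) = N(q_mu, q_sqrt q_sqrt^T) and the prior N(0, K) (or N(0, I) when whitened). For reference, a dense NumPy sketch of that quantity with toy shapes and a single output column; gauss_kl_ldf itself is assumed to exploit the diagonal structure of K rather than densifying it:

import numpy as np

M = 4
rng = np.random.default_rng(1)
q_mu = rng.normal(size=(M, 1))
L_q = np.tril(rng.normal(size=(M, M))) + M * np.eye(M)   # q_sqrt for one output column
S = L_q @ L_q.T
K = np.diag(rng.uniform(1.0, 2.0, size=M))                # stand-in for Kuu (diagonal for LDF)

Kinv = np.linalg.inv(K)
kl = 0.5 * (
    np.trace(Kinv @ S)                   # trace term
    + float(q_mu.T @ Kinv @ q_mu)        # Mahalanobis term
    - M                                  # dimensionality constant
    + np.linalg.slogdet(K)[1]            # log|K|
    - np.linalg.slogdet(S)[1]            # log|S|
)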
Example #16
    def elbo(self) -> tf.Tensor:
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood.
        """
        Y_data = self.data

        X_data_mean, X_data_var = self.encoder(Y_data)

        pX = DiagonalGaussian(X_data_mean, X_data_var)

        num_inducing = self.inducing_variable.num_inducing
        psi0 = tf.reduce_sum(expectation(pX, self.kernel))
        psi1 = expectation(pX, (self.kernel, self.inducing_variable))
        psi2 = tf.reduce_sum(
            expectation(pX, (self.kernel, self.inducing_variable),
                        (self.kernel, self.inducing_variable)),
            axis=0)

        cov_uu = covariances.Kuu(self.inducing_variable,
                                 self.kernel,
                                 jitter=default_jitter())
        L = tf.linalg.cholesky(cov_uu)
        sigma2 = self.likelihood.variance
        sigma = tf.sqrt(sigma2)

        # Compute intermediate matrices
        A = tf.linalg.triangular_solve(L, tf.transpose(psi1),
                                       lower=True) / sigma
        tmp = tf.linalg.triangular_solve(L, psi2, lower=True)
        AAT = tf.linalg.triangular_solve(L, tf.transpose(tmp),
                                         lower=True) / sigma2
        B = AAT + tf.eye(num_inducing, dtype=default_float())
        LB = tf.linalg.cholesky(B)
        log_det_B = 2.0 * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))
        c = tf.linalg.triangular_solve(
            LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma

        # KL[q(x) || p(x)]
        dX_data_var = (X_data_var if X_data_var.shape.ndims == 2 else
                       tf.linalg.diag_part(X_data_var))
        NQ = to_default_float(tf.size(X_data_mean))
        D = to_default_float(tf.shape(Y_data)[1])
        KL = -0.5 * tf.reduce_sum(tf.math.log(dX_data_var))
        KL += 0.5 * tf.reduce_sum(tf.math.log(self.X_prior_var))
        KL -= 0.5 * NQ
        KL += 0.5 * tf.reduce_sum(
            (tf.square(X_data_mean - self.X_prior_mean) + dX_data_var) /
            self.X_prior_var)

        self.loss_placeholder["KL_x"].append(KL.numpy())

        # compute log marginal bound
        ND = to_default_float(tf.size(Y_data))
        bound = -0.5 * ND * tf.math.log(2 * np.pi * sigma2)
        bound += -0.5 * D * log_det_B
        bound += -0.5 * tf.reduce_sum(tf.square(Y_data)) / sigma2
        bound += 0.5 * tf.reduce_sum(tf.square(c))
        bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 -
                             tf.reduce_sum(tf.linalg.diag_part(AAT)))
        bound -= KL

        self.loss_placeholder["ELBO"].append(bound.numpy())

        return bound
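The KL[q(x) || p(x)] assembled above is the elementwise KL between diagonal Gaussians, summed over all N x Q latent coordinates. A compact NumPy restatement with toy shapes, for reference:

import numpy as np

N, Q = 6, 2
q_mean = np.random.randn(N, Q)
q_var = np.full((N, Q), 0.5)
p_mean = np.zeros((N, Q))        # X_prior_mean
p_var = np.ones((N, Q))          # X_prior_var

KL = 0.5 * np.sum(
    np.log(p_var) - np.log(q_var) - 1.0
    + (q_var + (q_mean - p_mean) ** 2) / p_var
)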