def update_W_external(self, X, Y):
        Kdiag = self.kern.Kdiag(X, full_output_cov=False)
        Kux = features.Kuf(self.feature, self.kern, X)
        # Copy this into blocks for each dimension
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
        L = tf.cholesky(Kuu)

        sigma2 = self.likelihood.variance

        A = tf.cholesky_solve(L, Kux)  # K x M x N
        mean = tf.matmul(A,
                         tf.transpose(self.q_mu)[:, :, None],
                         transpose_a=True)
        err = (Y - mean)

        reg1 = tf.reduce_sum(
            tf.pow(tf.matmul(A, self.q_sqrt, transpose_a=True), 2), 2)
        reg2 = tf.transpose(Kdiag) - tf.einsum('kmn,kmn->kn', A, Kux)

        logW = -0.5 * tf.log(2 * np.pi * sigma2) \
            - 0.5 * tf.reduce_sum(tf.pow(err, 2), 2) / sigma2 \
            - 0.5 * reg1 / sigma2 - 0.5 * reg2 / sigma2 + tf.log(self.W_prior)[:, None]

        logW = logW - tf.reduce_logsumexp(logW, axis=0, keepdims=True)
        return tf.transpose(logW)
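The last two lines above turn unnormalised log-weights into log-responsibilities by subtracting the log-sum-exp over mixture components. A minimal NumPy sketch of that normalisation trick (the array here is illustrative only):

import numpy as np

logW = np.random.randn(3, 5)                              # K x N unnormalised log-weights
logW -= np.logaddexp.reduce(logW, axis=0, keepdims=True)  # subtract log-sum-exp over K
assert np.allclose(np.exp(logW).sum(axis=0), 1.0)         # responsibilities sum to one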
Example 2
    def update_W_external(self, X, Y, W1_idx, W2_idx):
        Kdiag = self.kern.Kdiag(X, full_output_cov=False)
        Kux = features.Kuf(self.feature, self.kern, X)
        # Copy this into blocks for each dimension
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
        L = tf.cholesky(Kuu)

        sigma2 = self.likelihood.variance

        A = tf.cholesky_solve(L, Kux)  # K x M x N
        mean = tf.matmul(A, tf.transpose(self.q_mu)[:, :, None], transpose_a=True)
        err = (Y - mean)

        reg1 = tf.reduce_sum(
            tf.pow(tf.matmul(A, self.q_sqrt, transpose_a=True), 2), 2)
        reg2 = tf.transpose(Kdiag) - \
            tf.einsum('kmn,kmn->kn', A, Kux)

        logW = -0.5 * tf.log(2 * np.pi * sigma2) \
            - 0.5 * tf.reduce_sum(tf.pow(err, 2), 2) / sigma2 \
            - 0.5 * reg1 / sigma2 - 0.5 * reg2 / sigma2
        logW = tf.reshape(logW, [self.K1, self.K2, -1])

        # compute new W1
        W2 = tf.gather(normalize(self.W2), W2_idx)
        logW1 = tf.reduce_sum(
            logW * tf.transpose(W2)[None, :, :],
            axis=1)
        #logW1 = logW1 - tf.reduce_logsumexp(logW1, axis=0, keepdims=True)

        # group by index
        logW1_parts = tf.dynamic_partition(
            tf.transpose(logW1), W1_idx, num_partitions=self.W1.shape[0])
        logW1 = tf.stack([
            tf.reduce_sum(part, axis=0) for part in logW1_parts])
        logW1 = logW1 + tf.log(self.W1_prior)[None]
        logW1 = logW1 - tf.reduce_logsumexp(logW1, axis=1, keepdims=True)

        # compute new W2
        W1 = tf.gather(normalize(logW1), W1_idx)
        logW2 = tf.reduce_sum(
            logW * tf.transpose(W1)[:, None, :],
            axis=0)
        #logW2 = logW2 - tf.reduce_logsumexp(logW2, axis=0, keepdims=True)

        # group by index
        logW2_parts = tf.dynamic_partition(
            tf.transpose(logW2), W2_idx, num_partitions=self.W2.shape[0])
        logW2 = tf.stack([
            tf.reduce_sum(part, axis=0) for part in logW2_parts])
        logW2 = logW2 + tf.log(self.W2_prior)[None]
        logW2 = logW2 - tf.reduce_logsumexp(logW2, axis=1, keepdims=True)
        return logW1, logW2
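normalize() is used throughout these snippets but never shown. One plausible implementation (an assumption, not taken from the source) maps unconstrained assignment parameters to per-row probabilities with a softmax:

import tensorflow as tf

def normalize(W):
    # each row of the result is a probability vector over mixture components
    return tf.nn.softmax(W, axis=-1)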
Example 3
    def test_equivalence_inducing_points(self):
        # Multiscale must be equivalent to inducing points when variance is zero
        with self.test_context() as session:
            rbf, feature_0lengthscale, feature_inducingpoint = self.prepare()
            Xnew = np.random.randn(13, 3)

            ms, point = session.run([features.Kuf(feature_0lengthscale, rbf, Xnew),
                                     features.Kuf(feature_inducingpoint, rbf, Xnew)])
            pd = np.max(np.abs(ms - point) / point * 100)
            self.assertTrue(pd < 0.1)

            ms, point = session.run([features.Kuu(feature_0lengthscale, rbf),
                                     features.Kuu(feature_inducingpoint, rbf)])
            pd = np.max(np.abs(ms - point) / point * 100)
            self.assertTrue(pd < 0.1)
Example 4
    def compute_upper_bound(self):
        num_data = tf.cast(tf.shape(self.Y)[0], settings.float_type)

        Kdiag = self.kern.Kdiag(self.X)
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
        Kuf = features.Kuf(self.feature, self.kern, self.X)

        L = tf.cholesky(Kuu)
        LB = tf.cholesky(Kuu + self.likelihood.variance**-1.0 *
                         tf.matmul(Kuf, Kuf, transpose_b=True))

        LinvKuf = tf.matrix_triangular_solve(L, Kuf, lower=True)
        # Using the Trace bound, from Titsias' presentation
        c = tf.reduce_sum(Kdiag) - tf.reduce_sum(LinvKuf**2.0)
        # Kff = self.kern.K(self.X)
        # Qff = tf.matmul(Kuf, LinvKuf, transpose_a=True)

        # Alternative bound on max eigenval:
        # c = tf.reduce_max(tf.reduce_sum(tf.abs(Kff - Qff), 0))
        corrected_noise = self.likelihood.variance + c

        const = -0.5 * num_data * tf.log(2 * np.pi * self.likelihood.variance)
        logdet = tf.reduce_sum(tf.log(tf.diag_part(L))) - tf.reduce_sum(
            tf.log(tf.diag_part(LB)))

        LC = tf.cholesky(Kuu + corrected_noise**-1.0 *
                         tf.matmul(Kuf, Kuf, transpose_b=True))
        v = tf.matrix_triangular_solve(LC,
                                       corrected_noise**-1.0 *
                                       tf.matmul(Kuf, self.Y),
                                       lower=True)
        quad = -0.5 * corrected_noise**-1.0 * tf.reduce_sum(
            self.Y**2.0) + 0.5 * tf.reduce_sum(v**2.0)

        return const + logdet + quad
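Up to the jitter on Kuu, the bound assembled above equals −(N/2)·log 2π − ½·log|Qff + σ²I| − ½·Yᵀ(Qff + (σ² + c)I)⁻¹Y, with Qff = Kfu·Kuu⁻¹·Kuf and c = tr(Kff − Qff), and it upper-bounds the exact log marginal likelihood. A small NumPy sketch of that relationship with made-up kernel and data values:

import numpy as np

def rbf(A, B, variance=1.0, lengthscale=0.8):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return variance * np.exp(-0.5 * d2 / lengthscale ** 2)

rng = np.random.RandomState(0)
X, Z, Y = rng.randn(50, 1), rng.randn(7, 1), rng.randn(50, 1)
sigma2 = 0.1

Kff = rbf(X, X)
Kuu = rbf(Z, Z) + 1e-6 * np.eye(7)
Kuf = rbf(Z, X)
Qff = Kuf.T @ np.linalg.solve(Kuu, Kuf)

# exact log marginal likelihood of the zero-mean GP
K = Kff + sigma2 * np.eye(50)
exact = -0.5 * (Y.T @ np.linalg.solve(K, Y)).item() \
        - 0.5 * np.linalg.slogdet(K)[1] - 25 * np.log(2 * np.pi)

# trace-corrected upper bound, written directly in terms of Qff
c = np.trace(Kff - Qff)
upper = -25 * np.log(2 * np.pi) \
        - 0.5 * np.linalg.slogdet(Qff + sigma2 * np.eye(50))[1] \
        - 0.5 * (Y.T @ np.linalg.solve(Qff + (sigma2 + c) * np.eye(50), Y)).item()

assert upper >= exact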
Example 5
    def _build_predict(self, Xnew, full_cov=False):
        """
        Compute the mean and variance of the latent function at some new points
        Xnew. For a derivation of the terms in here, see the associated SGPR
        notebook.
        """
        num_inducing = len(self.feature)
        err = self.Y - self.mean_function(self.X)
        Kuf = features.Kuf(self.feature, self.kern, self.X)
        Kuu = features.Kuu(self.feature,
                           self.kern,
                           jitter=settings.numerics.jitter_level)
        Kus = features.Kuf(self.feature, self.kern, Xnew)
        sigma = tf.sqrt(self.likelihood.variance)
        L = tf.cholesky(Kuu)
        A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
        B = tf.matmul(A, A, transpose_b=True) + \
            tf.eye(num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        Aerr = tf.matmul(A, err)
        c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
        tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
        tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
        mean = tf.matmul(tmp2, c, transpose_a=True)
        if full_cov:
            var = self.kern.K(Xnew) + tf.matmul(tmp2, tmp2, transpose_a=True) \
                  - tf.matmul(tmp1, tmp1, transpose_a=True)
            var = tf.tile(var[None, ...], [self.num_latent, 1, 1])  # P x N x N
        else:
            var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
                  - tf.reduce_sum(tf.square(tmp1), 0)
            var = tf.tile(var[:, None], [1, self.num_latent])
        return mean + self.mean_function(Xnew), var
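For context, a minimal usage sketch for the SGPR predict path above, assuming the standard GPflow 1.x model API (data and hyperparameters here are made up):

import numpy as np
import gpflow

X = np.random.randn(100, 1)
Y = np.sin(X) + 0.1 * np.random.randn(100, 1)
Z = X[::10].copy()                                             # 10 inducing inputs
model = gpflow.models.SGPR(X, Y, kern=gpflow.kernels.RBF(1), Z=Z)
gpflow.train.ScipyOptimizer().minimize(model)
mean, var = model.predict_f(np.linspace(-3, 3, 50)[:, None])   # calls _build_predict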
Example 6
    def test_matrix_psd(self):
        # Conditional variance must be PSD.
        X = np.random.randn(13, 2)

        def init_feat(feature):
            if feature is gpflow.features.InducingPoints:
                return feature(np.random.randn(71, 2))
            elif feature is gpflow.features.Multiscale:
                return feature(np.random.randn(71, 2), np.random.rand(71, 2))

        featkerns = [(gpflow.features.InducingPoints, gpflow.kernels.RBF),
                     (gpflow.features.InducingPoints, gpflow.kernels.Matern12),
                     (gpflow.features.Multiscale, gpflow.kernels.RBF)]
        for feat_class, kern_class in featkerns:
            with self.test_context() as session:
                # rbf, feature, feature_0lengthscale, feature_inducingpoint = self.prepare()
                kern = kern_class(2, 1.84, lengthscales=[0.143, 1.53])
                feature = init_feat(feat_class)
                Kuf, Kuu = session.run([
                    features.Kuf(feature, kern, X),
                    features.Kuu(feature, kern, jitter=settings.jitter)
                ])
                Kff = kern.compute_K_symm(X)
            Qff = Kuf.T @ np.linalg.solve(Kuu, Kuf)
            self.assertTrue(np.all(np.linalg.eig(Kff - Qff)[0] > 0.0))
Example 7
    def compute_qu(self):
        """
        Computes the mean and variance of q(u), the variational distribution on
        inducing outputs. SVGP with this q(u) should predict identically to
        SGPR.
        :return: mu, A
        """
        Kux = features.Kuf(self.feature, self.kern, self.X)
        psi1 = self._psi1(tf.transpose(Kux))  # K x N x M
        psi2 = self._psi2(tf.transpose(Kux))  # K x M x M

        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
        Kuu = tf.tile(Kuu[None], [self.W.shape[1], 1, 1])

        Sig = Kuu + (self.likelihood.variance**-1) * psi2
        Sig_sqrt = tf.cholesky(Sig)

        Sig_sqrt_Kuu = tf.matrix_triangular_solve(Sig_sqrt, Kuu)

        A = tf.matmul(Sig_sqrt_Kuu, Sig_sqrt_Kuu, transpose_a=True)

        tmp = tf.matrix_triangular_solve(Sig_sqrt,
                                         tf.transpose(psi1, perm=[0, 2, 1]))
        P = tf.einsum('kmn,nd->kmd', tmp, self.Y)
        mu = tf.matmul(Sig_sqrt_Kuu, P,
                       transpose_a=True) * self.likelihood.variance**-1.0

        return mu, A
Example 8
    def update_W(self):
        Kxu = self.kern.K(self.X, self.feature.Z)
        psi1 = self._psi1(Kxu)  # K x L x N x M
        psi2 = self._psi2(Kxu)  # K x L x M x M

        # Copy this into blocks for each dimension
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
        L = tf.cholesky(Kuu)

        sigma2 = self.likelihood.variance
        sigma = tf.sqrt(sigma2)

        # Compute intermediate matrices
        A = tf.matrix_triangular_solve(L, tf.transpose(Kxu),
                                       lower=True) / sigma

        # Kxu = tf.tile(Kxu[None, None], [self.K, self.L, 1, 1])
        L = tf.tile(L[None, None], [self.K, self.L, 1, 1])  # K x L x M x M
        A = tf.tile(A[None, None], [self.K, self.L, 1, 1])  # K x L X M x N

        Apsi = tf.matrix_triangular_solve(
            L, tf.transpose(psi1, perm=[0, 1, 3, 2]), lower=True) / sigma

        tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
        AAT = tf.matrix_triangular_solve(
            L, tf.transpose(tmp, perm=[0, 1, 3, 2]), lower=True) / sigma2
        B = AAT + tf.tile(
            tf.eye(self.num_inducing, dtype=settings.float_type)[None, None],
            [self.K, self.L, 1, 1])
        LB = tf.cholesky(B)

        LBinvA = tf.matrix_triangular_solve(LB, A)  # K x L x M x N
        LBinvApsi = tf.matrix_triangular_solve(LB, Apsi)  # K x L x M x N

        err = self.Y[None] - tf.matmul(tf.transpose(LBinvA, perm=[0, 1, 3, 2]),
                                       tf.einsum('klmn,nd->klmd', LBinvApsi,
                                                 self.Y))  # K x L x N
        err = tf.squeeze(err)

        reg1 = tf.reduce_sum(tf.pow(LBinvA, 2), axis=2)  # K x L x N
        reg2 = self.kern.Kdiag(self.X) / sigma2
        reg2 = reg2 - tf.reduce_sum(tf.pow(A, 2), axis=2)  # K x L x N

        logW = -0.5 * tf.log(2 * np.pi * sigma2) \
            - 0.5 * tf.pow(err, 2) / sigma2 \
            - 0.5 * reg1 \
            - 0.5 * reg2 \
            + tf.transpose(tf.log(self.W1_prior))[:, None] \
            + tf.transpose(tf.log(self.W2_prior))[None]

        logW1 = tf.reduce_sum(logW *
                              tf.transpose(normalize(self.W2))[None, :, :],
                              axis=1)
        logW1 = logW1 - tf.reduce_logsumexp(logW1, axis=0, keepdims=True)

        logW2 = tf.reduce_sum(logW *
                              tf.transpose(normalize(self.W1))[:, None, :],
                              axis=0)
        logW2 = logW2 - tf.reduce_logsumexp(logW2, axis=0, keepdims=True)
        return tf.transpose(logW1), tf.transpose(logW2)
    def compute_qu_external(self, X, Y, W1_idx, W2_idx):
        """
        Computes the mean and variance of q(u), the variational distribution on
        inducing outputs. SVGP with this q(u) should predict identically to
        SGPR.
        :return: mu, A
        """
        Kux = features.Kuf(self.feature, self.kern, X)
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

        W1 = tf.gather(normalize(self.W1), W1_idx)  # N x K
        W2 = tf.gather(normalize(self.W2), W2_idx)

        W = _expand_W(W1, W2)
        psi1 = self._psi1(Kux, W)  # K x M x N
        psi2 = self._psi2(Kux, W)  # K x M x M

        Sig = Kuu + (self.likelihood.variance**-1) * psi2
        Sig_sqrt = tf.cholesky(Sig)

        Sig_sqrt_Kuu = tf.matrix_triangular_solve(Sig_sqrt, Kuu)

        A = tf.matmul(Sig_sqrt_Kuu, Sig_sqrt_Kuu, transpose_a=True)

        tmp = tf.matrix_triangular_solve(Sig_sqrt, psi1)
        P = tf.einsum('kmn,nd->kmd', tmp, Y)
        mu = tf.matmul(Sig_sqrt_Kuu, P,
                       transpose_a=True) * self.likelihood.variance**-1.0

        return mu[:, :, 0], tf.cholesky(A)
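_expand_W() is not defined in these snippets. A rough sketch of one possible contract (purely an assumption): the joint assignment weights over the K1*K2 components are the per-row outer product of the two factor weight matrices:

import tensorflow as tf

def _expand_W(W1, W2):
    # W1: N x K1, W2: N x K2  ->  W: N x (K1 * K2), rows still sum to one
    W = W1[:, :, None] * W2[:, None, :]
    return tf.reshape(W, [tf.shape(W1)[0], -1])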
Example 10
    def compute_qu(self):
        """
        Computes the mean and variance of q(u), the variational distribution on
        inducing outputs. SVGP with this q(u) should predict identically to
        SGPR.
        :return: mu, A
        """
        Kxu = self.kern.K(self.X, self.feature.Z)
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
        Kuu = tf.tile(Kuu[None, None], [self.K, self.L, 1, 1])

        psi1 = self._psi1(Kxu)  # K x L x N x M
        psi2 = self._psi2(Kxu)  # K x L x M x M

        Sig = Kuu + (self.likelihood.variance**-1) * psi2
        Sig_sqrt = tf.cholesky(Sig)

        Sig_sqrt_Kuu = tf.matrix_triangular_solve(Sig_sqrt, Kuu)

        A = tf.matmul(Sig_sqrt_Kuu, Sig_sqrt_Kuu, transpose_a=True)

        mu = tf.einsum(
            'klmn,nd->klmd',
            tf.matrix_triangular_solve(Sig_sqrt,
                                       tf.transpose(psi1, perm=[0, 1, 3, 2])),
            self.Y)
        mu = tf.matmul(tf.transpose(Sig_sqrt_Kuu, perm=[0, 1, 3, 2]),
                       mu) * (self.likelihood.variance**-1)

        return mu, A
    def build_prior_KL(self):
        if self.whiten:
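            # whitened case: gauss_kl then measures KL[q(u) || N(0, I)], so Kuu is not needed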
            K = None
        else:
            K = features.Kuu(
                self.feature, self.kern,
                jitter=settings.numerics.jitter_level)  # (P x) x M x M

        return kullback_leiblers.gauss_kl(self.q_mu, self.q_sqrt, K)
Example 12
    def _build_predict(self, Xnew, full_cov=False):
        """
        Compute the mean and variance of the latent function at some new points
        Xnew. For a derivation of the terms in here, see the associated SGPR
        notebook.
        """
        Kxu = self.kern.K(self.X, self.feature.Z)
        Ksu = self.kern.K(Xnew, self.feature.Z)
        Ksu = tf.tile(Ksu[None, None], [self.K, self.L, 1, 1])

        psi0 = self._psi0()  # scalar
        psi1 = self._psi1(Kxu)  # K x L X N x M
        psi2 = self._psi2(Kxu)  # K x L x M x M

        # Copy this into blocks for each dimension
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
        L = tf.cholesky(Kuu)
        L = tf.tile(L[None, None], [self.K, self.L, 1, 1])

        sigma2 = self.likelihood.variance
        sigma = tf.sqrt(sigma2)

        # Compute intermediate matrices
        A = tf.matrix_triangular_solve(
            L, tf.transpose(Ksu, perm=[0, 1, 3, 2]), lower=True) / sigma

        Apsi = tf.matrix_triangular_solve(
            L, tf.transpose(psi1, perm=[0, 1, 3, 2]), lower=True) / sigma

        tmp = tf.matrix_triangular_solve(L, psi2, lower=True)

        AAT = tf.matrix_triangular_solve(
            L, tf.transpose(tmp, perm=[0, 1, 3, 2]), lower=True) / sigma2
        B = AAT + tf.tile(
            tf.eye(self.num_inducing, dtype=settings.float_type)[None, None],
            [self.K, self.L, 1, 1])
        LB = tf.cholesky(B)

        mu = tf.einsum('klmn,nd->klmd', tf.matrix_triangular_solve(LB, Apsi),
                       self.Y)
        mu = tf.matmul(
            tf.transpose(tf.matrix_triangular_solve(LB, A), perm=[0, 1, 3, 2]),
            mu)

        LBinvA = tf.matrix_triangular_solve(LB, A)

        if full_cov:
            var = self.kern.K(Xnew, Xnew)
            var -= tf.matmul(A, A, transpose_a=True) * sigma2
            var += tf.matmul(LBinvA, LBinvA, transpose_a=True) * sigma2

        else:
            var = self.kern.Kdiag(Xnew)[None, None]
            var -= tf.reduce_sum(A**2, axis=2) * sigma2
            var += tf.reduce_sum(LBinvA**2, axis=2) * sigma2

        return mu, var
Example 13
    def _build_likelihood(self):
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood. For a derivation of the terms in here, see the associated
        SGPR notebook.
        """
        ND = tf.cast(tf.size(self.Y), settings.float_type)
        D = tf.cast(tf.shape(self.Y)[1], settings.float_type)

        Kxu = self.kern.K(self.X, self.feature.Z)

        psi0 = self._psi0()  # scalar
        psi1 = self._psi1(Kxu)  # K x L X N x M
        psi2 = self._psi2(Kxu)  # K x L x M x M

        # Copy this into blocks for each dimension
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
        L = tf.cholesky(Kuu)
        L = tf.tile(L[None, None], [self.K, self.L, 1, 1])

        sigma2 = self.likelihood.variance
        sigma = tf.sqrt(sigma2)

        # Compute intermediate matrices
        A = tf.matrix_triangular_solve(
            L, tf.transpose(psi1, perm=[0, 1, 3, 2]), lower=True) / sigma

        tmp = tf.matrix_triangular_solve(L, psi2, lower=True)

        AAT = tf.matrix_triangular_solve(
            L, tf.transpose(tmp, perm=[0, 1, 3, 2]), lower=True) / sigma2
        B = AAT + tf.tile(
            tf.eye(self.num_inducing, dtype=settings.float_type)[None, None],
            [self.K, self.L, 1, 1])
        LB = tf.cholesky(B)

        log_det_B = 2. * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))

        c = tf.matrix_triangular_solve(
            LB, tf.einsum('klmn,nd->klmd', A, self.Y), lower=True) / sigma

        # KL[q(W) || p(W)]
        W1norm = normalize(self.W1)
        W2norm = normalize(self.W2)

        KL = tf.reduce_sum(W1norm * (tf.log(W1norm) - tf.log(self.W1_prior)[None]))
        KL += tf.reduce_sum(W2norm * (tf.log(W2norm) - tf.log(self.W2_prior)[None]))

        # compute log marginal bound
        bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
        bound += -0.5 * D * log_det_B
        bound += -0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2
        bound += 0.5 * tf.reduce_sum(tf.square(c))
        bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 -
                             tf.reduce_sum(tf.matrix_diag_part(AAT)))
        bound -= KL
        return bound
    def compute_qu(self):
        """
        Computes the mean and variance of q(u), the variational distribution on
        inducing outputs. SVGP with this q(u) should predict identically to
        SGPR.
        :return: mu, A
        """
        Y = self.Y
        X = self.X
        idx = self.idx

        W1_idx = self.W1_idx
        W2_idx = self.W2_idx

        if W1_idx is None:
            W1_idx = idx
        if W2_idx is None:
            W2_idx = idx

        if self.minibatch_size is not None:
            W1 = tf.gather(self.W1, W1_idx)
            W1 = tf.reshape(W1, [-1, self.K1])
            W1 = normalize(W1)

            W2 = tf.gather(self.W2, W2_idx)
            W2 = tf.reshape(W2, [-1, self.K2])
            W2 = normalize(W2)

        else:
            W1 = normalize(self.W1)  # N x K1
            if W1_idx is not None:
                W1 = tf.gather(W1, W1_idx)
            W2 = normalize(self.W2)  # N x K2
            if W2_idx is not None:
                W2 = tf.gather(W2, W2_idx)

        Kux = features.Kuf(self.feature, self.kern, X)
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

        W = _expand_W(W1, W2)
        psi1 = self._psi1(Kux, W)  # K x M x N
        psi2 = self._psi2(Kux, W)  # K x M x M

        Sig = Kuu + (self.likelihood.variance**-1) * psi2
        Sig_sqrt = tf.cholesky(Sig)

        Sig_sqrt_Kuu = tf.matrix_triangular_solve(Sig_sqrt, Kuu)

        A = tf.matmul(Sig_sqrt_Kuu, Sig_sqrt_Kuu, transpose_a=True)

        tmp = tf.matrix_triangular_solve(Sig_sqrt, psi1)
        P = tf.einsum('kmn,nd->kmd', tmp, Y)
        mu = tf.matmul(Sig_sqrt_Kuu, P,
                       transpose_a=True) * self.likelihood.variance**-1.0

        return mu[:, :, 0], tf.cholesky(A)
Example 15
    def test_inducing_points_equivalence(self):
        # Inducing features must be the same as the kernel evaluations
        with self.test_context() as session:
            Z = np.random.randn(101, 3)
            f = features.InducingPoints(Z)

            kernels = [
                gpflow.kernels.RBF(3, 0.46, lengthscales=np.array([0.143, 1.84, 2.0]), ARD=True),
                gpflow.kernels.Periodic(3, 0.4, 1.8)
            ]

            for k in kernels:
                self.assertTrue(np.allclose(session.run(features.Kuu(f, k)), k.compute_K_symm(Z)))
Example 16
    def _build_likelihood(self):
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood. For a derivation of the terms in here, see the associated
        SGPR notebook.
        """
        ND = tf.cast(tf.size(self.Y), settings.float_type)
        D = tf.cast(tf.shape(self.Y)[1], settings.float_type)

        Kxu = self.kern.K(self.X, self.feature.Z)

        psi0 = self._psi0()
        psi1 = self._psi1(Kxu)
        psi2 = self._psi2(Kxu)

        # Copy this into blocks for each dimension
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
        L = tf.cholesky(Kuu)
        L = block_diagonal([L for _ in range(self.W.shape[1])])
        sigma2 = self.likelihood.variance
        sigma = tf.sqrt(sigma2)

        # Compute intermediate matrices
        A = tf.matrix_triangular_solve(L, tf.transpose(psi1),
                                       lower=True) / sigma
        tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
        AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp),
                                         lower=True) / sigma2
        B = AAT + tf.eye(self.num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        log_det_B = 2. * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
        c = tf.matrix_triangular_solve(LB, tf.matmul(A, self.Y),
                                       lower=True) / sigma

        # KL[q(W) || p(W)]
        KL = tf.reduce_sum(self.Wnorm() *
                           (tf.log(self.Wnorm()) - tf.log(self.W_prior)))

        # compute log marginal bound
        bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
        bound += -0.5 * D * log_det_B
        bound += -0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2
        bound += 0.5 * tf.reduce_sum(tf.square(c))
        bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 -
                             tf.reduce_sum(tf.matrix_diag_part(AAT)))
        bound -= KL
        return bound
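block_diagonal(), used above to repeat the Cholesky factor as one block per latent dimension, is not shown in these snippets. A minimal sketch for a list of square 2-D blocks, assuming that contract:

import tensorflow as tf

def block_diagonal(matrices):
    # pad each block with zero columns on either side, then stack the rows,
    # which places the blocks along the diagonal of one large matrix
    sizes = [int(m.shape[-1]) for m in matrices]
    total = sum(sizes)
    blocks, offset = [], 0
    for m, size in zip(matrices, sizes):
        blocks.append(tf.pad(m, [[0, 0], [offset, total - offset - size]]))
        offset += size
    return tf.concat(blocks, axis=0)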
Example 17
    def compute_qu(self):
        """
        Computes the mean and variance of q(u), the variational distribution on
        inducing outputs. SVGP with this q(u) should predict identically to
        SGPR.
        :return: mu, A
        """
        Kuf = features.Kuf(self.feature, self.kern, self.X)
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

        Sig = Kuu + (self.likelihood.variance**-1) * tf.matmul(
            Kuf, Kuf, transpose_b=True)
        Sig_sqrt = tf.cholesky(Sig)

        Sig_sqrt_Kuu = tf.matrix_triangular_solve(Sig_sqrt, Kuu)

        A = tf.matmul(Sig_sqrt_Kuu, Sig_sqrt_Kuu, transpose_a=True)
        mu = tf.matmul(
            Sig_sqrt_Kuu,
            tf.matrix_triangular_solve(
                Sig_sqrt, tf.matmul(Kuf, self.Y - self.mean_function(self.X))),
            transpose_a=True) * self.likelihood.variance**-1.0

        return mu, A
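In closed form the recipe above is the standard SGPR optimal q(u): with Sig = Kuu + Kuf·Kufᵀ/σ², it returns mu = Kuu·Sig⁻¹·Kuf·(Y − m(X))/σ² and A = Kuu·Sig⁻¹·Kuu. A small NumPy check of that equivalence on made-up matrices (zero mean function):

import numpy as np

rng = np.random.RandomState(1)
M, N, s2 = 4, 20, 0.1
Kuf = rng.randn(M, N)
Kuu = Kuf @ Kuf.T / N + np.eye(M)        # any symmetric positive-definite matrix
Y = rng.randn(N, 1)

Sig = Kuu + Kuf @ Kuf.T / s2
L = np.linalg.cholesky(Sig)
LiKuu = np.linalg.solve(L, Kuu)          # plays the role of Sig_sqrt_Kuu above
A = LiKuu.T @ LiKuu
mu = LiKuu.T @ np.linalg.solve(L, Kuf @ Y) / s2

assert np.allclose(A, Kuu @ np.linalg.solve(Sig, Kuu))
assert np.allclose(mu, Kuu @ np.linalg.solve(Sig, Kuf @ Y) / s2)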
    def _build_likelihood(self):
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood. For a derivation of the terms in here, see the associated
        SGPR notebook.
        """
        Y = self.Y
        X = self.X
        idx = self.idx

        W1_idx = self.W1_idx
        W2_idx = self.W2_idx

        if W1_idx is None:
            W1_idx = idx
        if W2_idx is None:
            W2_idx = idx

        if self.minibatch_size is not None:
            W1 = tf.gather(self.W1, W1_idx)
            W1 = tf.reshape(W1, [-1, self.K1])
            W1 = normalize(W1)

            W2 = tf.gather(self.W2, W2_idx)
            W2 = tf.reshape(W2, [-1, self.K2])
            W2 = normalize(W2)

        else:
            W1 = normalize(self.W1)  # N x K1
            if W1_idx is not None:
                W1 = tf.gather(W1, W1_idx)
            W2 = normalize(self.W2)  # N x K2
            if W2_idx is not None:
                W2 = tf.gather(W2, W2_idx)

        ND = tf.cast(tf.size(Y), settings.float_type)
        D = tf.cast(tf.shape(Y)[1], settings.float_type)
        sigma2 = self.likelihood.variance

        # Get kernel terms
        # Expand if necessary?
        Kdiag = self.kern.Kdiag(X, full_output_cov=False)
        Kux = features.Kuf(self.feature, self.kern, X)
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

        W = _expand_W(W1, W2)
        # compute statistics (potentially on minibatch)
        psi0 = self._psi0(Kdiag)
        psi1 = self._psi1(Kux, W)  # K x M x N
        psi2 = self._psi2(Kux, W)  # K x M x M

        L = tf.cholesky(Kuu)

        A = tf.matrix_triangular_solve(L, psi1)  # K x M x N
        a = tf.matrix_triangular_solve(L,
                                       tf.transpose(
                                           self.q_mu)[:, :, None])  # K x M x 1
        mean = tf.matmul(A, a, transpose_a=True)

        tmp1 = tf.matrix_triangular_solve(L, psi2)
        B = tf.matrix_triangular_solve(L, tf.transpose(tmp1, perm=[0, 2, 1]))

        tmp2 = tf.matrix_triangular_solve(L, self.q_sqrt)
        C = tf.matmul(tmp2, tmp2, transpose_b=True)

        # compute KL
        KL1 = self.build_prior_KL()
        KL2 = self.build_prior_assignment_KL(W1_idx, W2_idx)

        # compute log marginal bound
        bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
        bound += -0.5 * tf.reduce_sum(tf.square(Y)) / sigma2
        bound += tf.reduce_sum(Y * mean) / sigma2
        bound += -0.5 * tf.reduce_sum(
            tf.matmul(a, tf.matmul(B, a), transpose_a=True)) / sigma2
        bound += -0.5 * D * (psi0 - tf.reduce_sum(tf.matrix_diag_part(B)))
        bound += -0.5 * tf.reduce_sum(tf.einsum('kmp,kpm->km', B, C))

        if self.minibatch_size is not None:
            scale = self.num_data / self.minibatch_size
            bound *= scale

        bound -= KL2
        bound -= KL1

        return bound
    def _build_likelihood(self):
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood. For a derivation of the terms in here, see the associated
        SGPR notebook.
        """
        Y = self.Y
        X = self.X
        idx = self.idx

        W1_idx = self.W1_idx
        W2_idx = self.W2_idx

        if W1_idx is None:
            W1_idx = idx
        if W2_idx is None:
            W2_idx = idx

        if self.minibatch_size is not None:
            W1 = tf.gather(self.W1, W1_idx)
            W1 = tf.reshape(W1, [-1, self.K1])
            W1 = normalize(W1)

            W2 = tf.gather(self.W2, W2_idx)
            W2 = tf.reshape(W2, [-1, self.K2])
            W2 = normalize(W2)

        else:
            W1 = normalize(self.W1)  # N x K1
            if W1_idx is not None:
                W1 = tf.gather(W1, W1_idx)
            W2 = normalize(self.W2)  # N x K2
            if W2_idx is not None:
                W2 = tf.gather(W2, W2_idx)

        ND = tf.cast(tf.size(Y), settings.float_type)
        D = tf.cast(tf.shape(Y)[1], settings.float_type)
        sigma2 = self.likelihood.variance

        # Get kernel terms
        # Expand if necessary?
        Kdiag = self.kern.Kdiag(X, full_output_cov=False)
        Kux = features.Kuf(self.feature, self.kern, X)
        Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

        W = _expand_W(W1, W2)

        # compute KL
        KL1 = self.build_prior_KL()
        KL2 = self.build_prior_assignment_KL(
            tf.unique(W1_idx)[0],
            tf.unique(W2_idx)[0])

        fmean, fvar = self._build_predict(X,
                                          full_cov=False,
                                          full_output_cov=False)
        var_exp = self.likelihood.variational_expectations(fmean, fvar, Y)
        bound = tf.reduce_sum(W * var_exp)
        if self.minibatch_size is not None:
            bound *= self.num_data / self.minibatch_size
        bound -= KL1 + KL2
        return bound