def update_W_external(self, X, Y):
    """
    Build normalised log assignment weights for external data (X, Y).

    For every component a Gaussian expected log-likelihood is assembled
    from a squared-error term and two variance corrections, the log prior
    ``log(self.W_prior)`` is added, and the result is normalised with a
    log-sum-exp over axis 0.

    :param X: inputs handed to the kernel and to the inducing features.
    :param Y: targets that the predicted mean is compared against.
    :return: the normalised log weights, transposed.
    """
    sigma2 = self.likelihood.variance

    Kdiag = self.kern.Kdiag(X, full_output_cov=False)
    Kux = features.Kuf(self.feature, self.kern, X)
    # Kuu is built exactly once; the previous version created the same op
    # twice and discarded the first result.
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    L = tf.cholesky(Kuu)

    A = tf.cholesky_solve(L, Kux)  # K x M x N (per the original annotation)

    # Predictive mean and residual.
    mean = tf.matmul(A, tf.transpose(self.q_mu)[:, :, None], transpose_a=True)
    err = Y - mean

    # Variance contributed by q(u) ...
    reg1 = tf.reduce_sum(
        tf.pow(tf.matmul(A, self.q_sqrt, transpose_a=True), 2), 2)
    # ... and the part of the prior variance not explained by the
    # inducing features.
    reg2 = tf.transpose(Kdiag) - tf.einsum('kmn,kmn->kn', A, Kux)

    logW = -0.5 * tf.log(2 * np.pi * sigma2) \
        - 0.5 * tf.reduce_sum(tf.pow(err, 2), 2) / sigma2 \
        - 0.5 * reg1 / sigma2 \
        - 0.5 * reg2 / sigma2 \
        + tf.log(self.W_prior)[:, None]

    # Normalise across axis 0 in log space.
    logW = logW - tf.reduce_logsumexp(logW, axis=0, keepdims=True)
    return tf.transpose(logW)
def update_W_external(self, X, Y, W1_idx, W2_idx):
    """
    Build normalised log weights for the two assignment factors W1 and W2
    from external data (X, Y).

    A per-component expected log-likelihood is reshaped to
    ``[self.K1, self.K2, -1]``. W1 is updated first, using the current
    (normalised) ``self.W2``; W2 is then updated using the freshly
    computed W1. In both updates the per-point terms are grouped by the
    supplied index vector with ``tf.dynamic_partition`` and summed, the
    log prior is added, and the result is normalised over axis 1.

    :param X: inputs handed to the kernel and to the inducing features.
    :param Y: targets that the predicted mean is compared against.
    :param W1_idx: per-point index used to gather and group rows of W1.
    :param W2_idx: per-point index used to gather and group rows of W2.
    :return: tuple ``(logW1, logW2)`` of normalised log weights.
    """
    sigma2 = self.likelihood.variance

    Kdiag = self.kern.Kdiag(X, full_output_cov=False)
    Kux = features.Kuf(self.feature, self.kern, X)
    # Kuu is built exactly once; the previous version created the same op
    # twice and discarded the first result.
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    L = tf.cholesky(Kuu)

    A = tf.cholesky_solve(L, Kux)  # K x M x N (per the original annotation)
    mean = tf.matmul(A, tf.transpose(self.q_mu)[:, :, None], transpose_a=True)
    err = Y - mean

    reg1 = tf.reduce_sum(
        tf.pow(tf.matmul(A, self.q_sqrt, transpose_a=True), 2), 2)
    reg2 = tf.transpose(Kdiag) - tf.einsum('kmn,kmn->kn', A, Kux)

    logW = -0.5 * tf.log(2 * np.pi * sigma2) \
        - 0.5 * tf.reduce_sum(tf.pow(err, 2), 2) / sigma2 \
        - 0.5 * reg1 / sigma2 \
        - 0.5 * reg2 / sigma2
    logW = tf.reshape(logW, [self.K1, self.K2, -1])

    # --- new W1: average over the second factor using the current W2 ---
    W2 = tf.gather(normalize(self.W2), W2_idx)
    logW1 = tf.reduce_sum(logW * tf.transpose(W2)[None, :, :], axis=1)
    # Group the per-point rows by W1 index and sum within each group.
    logW1_parts = tf.dynamic_partition(
        tf.transpose(logW1), W1_idx, num_partitions=self.W1.shape[0])
    logW1 = tf.stack([tf.reduce_sum(part, axis=0) for part in logW1_parts])
    logW1 = logW1 + tf.log(self.W1_prior)[None]
    logW1 = logW1 - tf.reduce_logsumexp(logW1, axis=1, keepdims=True)

    # --- new W2: average over the first factor using the updated W1 ---
    W1 = tf.gather(normalize(logW1), W1_idx)
    logW2 = tf.reduce_sum(logW * tf.transpose(W1)[:, None, :], axis=0)
    # Group the per-point rows by W2 index and sum within each group.
    logW2_parts = tf.dynamic_partition(
        tf.transpose(logW2), W2_idx, num_partitions=self.W2.shape[0])
    logW2 = tf.stack([tf.reduce_sum(part, axis=0) for part in logW2_parts])
    logW2 = logW2 + tf.log(self.W2_prior)[None]
    logW2 = logW2 - tf.reduce_logsumexp(logW2, axis=1, keepdims=True)

    return logW1, logW2
def test_equivalence_inducing_points(self):
    """
    A Multiscale feature must agree with plain inducing points when its
    variance is zero: both Kuf and Kuu may differ by less than 0.1 %.
    """
    def max_percent_diff(candidate, reference):
        # Largest element-wise relative deviation, expressed in percent.
        return np.max(np.abs(candidate - reference) / reference * 100)

    with self.test_context() as session:
        rbf, feature_0lengthscale, feature_inducingpoint = self.prepare()
        Xnew = np.random.randn(13, 3)

        # Cross-covariance between the features and the test inputs.
        kuf_ops = [
            features.Kuf(feature_0lengthscale, rbf, Xnew),
            features.Kuf(feature_inducingpoint, rbf, Xnew),
        ]
        multiscale_val, point_val = session.run(kuf_ops)
        self.assertTrue(max_percent_diff(multiscale_val, point_val) < 0.1)

        # Covariance among the features themselves.
        kuu_ops = [
            features.Kuu(feature_0lengthscale, rbf),
            features.Kuu(feature_inducingpoint, rbf),
        ]
        multiscale_val, point_val = session.run(kuu_ops)
        self.assertTrue(max_percent_diff(multiscale_val, point_val) < 0.1)
def compute_upper_bound(self):
    """
    Build the sum ``const + logdet + quad`` for this model's data.

    The noise variance used in the quadratic term is inflated by the
    trace correction ``c`` (sum of Kdiag minus the squared norm of
    ``L^-1 Kuf``); the original comment attributes this trace bound to
    Titsias' presentation.

    :return: scalar tensor ``const + logdet + quad``.
    """
    noise_var = self.likelihood.variance
    n_points = tf.cast(tf.shape(self.Y)[0], settings.float_type)

    Kdiag = self.kern.Kdiag(self.X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    Kuf = features.Kuf(self.feature, self.kern, self.X)
    Kuf_Kfu = tf.matmul(Kuf, Kuf, transpose_b=True)

    chol_Kuu = tf.cholesky(Kuu)
    chol_B = tf.cholesky(Kuu + noise_var**-1.0 * Kuf_Kfu)
    Linv_Kuf = tf.matrix_triangular_solve(chol_Kuu, Kuf, lower=True)

    # Trace bound: total prior variance minus the part captured by the
    # inducing features.
    c = tf.reduce_sum(Kdiag) - tf.reduce_sum(Linv_Kuf**2.0)
    corrected_noise = noise_var + c

    const = -0.5 * n_points * tf.log(2 * np.pi * noise_var)
    logdet = tf.reduce_sum(tf.log(tf.diag_part(chol_Kuu))) \
        - tf.reduce_sum(tf.log(tf.diag_part(chol_B)))

    # Quadratic term evaluated with the inflated noise.
    chol_C = tf.cholesky(Kuu + corrected_noise**-1.0 * Kuf_Kfu)
    v = tf.matrix_triangular_solve(
        chol_C, corrected_noise**-1.0 * tf.matmul(Kuf, self.Y), lower=True)
    quad = -0.5 * corrected_noise**-1.0 * tf.reduce_sum(self.Y**2.0) \
        + 0.5 * tf.reduce_sum(v**2.0)

    return const + logdet + quad
def _build_predict(self, Xnew, full_cov=False):
    """
    Predict the latent function's mean and variance at the points Xnew.

    The derivation of the individual terms is given in the associated
    SGPR notebook.

    :param Xnew: inputs at which to predict.
    :param full_cov: if True build the full covariance over Xnew (tiled
        along a leading axis of size ``self.num_latent``); otherwise only
        the marginal variances (tiled along a trailing axis).
    :return: tuple ``(mean + self.mean_function(Xnew), var)``.
    """
    noise_std = tf.sqrt(self.likelihood.variance)
    residual = self.Y - self.mean_function(self.X)

    Kuf = features.Kuf(self.feature, self.kern, self.X)
    Kuu = features.Kuu(self.feature, self.kern,
                       jitter=settings.numerics.jitter_level)
    Kus = features.Kuf(self.feature, self.kern, Xnew)

    chol_Kuu = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(chol_Kuu, Kuf, lower=True) / noise_std
    identity = tf.eye(len(self.feature), dtype=settings.float_type)
    chol_B = tf.cholesky(tf.matmul(A, A, transpose_b=True) + identity)

    c = tf.matrix_triangular_solve(
        chol_B, tf.matmul(A, residual), lower=True) / noise_std

    # Project the test cross-covariance through both Cholesky factors.
    Linv_Kus = tf.matrix_triangular_solve(chol_Kuu, Kus, lower=True)
    LBinv_Linv_Kus = tf.matrix_triangular_solve(
        chol_B, Linv_Kus, lower=True)

    mean = tf.matmul(LBinv_Linv_Kus, c, transpose_a=True)

    if full_cov:
        var = self.kern.K(Xnew) \
            + tf.matmul(LBinv_Linv_Kus, LBinv_Linv_Kus, transpose_a=True) \
            - tf.matmul(Linv_Kus, Linv_Kus, transpose_a=True)
        var = tf.tile(var[None, ...], [self.num_latent, 1, 1])  # P x N x N
    else:
        var = self.kern.Kdiag(Xnew) \
            + tf.reduce_sum(tf.square(LBinv_Linv_Kus), 0) \
            - tf.reduce_sum(tf.square(Linv_Kus), 0)
        var = tf.tile(var[:, None], [1, self.num_latent])

    return mean + self.mean_function(Xnew), var
def test_matrix_psd(self):
    """
    The conditional variance ``Kff - Kfu Kuu^-1 Kuf`` must have strictly
    positive eigenvalues for every tested feature/kernel combination.
    """
    X = np.random.randn(13, 2)

    def init_feat(feature):
        # Build a feature of the requested class with random parameters.
        if feature is gpflow.features.InducingPoints:
            return feature(np.random.randn(71, 2))
        if feature is gpflow.features.Multiscale:
            return feature(np.random.randn(71, 2), np.random.rand(71, 2))
        return None

    combinations = [
        (gpflow.features.InducingPoints, gpflow.kernels.RBF),
        (gpflow.features.InducingPoints, gpflow.kernels.Matern12),
        (gpflow.features.Multiscale, gpflow.kernels.RBF),
    ]

    for feat_class, kern_class in combinations:
        with self.test_context() as session:
            kern = kern_class(2, 1.84, lengthscales=[0.143, 1.53])
            feature = init_feat(feat_class)

            kuf_op = features.Kuf(feature, kern, X)
            kuu_op = features.Kuu(feature, kern, jitter=settings.jitter)
            Kuf, Kuu = session.run([kuf_op, kuu_op])

            Kff = kern.compute_K_symm(X)
            Qff = Kuf.T @ np.linalg.solve(Kuu, Kuf)
            eigenvalues = np.linalg.eig(Kff - Qff)[0]
            self.assertTrue(np.all(eigenvalues > 0.0))
def compute_qu(self):
    """
    Build the mean and covariance of q(u), the variational distribution
    over the inducing outputs. An SVGP model using this q(u) should give
    the same predictions as SGPR.

    :return: tuple ``(mu, A)``.
    """
    inv_noise = self.likelihood.variance**-1

    Kux = features.Kuf(self.feature, self.kern, self.X)
    Kxu = tf.transpose(Kux)
    psi1 = self._psi1(Kxu)  # K x N x M
    psi2 = self._psi2(Kxu)  # K x M x M

    # Replicate Kuu along a leading axis of size self.W.shape[1].
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    Kuu = tf.tile(Kuu[None], [self.W.shape[1], 1, 1])

    chol_Sig = tf.cholesky(Kuu + inv_noise * psi2)
    cholinv_Kuu = tf.matrix_triangular_solve(chol_Sig, Kuu)
    A = tf.matmul(cholinv_Kuu, cholinv_Kuu, transpose_a=True)

    cholinv_psi1 = tf.matrix_triangular_solve(
        chol_Sig, tf.transpose(psi1, perm=[0, 2, 1]))
    P = tf.einsum('kmn,nd->kmd', cholinv_psi1, self.Y)
    mu = tf.matmul(cholinv_Kuu, P, transpose_a=True) \
        * self.likelihood.variance**-1.0

    return mu, A
def update_W(self):
    """
    Build normalised log weights for the two assignment factors from the
    model's own data.

    A per-point log score is assembled for every (K, L) combination from
    a residual term, two variance corrections and both log priors. It is
    then averaged against the other factor's normalised weights and
    normalised with a log-sum-exp over axis 0.

    :return: tuple ``(transpose(logW1), transpose(logW2))``.
    """
    batch_dims = [self.K, self.L, 1, 1]
    swap_last = [0, 1, 3, 2]

    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    Kxu = self.kern.K(self.X, self.feature.Z)
    psi1 = self._psi1(Kxu)  # K x L x N x M
    psi2 = self._psi2(Kxu)  # K x L x M x M

    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    chol_Kuu = tf.cholesky(Kuu)

    # Solve once against the un-tiled factor, then replicate both the
    # factor and the solution for every (K, L) combination.
    A = tf.matrix_triangular_solve(
        chol_Kuu, tf.transpose(Kxu), lower=True) / sigma
    chol_Kuu = tf.tile(chol_Kuu[None, None], batch_dims)  # K x L x M x M
    A = tf.tile(A[None, None], batch_dims)  # K x L x M x N

    Apsi = tf.matrix_triangular_solve(
        chol_Kuu, tf.transpose(psi1, perm=swap_last), lower=True) / sigma
    Linv_psi2 = tf.matrix_triangular_solve(chol_Kuu, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(
        chol_Kuu, tf.transpose(Linv_psi2, perm=swap_last),
        lower=True) / sigma2

    identity = tf.eye(self.num_inducing, dtype=settings.float_type)
    B = AAT + tf.tile(identity[None, None], batch_dims)
    LB = tf.cholesky(B)
    LBinvA = tf.matrix_triangular_solve(LB, A)  # K x L x M x N
    LBinvApsi = tf.matrix_triangular_solve(LB, Apsi)  # K x L x M x N

    prediction = tf.matmul(
        tf.transpose(LBinvA, perm=swap_last),
        tf.einsum('klmn,nd->klmd', LBinvApsi, self.Y))
    err = tf.squeeze(self.Y[None] - prediction)  # K x L x N

    reg1 = tf.reduce_sum(tf.pow(LBinvA, 2), axis=2)  # K x L x N
    reg2 = self.kern.Kdiag(self.X) / sigma2
    reg2 = reg2 - tf.reduce_sum(tf.pow(A, 2), axis=2)  # K x L x N

    logW = -0.5 * tf.log(2 * np.pi * sigma2) \
        - 0.5 * tf.pow(err, 2) / sigma2 \
        - 0.5 * reg1 \
        - 0.5 * reg2 \
        + tf.transpose(tf.log(self.W1_prior))[:, None] \
        + tf.transpose(tf.log(self.W2_prior))[None]

    # Average over the second factor to obtain the first, and vice versa.
    W2_weights = tf.transpose(normalize(self.W2))[None, :, :]
    logW1 = tf.reduce_sum(logW * W2_weights, axis=1)
    logW1 = logW1 - tf.reduce_logsumexp(logW1, axis=0, keepdims=True)

    W1_weights = tf.transpose(normalize(self.W1))[:, None, :]
    logW2 = tf.reduce_sum(logW * W1_weights, axis=0)
    logW2 = logW2 - tf.reduce_logsumexp(logW2, axis=0, keepdims=True)

    return tf.transpose(logW1), tf.transpose(logW2)
def compute_qu_external(self, X, Y, W1_idx, W2_idx):
    """
    Build q(u), the variational distribution over the inducing outputs,
    from external data instead of the model's stored data. An SVGP model
    using this q(u) should give the same predictions as SGPR.

    :param X: inputs handed to the inducing features.
    :param Y: targets.
    :param W1_idx: indices used to gather rows of the normalised W1.
    :param W2_idx: indices used to gather rows of the normalised W2.
    :return: tuple ``(mu[:, :, 0], cholesky(A))``.
    """
    inv_noise = self.likelihood.variance**-1

    Kux = features.Kuf(self.feature, self.kern, X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

    # Assignment weights of the supplied points.
    W1 = tf.gather(normalize(self.W1), W1_idx)  # N x K
    W2 = tf.gather(normalize(self.W2), W2_idx)
    W = _expand_W(W1, W2)

    psi1 = self._psi1(Kux, W)  # K x M x N
    psi2 = self._psi2(Kux, W)  # K x M x M

    chol_Sig = tf.cholesky(Kuu + inv_noise * psi2)
    cholinv_Kuu = tf.matrix_triangular_solve(chol_Sig, Kuu)
    A = tf.matmul(cholinv_Kuu, cholinv_Kuu, transpose_a=True)

    cholinv_psi1 = tf.matrix_triangular_solve(chol_Sig, psi1)
    P = tf.einsum('kmn,nd->kmd', cholinv_psi1, Y)
    mu = tf.matmul(cholinv_Kuu, P, transpose_a=True) \
        * self.likelihood.variance**-1.0

    return mu[:, :, 0], tf.cholesky(A)
def compute_qu(self):
    """
    Build the mean and covariance of q(u), the variational distribution
    over the inducing outputs, for every (K, L) combination. An SVGP
    model using this q(u) should give the same predictions as SGPR.

    :return: tuple ``(mu, A)``.
    """
    swap_last = [0, 1, 3, 2]
    inv_noise = self.likelihood.variance**-1

    Kxu = self.kern.K(self.X, self.feature.Z)

    # Replicate Kuu for every (K, L) combination.
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    Kuu = tf.tile(Kuu[None, None], [self.K, self.L, 1, 1])

    psi1 = self._psi1(Kxu)  # K x L x N x M
    psi2 = self._psi2(Kxu)  # K x L x M x M

    chol_Sig = tf.cholesky(Kuu + inv_noise * psi2)
    cholinv_Kuu = tf.matrix_triangular_solve(chol_Sig, Kuu)
    A = tf.matmul(cholinv_Kuu, cholinv_Kuu, transpose_a=True)

    cholinv_psi1 = tf.matrix_triangular_solve(
        chol_Sig, tf.transpose(psi1, perm=swap_last))
    mu = tf.einsum('klmn,nd->klmd', cholinv_psi1, self.Y)
    mu = tf.matmul(tf.transpose(cholinv_Kuu, perm=swap_last), mu) \
        * (self.likelihood.variance**-1)

    return mu, A
def build_prior_KL(self):
    """
    Build the KL term returned by ``kullback_leiblers.gauss_kl`` for
    ``(self.q_mu, self.q_sqrt)``.

    When ``self.whiten`` is set, ``None`` is passed as the prior
    covariance; otherwise Kuu (with jitter) is used.
    """
    prior_cov = None if self.whiten else features.Kuu(
        self.feature, self.kern,
        jitter=settings.numerics.jitter_level)  # (P x) x M x M
    return kullback_leiblers.gauss_kl(self.q_mu, self.q_sqrt, prior_cov)
def _build_predict(self, Xnew, full_cov=False):
    """
    Predict the latent function's mean and variance at the points Xnew
    for every (K, L) combination.

    The derivation of the individual terms is given in the associated
    SGPR notebook.

    :param Xnew: inputs at which to predict.
    :param full_cov: if True build full covariances over Xnew, otherwise
        only marginal variances.
    :return: tuple ``(mu, var)``.
    """
    batch_dims = [self.K, self.L, 1, 1]
    swap_last = [0, 1, 3, 2]

    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    Kxu = self.kern.K(self.X, self.feature.Z)
    Ksu = self.kern.K(Xnew, self.feature.Z)
    Ksu = tf.tile(Ksu[None, None], batch_dims)

    # psi0 is not needed for prediction, so it is no longer computed here.
    psi1 = self._psi1(Kxu)  # K x L X N x M
    psi2 = self._psi2(Kxu)  # K x L x M x M

    # Replicate the Cholesky factor of Kuu for every (K, L) combination.
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    L = tf.cholesky(Kuu)
    L = tf.tile(L[None, None], batch_dims)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(
        L, tf.transpose(Ksu, perm=swap_last), lower=True) / sigma
    Apsi = tf.matrix_triangular_solve(
        L, tf.transpose(psi1, perm=swap_last), lower=True) / sigma
    tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(
        L, tf.transpose(tmp, perm=swap_last), lower=True) / sigma2
    B = AAT + tf.tile(
        tf.eye(self.num_inducing, dtype=settings.float_type)[None, None],
        batch_dims)
    LB = tf.cholesky(B)

    # Solved once and shared by the mean and the variance; the previous
    # version built this same solve twice.
    LBinvA = tf.matrix_triangular_solve(LB, A)

    mu = tf.einsum('klmn,nd->klmd',
                   tf.matrix_triangular_solve(LB, Apsi), self.Y)
    mu = tf.matmul(tf.transpose(LBinvA, perm=swap_last), mu)

    if full_cov:
        var = self.kern.K(Xnew, Xnew)
        var -= tf.matmul(A, A, transpose_a=True) * sigma2
        var += tf.matmul(LBinvA, LBinvA, transpose_a=True) * sigma2
    else:
        var = self.kern.Kdiag(Xnew)[None, None]
        var -= tf.reduce_sum(A**2, axis=2) * sigma2
        var += tf.reduce_sum(LBinvA**2, axis=2) * sigma2

    return mu, var
def _build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood, summed over every (K, L) combination, minus the KL
    divergence of the assignment weights from their priors.

    For a derivation of the terms in here, see the associated SGPR
    notebook.

    :return: scalar tensor holding the bound.
    """
    batch_dims = [self.K, self.L, 1, 1]
    swap_last = [0, 1, 3, 2]

    ND = tf.cast(tf.size(self.Y), settings.float_type)
    D = tf.cast(tf.shape(self.Y)[1], settings.float_type)

    Kxu = self.kern.K(self.X, self.feature.Z)

    psi0 = self._psi0()  # scalar
    psi1 = self._psi1(Kxu)  # K x L X N x M
    psi2 = self._psi2(Kxu)  # K x L x M x M

    # Replicate the Cholesky factor of Kuu for every (K, L) combination.
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    L = tf.cholesky(Kuu)
    L = tf.tile(L[None, None], batch_dims)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(
        L, tf.transpose(psi1, perm=swap_last), lower=True) / sigma
    tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(
        L, tf.transpose(tmp, perm=swap_last), lower=True) / sigma2
    B = AAT + tf.tile(
        tf.eye(self.num_inducing, dtype=settings.float_type)[None, None],
        batch_dims)
    LB = tf.cholesky(B)
    log_det_B = 2. * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
    c = tf.matrix_triangular_solve(
        LB, tf.einsum('klmn,nd->klmd', A, self.Y), lower=True) / sigma

    # KL[q(W) || p(W)] = sum q * (log q - log p).
    # The priors are stored as probabilities, so their logarithm has to
    # be taken here; previously the raw prior was subtracted from log q.
    W1norm = normalize(self.W1)
    W2norm = normalize(self.W2)
    KL = tf.reduce_sum(
        W1norm * (tf.log(W1norm) - tf.log(self.W1_prior)[None]))
    KL += tf.reduce_sum(
        W2norm * (tf.log(W2norm) - tf.log(self.W2_prior)[None]))

    # compute log marginal bound
    bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_B
    bound += -0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 -
                         tf.reduce_sum(tf.matrix_diag_part(AAT)))
    bound -= KL

    return bound
def compute_qu(self):
    """
    Build q(u), the variational distribution over the inducing outputs,
    from the model's (possibly minibatched) data. An SVGP model using
    this q(u) should give the same predictions as SGPR.

    :return: tuple ``(mu[:, :, 0], cholesky(A))``.
    """
    X, Y = self.X, self.Y

    # Fall back to the shared index when no factor-specific one is set.
    shared_idx = self.idx
    W1_idx = shared_idx if self.W1_idx is None else self.W1_idx
    W2_idx = shared_idx if self.W2_idx is None else self.W2_idx

    if self.minibatch_size is None:
        # Normalise the full tables, then pick rows when an index exists.
        W1 = normalize(self.W1)  # N x K1
        if W1_idx is not None:
            W1 = tf.gather(W1, W1_idx)
        W2 = normalize(self.W2)  # N x K2
        if W2_idx is not None:
            W2 = tf.gather(W2, W2_idx)
    else:
        # Pick the minibatch rows first, then normalise only those.
        W1 = normalize(
            tf.reshape(tf.gather(self.W1, W1_idx), [-1, self.K1]))
        W2 = normalize(
            tf.reshape(tf.gather(self.W2, W2_idx), [-1, self.K2]))

    Kux = features.Kuf(self.feature, self.kern, X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    W = _expand_W(W1, W2)

    psi1 = self._psi1(Kux, W)  # K x M x N
    psi2 = self._psi2(Kux, W)  # K x M x M

    inv_noise = self.likelihood.variance**-1
    chol_Sig = tf.cholesky(Kuu + inv_noise * psi2)
    cholinv_Kuu = tf.matrix_triangular_solve(chol_Sig, Kuu)
    A = tf.matmul(cholinv_Kuu, cholinv_Kuu, transpose_a=True)

    cholinv_psi1 = tf.matrix_triangular_solve(chol_Sig, psi1)
    P = tf.einsum('kmn,nd->kmd', cholinv_psi1, Y)
    mu = tf.matmul(cholinv_Kuu, P, transpose_a=True) \
        * self.likelihood.variance**-1.0

    return mu[:, :, 0], tf.cholesky(A)
def test_inducing_points_equivalence(self):
    """
    For plain inducing points, Kuu must equal the kernel evaluated
    symmetrically on the inducing inputs.
    """
    with self.test_context() as session:
        Z = np.random.randn(101, 3)
        f = features.InducingPoints(Z)

        ard_rbf = gpflow.kernels.RBF(
            3, 0.46, lengthscales=np.array([0.143, 1.84, 2.0]), ARD=True)
        periodic = gpflow.kernels.Periodic(3, 0.4, 1.8)

        for kern in (ard_rbf, periodic):
            Kuu_value = session.run(features.Kuu(f, kern))
            expected = kern.compute_K_symm(Z)
            self.assertTrue(np.allclose(Kuu_value, expected))
def _build_likelihood(self):
    """
    Build the tensorflow expression for the bound on the marginal
    likelihood, minus the KL divergence of the normalised weights from
    ``self.W_prior``.

    The Cholesky factor of Kuu is repeated ``self.W.shape[1]`` times on
    a block diagonal. For a derivation of the terms, see the associated
    SGPR notebook.

    :return: scalar tensor holding the bound.
    """
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)
    ND = tf.cast(tf.size(self.Y), settings.float_type)
    D = tf.cast(tf.shape(self.Y)[1], settings.float_type)

    Kxu = self.kern.K(self.X, self.feature.Z)
    psi0 = self._psi0()
    psi1 = self._psi1(Kxu)
    psi2 = self._psi2(Kxu)

    # One copy of the Cholesky factor per block.
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    chol_Kuu = tf.cholesky(Kuu)
    L = block_diagonal([chol_Kuu for _ in range(self.W.shape[1])])

    # Intermediate matrices.
    A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    Linv_psi2 = tf.matrix_triangular_solve(L, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(
        L, tf.transpose(Linv_psi2), lower=True) / sigma2
    B = AAT + tf.eye(self.num_inducing, dtype=settings.float_type)
    LB = tf.cholesky(B)
    log_det_B = 2. * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
    c = tf.matrix_triangular_solve(
        LB, tf.matmul(A, self.Y), lower=True) / sigma

    # KL[q(W) || p(W)]
    log_ratio = tf.log(self.Wnorm()) - tf.log(self.W_prior)
    KL = tf.reduce_sum(self.Wnorm() * log_ratio)

    # Assemble the bound term by term.
    trace_term = tf.reduce_sum(psi0) / sigma2 \
        - tf.reduce_sum(tf.matrix_diag_part(AAT))
    bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_B
    bound += -0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * D * trace_term
    bound -= KL

    return bound
def compute_qu(self):
    """
    Build the mean and covariance of q(u), the variational distribution
    over the inducing outputs. An SVGP model using this q(u) should give
    the same predictions as SGPR.

    :return: tuple ``(mu, A)``.
    """
    inv_noise = self.likelihood.variance**-1
    residual = self.Y - self.mean_function(self.X)

    Kuf = features.Kuf(self.feature, self.kern, self.X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

    Sig = Kuu + inv_noise * tf.matmul(Kuf, Kuf, transpose_b=True)
    chol_Sig = tf.cholesky(Sig)
    cholinv_Kuu = tf.matrix_triangular_solve(chol_Sig, Kuu)

    A = tf.matmul(cholinv_Kuu, cholinv_Kuu, transpose_a=True)

    cholinv_Kuf_res = tf.matrix_triangular_solve(
        chol_Sig, tf.matmul(Kuf, residual))
    mu = tf.matmul(cholinv_Kuu, cholinv_Kuf_res, transpose_a=True) \
        * self.likelihood.variance**-1.0

    return mu, A
def _build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood for the (possibly minibatched) data, with an explicit
    q(u) given by ``self.q_mu`` and ``self.q_sqrt``.

    The expected log-likelihood is scaled by
    ``num_data / minibatch_size`` when minibatching is active; the
    assignment KL is subtracted after that scaling and the prior KL is
    subtracted last. For a derivation of the terms, see the associated
    SGPR notebook.

    :return: scalar tensor holding the bound.
    """
    Y = self.Y
    X = self.X

    # Fall back to the shared index when no factor-specific one is set.
    idx = self.idx
    W1_idx = self.W1_idx
    W2_idx = self.W2_idx
    if W1_idx is None:
        W1_idx = idx
    if W2_idx is None:
        W2_idx = idx

    if self.minibatch_size is not None:
        # Pick the minibatch rows first, then normalise only those.
        W1 = tf.gather(self.W1, W1_idx)
        W1 = tf.reshape(W1, [-1, self.K1])
        W1 = normalize(W1)

        W2 = tf.gather(self.W2, W2_idx)
        W2 = tf.reshape(W2, [-1, self.K2])
        W2 = normalize(W2)
    else:
        W1 = normalize(self.W1)  # N x K1
        if W1_idx is not None:
            W1 = tf.gather(W1, W1_idx)

        W2 = normalize(self.W2)  # N x K2
        if W2_idx is not None:
            W2 = tf.gather(W2, W2_idx)

    ND = tf.cast(tf.size(Y), settings.float_type)
    D = tf.cast(tf.shape(Y)[1], settings.float_type)
    sigma2 = self.likelihood.variance

    # Kernel terms.
    Kdiag = self.kern.Kdiag(X, full_output_cov=False)
    Kux = features.Kuf(self.feature, self.kern, X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    W = _expand_W(W1, W2)

    # compute statistics (potentially on minibatch)
    psi0 = self._psi0(Kdiag)
    psi1 = self._psi1(Kux, W)  # K x M x N
    psi2 = self._psi2(Kux, W)  # K x M x M

    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, psi1)  # K x M x N
    a = tf.matrix_triangular_solve(
        L, tf.transpose(self.q_mu)[:, :, None])  # K x M x 1
    mean = tf.matmul(A, a, transpose_a=True)

    tmp1 = tf.matrix_triangular_solve(L, psi2)
    B = tf.matrix_triangular_solve(L, tf.transpose(tmp1, perm=[0, 2, 1]))

    tmp2 = tf.matrix_triangular_solve(L, self.q_sqrt)
    C = tf.matmul(tmp2, tmp2, transpose_b=True)

    # compute KL
    KL1 = self.build_prior_KL()
    KL2 = self.build_prior_assignment_KL(W1_idx, W2_idx)

    # compute log marginal bound
    bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    bound += -0.5 * tf.reduce_sum(tf.square(Y)) / sigma2
    bound += tf.reduce_sum(Y * mean) / sigma2
    bound += -0.5 * tf.reduce_sum(
        tf.matmul(a, tf.matmul(B, a), transpose_a=True)) / sigma2
    # Both variance corrections belong to the same Gaussian quadratic
    # form as the three terms above, so they carry the same 1 / sigma2
    # factor. It was previously missing from these two lines.
    bound += -0.5 * D * \
        (psi0 - tf.reduce_sum(tf.matrix_diag_part(B))) / sigma2
    bound += -0.5 * tf.reduce_sum(
        tf.einsum('kmp,kpm->km', B, C)) / sigma2

    if self.minibatch_size is not None:
        # Rescale the minibatch estimate to the full data set.
        scale = self.num_data / self.minibatch_size
        bound *= scale
    bound -= KL2
    bound -= KL1

    return bound
def _build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood from the likelihood's variational expectations, weighted
    by the expanded assignment weights.

    The weighted sum is scaled by ``num_data / minibatch_size`` when
    minibatching is active, and both KL terms are subtracted afterwards.

    :return: scalar tensor holding the bound.
    """
    Y = self.Y
    X = self.X

    # Fall back to the shared index when no factor-specific one is set.
    idx = self.idx
    W1_idx = self.W1_idx
    W2_idx = self.W2_idx
    if W1_idx is None:
        W1_idx = idx
    if W2_idx is None:
        W2_idx = idx

    if self.minibatch_size is not None:
        # Pick the minibatch rows first, then normalise only those.
        W1 = tf.gather(self.W1, W1_idx)
        W1 = tf.reshape(W1, [-1, self.K1])
        W1 = normalize(W1)

        W2 = tf.gather(self.W2, W2_idx)
        W2 = tf.reshape(W2, [-1, self.K2])
        W2 = normalize(W2)
    else:
        W1 = normalize(self.W1)  # N x K1
        if W1_idx is not None:
            W1 = tf.gather(W1, W1_idx)

        W2 = normalize(self.W2)  # N x K2
        if W2_idx is not None:
            W2 = tf.gather(W2, W2_idx)

    W = _expand_W(W1, W2)

    # compute KL
    KL1 = self.build_prior_KL()
    KL2 = self.build_prior_assignment_KL(
        tf.unique(W1_idx)[0], tf.unique(W2_idx)[0])

    fmean, fvar = self._build_predict(
        X, full_cov=False, full_output_cov=False)
    var_exp = self.likelihood.variational_expectations(fmean, fvar, Y)

    bound = tf.reduce_sum(W * var_exp)
    if self.minibatch_size is not None:
        # Rescale the minibatch estimate to the full data set. Without
        # this guard the division failed when minibatching is disabled
        # (minibatch_size is None).
        bound *= self.num_data / self.minibatch_size
    bound -= KL1 + KL2

    return bound