def _build_predict(self, Xnew, full_cov=False):
    """
    Compute the mean and variance of the latent function at some new points
    Xnew. For a derivation of the terms in here, see the associated SGPR
    notebook.
    """
    num_inducing = len(self.feature)
    err = self.Y - self.mean_function(self.X)
    Kuf = features.Kuf(self.feature, self.kern, self.X)
    Kuu = features.Kuu(self.feature, self.kern,
                       jitter=settings.numerics.jitter_level)
    Kus = features.Kuf(self.feature, self.kern, Xnew)
    sigma = tf.sqrt(self.likelihood.variance)
    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    B = tf.matmul(A, A, transpose_b=True) + tf.eye(num_inducing,
                                                   dtype=settings.float_type)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
    tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
    tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
    mean = tf.matmul(tmp2, c, transpose_a=True)
    if full_cov:
        var = self.kern.K(Xnew) + tf.matmul(tmp2, tmp2, transpose_a=True) \
              - tf.matmul(tmp1, tmp1, transpose_a=True)
        var = tf.tile(var[None, ...], [self.num_latent, 1, 1])  # P x N x N
    else:
        var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
              - tf.reduce_sum(tf.square(tmp1), 0)
        var = tf.tile(var[:, None], [1, self.num_latent])
    return mean + self.mean_function(Xnew), var
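# The Cholesky gymnastics above implement the standard SGPR predictive
# equations. As a sanity reference (a minimal dense NumPy sketch, not part of
# the model code; the function name and arguments are illustrative), the same
# quantities can be written directly as
#   mean = Kus^T Sig^{-1} Kuf err / sigma^2,
#   var  = Kss_diag - diag(Kus^T Kuu^{-1} Kus) + diag(Kus^T Sig^{-1} Kus),
# with Sig = Kuu + Kuf Kuf^T / sigma^2.
import numpy as np

def sgpr_predict_reference(Kuu, Kuf, Kus, Kss_diag, err, sigma2):
    """Direct (dense) SGPR predictive moments, for checking only."""
    Sig = Kuu + Kuf @ Kuf.T / sigma2
    mean = Kus.T @ np.linalg.solve(Sig, Kuf @ err) / sigma2
    var = (Kss_diag
           - np.einsum('mn,mn->n', Kus, np.linalg.solve(Kuu, Kus))
           + np.einsum('mn,mn->n', Kus, np.linalg.solve(Sig, Kus)))
    return mean, var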
def test_equivalence_inducing_points(self):
    # Multiscale must be equivalent to plain inducing points when its
    # per-point length-scales are zero.
    with self.test_context() as session:
        rbf, feature_0lengthscale, feature_inducingpoint = self.prepare()
        Xnew = np.random.randn(13, 3)

        ms, point = session.run([features.Kuf(feature_0lengthscale, rbf, Xnew),
                                 features.Kuf(feature_inducingpoint, rbf, Xnew)])
        pd = np.max(np.abs(ms - point) / point * 100)  # percentage difference
        self.assertTrue(pd < 0.1)

        ms, point = session.run([features.Kuu(feature_0lengthscale, rbf),
                                 features.Kuu(feature_inducingpoint, rbf)])
        pd = np.max(np.abs(ms - point) / point * 100)
        self.assertTrue(pd < 0.1)
def compute_qu_external(self, X, Y, W1_idx, W2_idx):
    """
    Computes the mean and variance of q(u), the variational distribution on
    inducing outputs. SVGP with this q(u) should predict identically to SGPR.
    :return: mu, Cholesky factor of the covariance A
    """
    Kux = features.Kuf(self.feature, self.kern, X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

    W1 = tf.gather(normalize(self.W1), W1_idx)  # N x K1
    W2 = tf.gather(normalize(self.W2), W2_idx)  # N x K2
    W = _expand_W(W1, W2)

    psi1 = self._psi1(Kux, W)  # K x M x N
    psi2 = self._psi2(Kux, W)  # K x M x M

    Sig = Kuu + (self.likelihood.variance**-1) * psi2
    Sig_sqrt = tf.cholesky(Sig)
    Sig_sqrt_Kuu = tf.matrix_triangular_solve(Sig_sqrt, Kuu)
    A = tf.matmul(Sig_sqrt_Kuu, Sig_sqrt_Kuu, transpose_a=True)

    tmp = tf.matrix_triangular_solve(Sig_sqrt, psi1)
    P = tf.einsum('kmn,nd->kmd', tmp, Y)
    mu = tf.matmul(Sig_sqrt_Kuu, P,
                   transpose_a=True) * self.likelihood.variance**-1.0

    return mu[:, :, 0], tf.cholesky(A)
def update_W_external(self, X, Y):
    Kdiag = self.kern.Kdiag(X, full_output_cov=False)
    Kux = features.Kuf(self.feature, self.kern, X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

    L = tf.cholesky(Kuu)
    sigma2 = self.likelihood.variance
    A = tf.cholesky_solve(L, Kux)  # K x M x N
    mean = tf.matmul(A, tf.transpose(self.q_mu)[:, :, None],
                     transpose_a=True)
    err = Y - mean
    # reg1: contribution of the q(u) covariance; reg2: conditional variance
    # Kdiag - diag(Qff).
    reg1 = tf.reduce_sum(
        tf.pow(tf.matmul(A, self.q_sqrt, transpose_a=True), 2), 2)
    reg2 = tf.transpose(Kdiag) - tf.einsum('kmn,kmn->kn', A, Kux)

    logW = -0.5 * tf.log(2 * np.pi * sigma2) \
           - 0.5 * tf.reduce_sum(tf.pow(err, 2), 2) / sigma2 \
           - 0.5 * reg1 / sigma2 - 0.5 * reg2 / sigma2 \
           + tf.log(self.W_prior)[:, None]
    logW = logW - tf.reduce_logsumexp(logW, axis=0, keepdims=True)
    return tf.transpose(logW)
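# In matrix form, the update above assembles an unnormalised log-responsibility
#   log W_{kn} = -0.5 log(2 pi sigma^2)
#                - (err^2_{kn} + reg1_{kn} + reg2_{kn}) / (2 sigma^2) + log p_k
# and normalises over the component axis. A NumPy sketch of that final step
# (function and argument names are illustrative, not part of the codebase):
import numpy as np
from scipy.special import logsumexp

def assemble_log_responsibilities(err2, reg1, reg2, log_prior, sigma2):
    """err2, reg1, reg2: K x N arrays; log_prior: length-K vector."""
    logW = (-0.5 * np.log(2 * np.pi * sigma2)
            - 0.5 * (err2 + reg1 + reg2) / sigma2
            + log_prior[:, None])
    # After this, each column sums to one in probability space.
    return logW - logsumexp(logW, axis=0, keepdims=True)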
def compute_qu(self):
    """
    Computes the mean and variance of q(u), the variational distribution on
    inducing outputs. SVGP with this q(u) should predict identically to SGPR.
    :return: mu, A
    """
    Kux = features.Kuf(self.feature, self.kern, self.X)
    psi1 = self._psi1(tf.transpose(Kux))  # K x N x M
    psi2 = self._psi2(tf.transpose(Kux))  # K x M x M

    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    Kuu = tf.tile(Kuu[None], [self.W.shape[1], 1, 1])

    Sig = Kuu + (self.likelihood.variance**-1) * psi2
    Sig_sqrt = tf.cholesky(Sig)
    Sig_sqrt_Kuu = tf.matrix_triangular_solve(Sig_sqrt, Kuu)
    A = tf.matmul(Sig_sqrt_Kuu, Sig_sqrt_Kuu, transpose_a=True)

    tmp = tf.matrix_triangular_solve(Sig_sqrt,
                                     tf.transpose(psi1, perm=[0, 2, 1]))
    P = tf.einsum('kmn,nd->kmd', tmp, self.Y)
    mu = tf.matmul(Sig_sqrt_Kuu, P,
                   transpose_a=True) * self.likelihood.variance**-1.0

    return mu, A
def compute_upper_bound(self):
    num_data = tf.cast(tf.shape(self.Y)[0], settings.float_type)

    Kdiag = self.kern.Kdiag(self.X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    Kuf = features.Kuf(self.feature, self.kern, self.X)

    L = tf.cholesky(Kuu)
    LB = tf.cholesky(Kuu + self.likelihood.variance**-1.0 *
                     tf.matmul(Kuf, Kuf, transpose_b=True))

    LinvKuf = tf.matrix_triangular_solve(L, Kuf, lower=True)
    # Using the trace bound, from Titsias' presentation
    c = tf.reduce_sum(Kdiag) - tf.reduce_sum(LinvKuf**2.0)
    # Kff = self.kern.K(self.X)
    # Qff = tf.matmul(Kuf, LinvKuf, transpose_a=True)
    # Alternative bound on the max eigenvalue:
    # c = tf.reduce_max(tf.reduce_sum(tf.abs(Kff - Qff), 0))
    corrected_noise = self.likelihood.variance + c

    const = -0.5 * num_data * tf.log(2 * np.pi * self.likelihood.variance)
    logdet = tf.reduce_sum(tf.log(tf.diag_part(L))) - tf.reduce_sum(
        tf.log(tf.diag_part(LB)))

    LC = tf.cholesky(Kuu + corrected_noise**-1.0 *
                     tf.matmul(Kuf, Kuf, transpose_b=True))
    v = tf.matrix_triangular_solve(LC, corrected_noise**-1.0 *
                                   tf.matmul(Kuf, self.Y), lower=True)
    quad = -0.5 * corrected_noise**-1.0 * tf.reduce_sum(
        self.Y**2.0) + 0.5 * tf.reduce_sum(v**2.0)

    return const + logdet + quad
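# The trace term c above is tr(Kff - Qff) with Qff = Kfu Kuu^{-1} Kuf,
# computed without ever forming Qff: tr(Kfu Kuu^{-1} Kuf) equals the squared
# Frobenius norm of L^{-1} Kuf. A NumPy sketch of the same quantity
# (illustrative helper; Kdiag, Kuu, Kuf assumed given as arrays):
import numpy as np

def trace_slack(Kdiag, Kuu, Kuf):
    """tr(Kff) - tr(Kfu Kuu^{-1} Kuf) via a triangular solve."""
    LinvKuf = np.linalg.solve(np.linalg.cholesky(Kuu), Kuf)
    return np.sum(Kdiag) - np.sum(LinvKuf**2)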
def test_matrix_psd(self):
    # Conditional variance must be PSD.
    X = np.random.randn(13, 2)

    def init_feat(feature):
        if feature is gpflow.features.InducingPoints:
            return feature(np.random.randn(71, 2))
        elif feature is gpflow.features.Multiscale:
            return feature(np.random.randn(71, 2), np.random.rand(71, 2))

    featkerns = [(gpflow.features.InducingPoints, gpflow.kernels.RBF),
                 (gpflow.features.InducingPoints, gpflow.kernels.Matern12),
                 (gpflow.features.Multiscale, gpflow.kernels.RBF)]
    for feat_class, kern_class in featkerns:
        with self.test_context() as session:
            kern = kern_class(2, 1.84, lengthscales=[0.143, 1.53])
            feature = init_feat(feat_class)
            Kuf, Kuu = session.run([
                features.Kuf(feature, kern, X),
                features.Kuu(feature, kern, jitter=settings.jitter)
            ])
            Kff = kern.compute_K_symm(X)
            Qff = Kuf.T @ np.linalg.solve(Kuu, Kuf)
            self.assertTrue(np.all(np.linalg.eig(Kff - Qff)[0] > 0.0))
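# An aside on the eigenvalue check: np.linalg.eig can return complex values
# for a symmetric matrix because of round-off, whereas np.linalg.eigvalsh is
# the symmetric-safe routine, and a small negative tolerance avoids spurious
# failures at machine precision. A sketch (hypothetical helper, not part of
# the test suite):
import numpy as np

def is_psd(mat, tol=1e-10):
    """PSD check for a (nearly) symmetric matrix, tolerant to round-off."""
    sym = (mat + mat.T) / 2.0
    return bool(np.all(np.linalg.eigvalsh(sym) > -tol))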
def compute_qu(self):
    """
    Computes the mean and variance of q(u), the variational distribution on
    inducing outputs. SVGP with this q(u) should predict identically to SGPR.
    :return: mu, Cholesky factor of the covariance A
    """
    Y = self.Y
    X = self.X
    idx = self.idx
    W1_idx = self.W1_idx
    W2_idx = self.W2_idx
    if W1_idx is None:
        W1_idx = idx
    if W2_idx is None:
        W2_idx = idx

    if self.minibatch_size is not None:
        W1 = tf.gather(self.W1, W1_idx)
        W1 = tf.reshape(W1, [-1, self.K1])
        W1 = normalize(W1)
        W2 = tf.gather(self.W2, W2_idx)
        W2 = tf.reshape(W2, [-1, self.K2])
        W2 = normalize(W2)
    else:
        W1 = normalize(self.W1)  # N x K1
        if W1_idx is not None:
            W1 = tf.gather(W1, W1_idx)
        W2 = normalize(self.W2)  # N x K2
        if W2_idx is not None:
            W2 = tf.gather(W2, W2_idx)

    Kux = features.Kuf(self.feature, self.kern, X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    W = _expand_W(W1, W2)
    psi1 = self._psi1(Kux, W)  # K x M x N
    psi2 = self._psi2(Kux, W)  # K x M x M

    Sig = Kuu + (self.likelihood.variance**-1) * psi2
    Sig_sqrt = tf.cholesky(Sig)
    Sig_sqrt_Kuu = tf.matrix_triangular_solve(Sig_sqrt, Kuu)
    A = tf.matmul(Sig_sqrt_Kuu, Sig_sqrt_Kuu, transpose_a=True)

    tmp = tf.matrix_triangular_solve(Sig_sqrt, psi1)
    P = tf.einsum('kmn,nd->kmd', tmp, Y)
    mu = tf.matmul(Sig_sqrt_Kuu, P,
                   transpose_a=True) * self.likelihood.variance**-1.0

    return mu[:, :, 0], tf.cholesky(A)
def update_W_external(self, X, Y, W1_idx, W2_idx):
    Kdiag = self.kern.Kdiag(X, full_output_cov=False)
    Kux = features.Kuf(self.feature, self.kern, X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

    L = tf.cholesky(Kuu)
    sigma2 = self.likelihood.variance
    A = tf.cholesky_solve(L, Kux)  # K x M x N
    mean = tf.matmul(A, tf.transpose(self.q_mu)[:, :, None],
                     transpose_a=True)
    err = Y - mean
    reg1 = tf.reduce_sum(
        tf.pow(tf.matmul(A, self.q_sqrt, transpose_a=True), 2), 2)
    reg2 = tf.transpose(Kdiag) - tf.einsum('kmn,kmn->kn', A, Kux)

    logW = -0.5 * tf.log(2 * np.pi * sigma2) \
           - 0.5 * tf.reduce_sum(tf.pow(err, 2), 2) / sigma2 \
           - 0.5 * reg1 / sigma2 - 0.5 * reg2 / sigma2
    logW = tf.reshape(logW, [self.K1, self.K2, -1])

    # compute new W1
    W2 = tf.gather(normalize(self.W2), W2_idx)
    logW1 = tf.reduce_sum(logW * tf.transpose(W2)[None, :, :], axis=1)
    # group by index
    logW1_parts = tf.dynamic_partition(
        tf.transpose(logW1), W1_idx, num_partitions=self.W1.shape[0])
    logW1 = tf.stack([tf.reduce_sum(part, axis=0) for part in logW1_parts])
    logW1 = logW1 + tf.log(self.W1_prior)[None]
    logW1 = logW1 - tf.reduce_logsumexp(logW1, axis=1, keepdims=True)

    # compute new W2
    W1 = tf.gather(normalize(logW1), W1_idx)
    logW2 = tf.reduce_sum(logW * tf.transpose(W1)[:, None, :], axis=0)
    # group by index
    logW2_parts = tf.dynamic_partition(
        tf.transpose(logW2), W2_idx, num_partitions=self.W2.shape[0])
    logW2 = tf.stack([tf.reduce_sum(part, axis=0) for part in logW2_parts])
    logW2 = logW2 + tf.log(self.W2_prior)[None]
    logW2 = logW2 - tf.reduce_logsumexp(logW2, axis=1, keepdims=True)

    return logW1, logW2
def compute_qu(self):
    """
    Computes the mean and variance of q(u), the variational distribution on
    inducing outputs. SVGP with this q(u) should predict identically to SGPR.
    :return: mu, A
    """
    Kuf = features.Kuf(self.feature, self.kern, self.X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)

    Sig = Kuu + (self.likelihood.variance**-1) * tf.matmul(
        Kuf, Kuf, transpose_b=True)
    Sig_sqrt = tf.cholesky(Sig)
    Sig_sqrt_Kuu = tf.matrix_triangular_solve(Sig_sqrt, Kuu)

    A = tf.matmul(Sig_sqrt_Kuu, Sig_sqrt_Kuu, transpose_a=True)
    mu = tf.matmul(
        Sig_sqrt_Kuu,
        tf.matrix_triangular_solve(
            Sig_sqrt, tf.matmul(Kuf, self.Y - self.mean_function(self.X))),
        transpose_a=True) * self.likelihood.variance**-1.0

    return mu, A
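# Unwinding the Cholesky algebra above: with Sig = Kuu + Kuf Kuf^T / sigma^2,
# the optimal q(u) moments for a Gaussian likelihood are
#   mu = Kuu Sig^{-1} Kuf err / sigma^2  and  A = Kuu Sig^{-1} Kuu.
# A dense NumPy reference (sketch only; names are illustrative):
import numpy as np

def optimal_qu_reference(Kuu, Kuf, err, sigma2):
    """Optimal q(u) mean and covariance, computed naively for comparison."""
    Sig = Kuu + Kuf @ Kuf.T / sigma2
    mu = Kuu @ np.linalg.solve(Sig, Kuf @ err) / sigma2
    A = Kuu @ np.linalg.solve(Sig, Kuu)
    return mu, A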
def _build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood. For a derivation of the terms in here, see the associated
    SGPR notebook.
    """
    Y = self.Y
    X = self.X
    idx = self.idx
    W1_idx = self.W1_idx
    W2_idx = self.W2_idx
    if W1_idx is None:
        W1_idx = idx
    if W2_idx is None:
        W2_idx = idx

    if self.minibatch_size is not None:
        W1 = tf.gather(self.W1, W1_idx)
        W1 = tf.reshape(W1, [-1, self.K1])
        W1 = normalize(W1)
        W2 = tf.gather(self.W2, W2_idx)
        W2 = tf.reshape(W2, [-1, self.K2])
        W2 = normalize(W2)
    else:
        W1 = normalize(self.W1)  # N x K1
        if W1_idx is not None:
            W1 = tf.gather(W1, W1_idx)
        W2 = normalize(self.W2)  # N x K2
        if W2_idx is not None:
            W2 = tf.gather(W2, W2_idx)

    ND = tf.cast(tf.size(Y), settings.float_type)
    D = tf.cast(tf.shape(Y)[1], settings.float_type)
    sigma2 = self.likelihood.variance

    # Get kernel terms
    Kdiag = self.kern.Kdiag(X, full_output_cov=False)
    Kux = features.Kuf(self.feature, self.kern, X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    W = _expand_W(W1, W2)

    # compute statistics (potentially on a minibatch)
    psi0 = self._psi0(Kdiag)
    psi1 = self._psi1(Kux, W)  # K x M x N
    psi2 = self._psi2(Kux, W)  # K x M x M

    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, psi1)  # K x M x N
    a = tf.matrix_triangular_solve(
        L, tf.transpose(self.q_mu)[:, :, None])  # K x M x 1
    mean = tf.matmul(A, a, transpose_a=True)

    tmp1 = tf.matrix_triangular_solve(L, psi2)
    B = tf.matrix_triangular_solve(L, tf.transpose(tmp1, perm=[0, 2, 1]))
    tmp2 = tf.matrix_triangular_solve(L, self.q_sqrt)
    C = tf.matmul(tmp2, tmp2, transpose_b=True)

    # compute KL terms
    KL1 = self.build_prior_KL()
    KL2 = self.build_prior_assignment_KL(W1_idx, W2_idx)

    # compute log marginal bound
    bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    bound += -0.5 * tf.reduce_sum(tf.square(Y)) / sigma2
    bound += tf.reduce_sum(Y * mean) / sigma2
    bound += -0.5 * tf.reduce_sum(
        tf.matmul(a, tf.matmul(B, a), transpose_a=True)) / sigma2
    bound += -0.5 * D * (psi0 - tf.reduce_sum(tf.matrix_diag_part(B)))
    bound += -0.5 * tf.reduce_sum(tf.einsum('kmp,kpm->km', B, C))

    if self.minibatch_size is not None:
        scale = self.num_data / self.minibatch_size
        bound *= scale

    bound -= KL2
    bound -= KL1
    return bound
def _build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood. For a derivation of the terms in here, see the associated
    SGPR notebook.
    """
    Y = self.Y
    X = self.X
    idx = self.idx
    W1_idx = self.W1_idx
    W2_idx = self.W2_idx
    if W1_idx is None:
        W1_idx = idx
    if W2_idx is None:
        W2_idx = idx

    if self.minibatch_size is not None:
        W1 = tf.gather(self.W1, W1_idx)
        W1 = tf.reshape(W1, [-1, self.K1])
        W1 = normalize(W1)
        W2 = tf.gather(self.W2, W2_idx)
        W2 = tf.reshape(W2, [-1, self.K2])
        W2 = normalize(W2)
    else:
        W1 = normalize(self.W1)  # N x K1
        if W1_idx is not None:
            W1 = tf.gather(W1, W1_idx)
        W2 = normalize(self.W2)  # N x K2
        if W2_idx is not None:
            W2 = tf.gather(W2, W2_idx)

    ND = tf.cast(tf.size(Y), settings.float_type)
    D = tf.cast(tf.shape(Y)[1], settings.float_type)
    sigma2 = self.likelihood.variance

    # Get kernel terms
    Kdiag = self.kern.Kdiag(X, full_output_cov=False)
    Kux = features.Kuf(self.feature, self.kern, X)
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    W = _expand_W(W1, W2)

    # compute KL terms
    KL1 = self.build_prior_KL()
    KL2 = self.build_prior_assignment_KL(
        tf.unique(W1_idx)[0], tf.unique(W2_idx)[0])

    fmean, fvar = self._build_predict(X, full_cov=False,
                                      full_output_cov=False)
    var_exp = self.likelihood.variational_expectations(fmean, fvar, Y)

    # Scale the data-fit term up to the full dataset when using minibatches.
    if self.minibatch_size is not None:
        scale = self.num_data / self.minibatch_size
    else:
        scale = 1.0
    bound = tf.reduce_sum(W * var_exp)
    bound *= scale
    bound -= KL1 + KL2
    return bound
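# For a Gaussian likelihood, the variational expectation used above has a
# closed form (a standard identity; the NumPy function below is an
# illustrative sketch, not the library implementation):
#   E_{q(f) = N(fmean, fvar)}[log N(y | f, sigma^2)]
#     = -0.5 log(2 pi sigma^2) - ((y - fmean)^2 + fvar) / (2 sigma^2)
import numpy as np

def gaussian_variational_expectations(fmean, fvar, y, sigma2):
    """Closed-form E_{q(f)}[log p(y|f)] for a Gaussian likelihood."""
    return (-0.5 * np.log(2 * np.pi * sigma2)
            - 0.5 * ((y - fmean)**2 + fvar) / sigma2)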