def build_prior_KL(self):
    KL = tf.Variable(0, name='KL', trainable=False, dtype=float_type)
    for d in range(self.D):
        K = self.kerns[d].K(self.X)
        K_alpha = tf.matmul(K, self.q_alpha[d, :, :])
        f_mean = K_alpha + self.mean_functions[d](self.X)

        # compute the variance for each of the outputs
        I = tf.tile(tf.expand_dims(eye(self.num_data), 0),
                    [self.num_latent.value, 1, 1])
        A = I + tf.expand_dims(tf.transpose(self.q_lambda[d, :, :]), 1) * \
            tf.expand_dims(tf.transpose(self.q_lambda[d, :, :]), 2) * K
        L = tf.cholesky(A)
        Li = tf.matrix_triangular_solve(L, I)
        tmp = Li / tf.expand_dims(tf.transpose(self.q_lambda[d, :, :]), 1)
        f_var = 1. / tf.square(self.q_lambda[d, :, :]) - tf.transpose(
            tf.reduce_sum(tf.square(tmp), 1))

        # some statistics about A are used in the KL
        A_logdet = 2.0 * tf.reduce_sum(tf.log(tf.matrix_diag_part(L)))
        trAi = tf.reduce_sum(tf.square(Li))
        KL += 0.5 * (A_logdet + trAi
                     - self.num_data.value * self.num_latent.value
                     + tf.reduce_sum(K_alpha * self.q_alpha[d, :, :]))
    return KL
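# A minimal NumPy sketch (not part of the model code) checking the closed-form KL
# term accumulated above: with q(f) = N(K alpha, (K^-1 + diag(lam)^2)^-1) and
# p(f) = N(0, K), the per-output contribution is
#     0.5 * (log|A| + tr(A^-1) - N + alpha^T K alpha),  A = I + diag(lam) K diag(lam).
# All names below (npr, K, alpha, lam) are illustrative stand-ins, not symbols from
# the code above.
import numpy as np

npr = np.random.RandomState(0)
N = 5
X = npr.randn(N, 1)
K = np.exp(-0.5 * (X - X.T) ** 2) + 1e-8 * np.eye(N)   # toy RBF Gram matrix
alpha = npr.randn(N, 1)
lam = np.abs(npr.randn(N, 1))

# closed form, as accumulated in build_prior_KL
A = np.eye(N) + lam * lam.T * K
kl_closed = 0.5 * (np.linalg.slogdet(A)[1] + np.trace(np.linalg.inv(A))
                   - N + (alpha.T @ K @ alpha).item())

# direct Gaussian KL between q = N(m, S) and p = N(0, K)
S = np.linalg.inv(np.linalg.inv(K) + np.diagflat(lam ** 2))
m = K @ alpha
kl_direct = 0.5 * (np.trace(np.linalg.solve(K, S))
                   + (m.T @ np.linalg.solve(K, m)).item()
                   - N + np.linalg.slogdet(K)[1] - np.linalg.slogdet(S)[1])

assert np.allclose(kl_closed, kl_direct)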
def gauss_kl_diag(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and q_sqrt.

    q_mu is a matrix, each column contains a mean.
    q_sqrt is a matrix, each column represents the diagonal of a square-root
        matrix of the covariance of q.
    K is a positive definite matrix: the covariance of p.
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    num_latent = tf.cast(tf.shape(q_sqrt)[1], float_type)
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.size(q_sqrt), float_type)  # Constant term.
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(q_sqrt)))  # Log-det of q-cov.
    L_inv = tf.matrix_triangular_solve(L, eye(tf.shape(L)[0]), lower=True)
    K_inv = tf.matrix_triangular_solve(tf.transpose(L), L_inv, lower=False)
    KL += 0.5 * tf.reduce_sum(
        tf.expand_dims(tf.diag_part(K_inv), 1) * tf.square(q_sqrt))  # Trace term.
    return KL
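# A quick NumPy cross-check (illustrative only) of the terms summed in
# gauss_kl_diag: for a single column, KL(N(mu, diag(s^2)) || N(0, K)) should equal
#     0.5 * (mu^T K^-1 mu + tr(K^-1 diag(s^2)) - N + log|K| - sum(log s^2)).
# The names mu, s, K below are toy stand-ins, not variables from the code above.
import numpy as np

rng = np.random.RandomState(1)
N = 4
L_true = np.tril(rng.randn(N, N)) + N * np.eye(N)
K = L_true @ L_true.T                      # a positive definite prior covariance
mu = rng.randn(N, 1)                       # q_mu with a single column
s = np.abs(rng.randn(N, 1))                # q_sqrt: diagonal square-root of q's covariance

Kinv = np.linalg.inv(K)
kl_dense = 0.5 * ((mu.T @ Kinv @ mu).item()
                  + np.trace(Kinv @ np.diagflat(s ** 2))
                  - N
                  + np.linalg.slogdet(K)[1]
                  - np.sum(np.log(s ** 2)))

# term-by-term, mirroring the TensorFlow code above
L = np.linalg.cholesky(K)
alpha = np.linalg.solve(L, mu)
kl_terms = 0.5 * np.sum(alpha ** 2)                        # Mahalanobis
kl_terms += 0.5 * np.sum(np.log(np.diag(L) ** 2))          # prior log-det (num_latent = 1)
kl_terms += -0.5 * mu.size                                 # constant
kl_terms += -0.5 * np.sum(np.log(s ** 2))                  # log-det of q's covariance
kl_terms += 0.5 * np.sum(np.diag(Kinv)[:, None] * s ** 2)  # trace

assert np.allclose(kl_dense, kl_terms)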
def build_prior_KL(self):
    KL = tf.Variable(0, name='KL', trainable=False, dtype=float_type)
    for i in range(self.D):
        if self.whiten:
            if self.q_diag:
                KL += gauss_kl_white_diag(self.q_mu[i], self.q_sqrt[i])
            else:
                KL += gauss_kl_white(self.q_mu[i], self.q_sqrt[i])
        else:
            K = self.kerns[i].K(self.Zs[self.f_indices[i]]) + \
                eye(self.num_inducing[i]) * jitter_level
            if self.q_diag:
                KL += gauss_kl_diag(self.q_mu[i], self.q_sqrt[i], K)
            else:
                KL += gauss_kl(self.q_mu[i], self.q_sqrt[i], K)
    return KL
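# The whitened branch above never evaluates the kernel: whitening makes the KL
# invariant to K. A hedged NumPy sketch (single column, diagonal q, toy values
# only) checking that KL(N(L v_mu, L diag(s^2) L^T) || N(0, L L^T)) equals
# KL(N(v_mu, diag(s^2)) || N(0, I)), where L is the Cholesky factor of K.
import numpy as np

rng = np.random.RandomState(2)
N = 5
Lk = np.tril(rng.randn(N, N)) + N * np.eye(N)
K = Lk @ Lk.T                                   # toy prior covariance
v_mu = rng.randn(N, 1)                          # whitened mean (one column of q_mu)
s = np.abs(rng.randn(N, 1))                     # whitened diagonal q_sqrt

def gauss_kl_dense(m, S, P):
    """KL(N(m, S) || N(0, P)) for dense covariances."""
    Pinv = np.linalg.inv(P)
    return 0.5 * (np.trace(Pinv @ S) + (m.T @ Pinv @ m).item()
                  - len(m) + np.linalg.slogdet(P)[1] - np.linalg.slogdet(S)[1])

kl_whitened = gauss_kl_dense(v_mu, np.diagflat(s ** 2), np.eye(N))
kl_unwhitened = gauss_kl_dense(Lk @ v_mu, Lk @ np.diagflat(s ** 2) @ Lk.T, K)
assert np.allclose(kl_whitened, kl_unwhitened)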
def build_prior_KL(self):
    S = np.square(self.s) * eye(self.D)  # diagonal prior covariance v * I
    KL = gauss_kl_diag(self.q_A_mu, self.q_A_sqrt, S)
    return KL
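# Since the prior here is isotropic, N(0, v * I), the KL above collapses to an
# element-wise expression. A small illustrative check (v, mu, s_q are toy
# stand-ins, not model attributes):
#     KL = 0.5 * sum((mu^2 + s_q^2) / v - 1 + log v - log s_q^2)
import numpy as np

rng = np.random.RandomState(3)
D = 4
v = 2.5                                   # prior variance, playing the role of np.square(self.s)
mu = rng.randn(D, 2)                      # toy q_A_mu (two columns)
s_q = np.abs(rng.randn(D, 2))             # toy q_A_sqrt (diagonal square roots)

kl_elementwise = 0.5 * np.sum((mu ** 2 + s_q ** 2) / v - 1.0
                              + np.log(v) - np.log(s_q ** 2))

# mirror the gauss_kl_diag accumulation with K = v * I
K = v * np.eye(D)
L = np.linalg.cholesky(K)
alpha = np.linalg.solve(L, mu)
kl_terms = 0.5 * np.sum(alpha ** 2)
kl_terms += mu.shape[1] * 0.5 * np.sum(np.log(np.diag(L) ** 2))
kl_terms += -0.5 * mu.size
kl_terms += -0.5 * np.sum(np.log(s_q ** 2))
kl_terms += 0.5 * np.sum(np.diag(np.linalg.inv(K))[:, None] * s_q ** 2)
assert np.allclose(kl_elementwise, kl_terms)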
def conditional(Xnew, X, kern, f, full_cov=False, q_sqrt=None, whiten=False):
    """
    Given F, representing the GP at the points X, produce the mean and
    (co-)variance of the GP at the points Xnew.

    Additionally, there may be Gaussian uncertainty about F as represented by
    q_sqrt. In this case `f` represents the mean of the distribution and
    q_sqrt the square-root of the covariance.

    Additionally, the GP may have been centered (whitened) so that

        p(v) = N(0, I)
        f = L v

    thus p(f) = N(0, LL^T) = N(0, K). In this case `f` represents the values
    taken by v.

    The method can either return the diagonals of the covariance matrix for
    each output (full_cov=False, the default) or the full covariance matrix
    (full_cov=True).

    We assume K independent GPs, represented by the columns of f (and the
    last dimension of q_sqrt).

     - Xnew is a data matrix, size N x D
     - X are data points, size M x D
     - kern is a GPflow kernel
     - f is a data matrix, M x K, representing the function values at X, for
       K functions.
     - q_sqrt (optional) is a matrix of standard-deviations or Cholesky
       matrices, size M x K or M x M x K
     - whiten (optional) is a boolean: whether to whiten the representation
       as described above.

    These functions are now considered deprecated, subsumed into this one:
        gp_predict
        gaussian_gp_predict
        gp_predict_whitened
        gaussian_gp_predict_whitened
    """

    # compute kernel stuff
    num_data = tf.shape(X)[0]
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * jitter_level
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = kern.K(Xnew) - tf.matmul(A, A, transpose_a=True)
        shape = tf.stack([tf.shape(f)[1], 1, 1])
    else:
        fvar = kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        shape = tf.stack([tf.shape(f)[1], 1])
    fvar = tf.tile(tf.expand_dims(fvar, 0), shape)  # D x N x N or D x N

    # another backsubstitution in the unwhitened case
    if not whiten:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(tf.transpose(A), f)

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # D x M x N
        elif q_sqrt.get_shape().ndims == 3:
            L = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # D x M x M
            A_tiled = tf.tile(tf.expand_dims(A, 0),
                              tf.stack([tf.shape(f)[1], 1, 1]))
            LTA = tf.matmul(L, A_tiled, transpose_a=True)  # D x M x N
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # D x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # D x N
    fvar = tf.transpose(fvar)  # N x D or N x N x D

    return fmean, fvar
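# A hedged NumPy sketch (not library code) of the q_sqrt=None, whiten=False path
# above: the pair of triangular solves should reproduce the textbook conditional
#     mean = Kmn^T Kmm^-1 f,    var = diag(Knn) - diag(Kmn^T Kmm^-1 Kmn).
# The toy RBF kernel and data below are purely illustrative.
import numpy as np

rng = np.random.RandomState(4)
X = rng.randn(6, 1)      # M x D training inputs
Xnew = rng.randn(3, 1)   # N x D test inputs
f = rng.randn(6, 2)      # M x K function values (K = 2 independent GPs)

def rbf(A, B):
    return np.exp(-0.5 * (A - B.T) ** 2)

Kmm = rbf(X, X) + 1e-6 * np.eye(len(X))
Kmn = rbf(X, Xnew)
Knn_diag = np.ones(len(Xnew))            # unit-variance kernel, so diag(Knn) = 1

# the same sequence of solves as in conditional()
Lm = np.linalg.cholesky(Kmm)
A = np.linalg.solve(Lm, Kmn)                         # Lm^-1 Kmn
fvar = Knn_diag - np.sum(A ** 2, 0)                  # conditioning reduces the variance
A = np.linalg.solve(Lm.T, A)                         # unwhitened back-substitution
fmean = A.T @ f

# reference: direct dense formulas
fmean_ref = Kmn.T @ np.linalg.solve(Kmm, f)
fvar_ref = Knn_diag - np.diag(Kmn.T @ np.linalg.solve(Kmm, Kmn))
assert np.allclose(fmean, fmean_ref) and np.allclose(fvar, fvar_ref)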