def build_likelihood(self):
    r"""
    Build the variational lower bound on the likelihood.

    q_alpha and q_lambda are the variational parameters, size N x R.
    The bound is

        E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F) ]

    with

        q(f) = N(f | K alpha, [K^-1 + diag(square(lambda))]^-1).

    Returns the summed variational expectations of the likelihood minus
    the KL term.
    """
    K = self.kern.K(self.X)
    f_mean = tf.matmul(K, self.q_alpha) + self.mean_function(self.X)

    # One pass per column of q_lambda: collect the diagonal of the posterior
    # variance and accumulate the log-determinant and trace pieces of the KL.
    column_vars = []
    logdet_total = tf.zeros((1,), tf.float64)
    trace_total = tf.zeros((1,), tf.float64)
    for col in range(self.num_latent):
        lam = self.q_lambda[:, col]
        lam_column = tf.expand_dims(lam, 1)
        # A = I + K * lam * lam^T, all products elementwise.
        A = eye(self.num_data) + K * lam_column * tf.transpose(lam_column)
        chol_A = tf.cholesky(A)
        chol_A_inv = tf.user_ops.triangular_solve(
            chol_A, eye(self.num_data), 'lower')
        scaled_inv = chol_A_inv / lam
        # Diagonal of diag(lam**-2) - scaled_inv^T scaled_inv.
        column_vars.append(
            1. / tf.square(lam) - tf.reduce_sum(tf.square(scaled_inv), 0))
        logdet_total += 2 * tf.reduce_sum(
            tf.log(tf.user_ops.get_diag(chol_A)))
        # Sum of squared entries of chol_A^-1, i.e. trace(A^-1).
        trace_total += tf.reduce_sum(tf.square(chol_A_inv))

    f_var = tf.transpose(tf.pack(column_vars))
    KL = 0.5 * (logdet_total + trace_total
                - self.num_data * self.num_latent
                + tf.reduce_sum(f_mean * self.q_alpha))
    expectations = self.likelihood.variational_expectations(
        f_mean, f_var, self.Y)
    return tf.reduce_sum(expectations) - KL
def build_likelihood(self):
    r"""
    Build the variational lower bound on the likelihood.

    q_alpha and q_lambda are the variational parameters, size N x R.
    The bound is:

        E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F) ]

    with

        q(f) = N(f | K alpha, [K^-1 + diag(square(lambda))]^-1).

    Returns the summed variational expectations of the likelihood minus
    the KL term.
    """
    K = self.kern.K(self.X)
    f_mean = tf.matmul(K, self.q_alpha) + self.mean_function(self.X)

    # For each column of q_lambda, find the diagonal of the variance and the
    # log-determinant / trace contributions to the KL.
    variances = []
    A_logdet = tf.zeros((1,), tf.float64)
    trace_A_inv = tf.zeros((1,), tf.float64)
    for latent in range(self.num_latent):
        lam = self.q_lambda[:, latent]
        lam_column = tf.expand_dims(lam, 1)
        # A = I + K * lam * lam^T, all products elementwise.
        A = eye(self.num_data) + K * lam_column * tf.transpose(lam_column)
        L_A = tf.cholesky(A)
        L_A_inv = tf.matrix_triangular_solve(
            L_A, eye(self.num_data), lower=True)
        L_A_inv_scaled = L_A_inv / lam
        # Diagonal of diag(lam**-2) - L_A_inv_scaled^T L_A_inv_scaled.
        variances.append(
            1. / tf.square(lam)
            - tf.reduce_sum(tf.square(L_A_inv_scaled), 0))
        A_logdet += 2 * tf.reduce_sum(tf.log(tf.user_ops.get_diag(L_A)))
        trace_A_inv += tf.reduce_sum(tf.square(L_A_inv))

    f_var = tf.transpose(tf.pack(variances))
    KL = 0.5 * (A_logdet + trace_A_inv
                - self.num_data * self.num_latent
                + tf.reduce_sum(f_mean * self.q_alpha))
    data_fit = tf.reduce_sum(
        self.likelihood.variational_expectations(f_mean, f_var, self.Y))
    return data_fit - KL
def build_prior_KL(self):
    """
    Return the KL summed over all latent functions.

    Latent functions are visited group by group (one kernel per group, taken
    from kern_list) and, when self.tsk is set, followed by one latent
    function per task (kernels taken from tskern_list).
    """
    def _latent_kl(lat_id, kernels, kern_idx):
        # KL of a single latent function; the kernel matrix is only built in
        # the non-whitened case.
        if self.whiten_list[lat_id]:
            kl_fn = (kullback_leiblers.gauss_kl_white_diag
                     if self.q_diag_list[lat_id]
                     else kullback_leiblers.gauss_kl_white)
            return kl_fn(self.q_mu_list[lat_id], self.q_sqrt_list[lat_id],
                         self.dim)
        # Jitter on the diagonal before the KL helpers use the matrix.
        K = (kernels[kern_idx].K(self.Z[lat_id])
             + eye(self.num_inducing_list[lat_id]) * 1e-6)
        kl_fn = (kullback_leiblers.gauss_kl_diag
                 if self.q_diag_list[lat_id]
                 else kullback_leiblers.gauss_kl)
        return kl_fn(self.q_mu_list[lat_id], self.q_sqrt_list[lat_id], K,
                     self.dim)

    KL = 0
    for group in np.arange(self.rank):
        for member in np.arange(self.num_latent_list[group]):
            # Flat id: latent functions of earlier groups, plus the offset
            # within this group.
            lat_id = np.sum(self.num_latent_list[:group],
                            dtype=np.int64) + member
            KL += _latent_kl(lat_id, self.kern_list, group)

    if self.tsk:
        for task_id in np.arange(self.num_tasks):
            # Task-specific latent functions come after all grouped ones.
            lat_id = np.sum(self.num_latent_list, dtype=np.int64) + task_id
            KL += _latent_kl(lat_id, self.tskern_list, task_id)
    return KL
def build_prior_KL(self):
    """
    Return the KL summed over all latent functions.

    The first loop covers the grouped latent functions (group q uses
    kern_list[q]); when self.tsk is set, a second loop adds one latent
    function per task (task d uses tskern_list[d]).
    """
    def _single_kl(lat_id, kern_seq, kern_pos):
        # KL for the latent function with flat index lat_id.
        mu_of = self.q_mu_list
        sqrt_of = self.q_sqrt_list
        if self.whiten_list[lat_id]:
            if self.q_diag_list[lat_id]:
                return kullback_leiblers.gauss_kl_white_diag(
                    mu_of[lat_id], sqrt_of[lat_id], self.dim)
            return kullback_leiblers.gauss_kl_white(
                mu_of[lat_id], sqrt_of[lat_id], self.dim)
        # Non-whitened: build the kernel matrix with diagonal jitter.
        K = (kern_seq[kern_pos].K(self.Z[lat_id])
             + eye(self.num_inducing_list[lat_id]) * 1e-6)
        if self.q_diag_list[lat_id]:
            return kullback_leiblers.gauss_kl_diag(
                mu_of[lat_id], sqrt_of[lat_id], K, self.dim)
        return kullback_leiblers.gauss_kl(
            mu_of[lat_id], sqrt_of[lat_id], K, self.dim)

    KL = 0
    for q in np.arange(self.rank):  # q is the group id
        for i in np.arange(self.num_latent_list[q]):
            # Flat id of the i-th latent function in group q.
            lat_id = np.sum(self.num_latent_list[:q], dtype=np.int64) + i
            KL += _single_kl(lat_id, self.kern_list, q)

    if self.tsk:
        for d in np.arange(self.num_tasks):
            # Task latent functions are indexed after all grouped ones.
            lat_id = np.sum(self.num_latent_list, dtype=np.int64) + d
            KL += _single_kl(lat_id, self.tskern_list, d)
    return KL
def build_predict(self, Xnew, full_cov=False):
    """
    Compute the mean and variance of the latent function at the new points
    Xnew.

    This is very similar to the SGPR prediction, for which there are notes
    in the SGPR notebook.

    With full_cov=True the covariance matrix over Xnew is returned, tiled
    along a trailing axis of length tf.shape(self.Y)[1]; otherwise only its
    diagonal is returned, tiled along a second axis of that length.
    """
    num_inducing = tf.shape(self.Z)[0]
    # Only psi1 and psi2 are needed for prediction.
    _, psi1, psi2 = ke.build_psi_stats(
        self.Z, self.kern, self.X_mean, self.X_var)
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    Kus = self.kern.K(self.Z, Xnew)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    L_uu = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L_uu, tf.transpose(psi1), lower=True) / sigma
    # AAT = L_uu^-1 psi2 L_uu^-T / sigma2, obtained by two triangular solves.
    half_solved = tf.matrix_triangular_solve(L_uu, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(
        L_uu, tf.transpose(half_solved), lower=True) / sigma2
    B = AAT + eye(num_inducing)
    L_B = tf.cholesky(B)
    c = tf.matrix_triangular_solve(
        L_B, tf.matmul(A, self.Y), lower=True) / sigma

    Luu_inv_Kus = tf.matrix_triangular_solve(L_uu, Kus, lower=True)
    LB_inv_Luu_inv_Kus = tf.matrix_triangular_solve(
        L_B, Luu_inv_Kus, lower=True)
    mean = tf.matmul(tf.transpose(LB_inv_Luu_inv_Kus), c)

    if full_cov:
        var = (self.kern.K(Xnew)
               + tf.matmul(tf.transpose(LB_inv_Luu_inv_Kus),
                           LB_inv_Luu_inv_Kus)
               - tf.matmul(tf.transpose(Luu_inv_Kus), Luu_inv_Kus))
        multiples = tf.pack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), multiples)
    else:
        var = (self.kern.Kdiag(Xnew)
               + tf.reduce_sum(tf.square(LB_inv_Luu_inv_Kus), 0)
               - tf.reduce_sum(tf.square(Luu_inv_Kus), 0))
        multiples = tf.pack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), multiples)
    return mean + self.mean_function(Xnew), var
def build_prior_KL(self):
    """
    Return the prior KL summed over the input dimensions.

    For each column index d of self.X the attributes q_mu_<d>, q_sqrt_<d>
    and Z_<d> are looked up by name and the matching KL helper is chosen
    from self.whiten and self.q_diag.  In the non-whitened case the kernel
    self.kern[d] is evaluated at Z_<d> with 1e-6 jitter on the diagonal.

    Returns None when self.X has no columns.
    """
    KL = None
    # range (not xrange) keeps this importable on Python 3 as well.
    for d in range(self.X.shape[1]):
        q_mu_d = getattr(self, 'q_mu_%d' % d)
        q_sqrt_d = getattr(self, 'q_sqrt_%d' % d)
        Z_d = getattr(self, 'Z_%d' % d)

        if self.whiten:
            if self.q_diag:
                KL_d = kullback_leiblers.gauss_kl_white_diag(
                    q_mu_d, q_sqrt_d, self.num_latent)
            else:
                KL_d = kullback_leiblers.gauss_kl_white(
                    q_mu_d, q_sqrt_d, self.num_latent)
        else:
            K = self.kern[d].K(Z_d) + eye(self.num_inducing[d]) * 1e-6
            if self.q_diag:
                KL_d = kullback_leiblers.gauss_kl_diag(
                    q_mu_d, q_sqrt_d, K, self.num_latent)
            else:
                KL_d = kullback_leiblers.gauss_kl(
                    q_mu_d, q_sqrt_d, K, self.num_latent)

        # Start from the first term rather than 0 so the accumulator has
        # whatever type the KL helpers return.
        if KL is None:
            KL = KL_d
        else:
            KL += KL_d
    return KL
def build_predict(self, Xnew, full_cov=False):
    """
    Compute p(F* | Y), where F* are points on the GP at Xnew and Y are
    noisy observations at X.

    Xnew is a data matrix: the points at which we want to predict.

    With full_cov=True the covariance over Xnew is returned, tiled along a
    trailing axis of length tf.shape(self.Y)[1]; otherwise the marginal
    variances are returned, tiled along a second axis of that length.
    """
    K_cross = self.kern.K(self.X, Xnew)
    K_noisy = (self.kern.K(self.X)
               + eye(tf.shape(self.X)[0]) * self.likelihood.variance)
    chol = tf.cholesky(K_noisy)
    A = tf.matrix_triangular_solve(chol, K_cross, lower=True)
    # lower=True is the default of matrix_triangular_solve; spelled out here.
    V = tf.matrix_triangular_solve(
        chol, self.Y - self.mean_function(self.X), lower=True)
    fmean = tf.matmul(tf.transpose(A), V) + self.mean_function(Xnew)

    if not full_cov:
        fvar = self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.reshape(fvar, (-1, 1)), [1, tf.shape(self.Y)[1]])
        return fmean, fvar

    fvar = self.kern.K(Xnew) - tf.matmul(tf.transpose(A), A)
    multiples = tf.pack([1, 1, tf.shape(self.Y)[1]])
    fvar = tf.tile(tf.expand_dims(fvar, 2), multiples)
    return fmean, fvar
def gauss_kl_diag(q_mu, q_sqrt, K, num_latent):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    num_latent independent distributions are assumed, given by the columns
    of q_mu and q_sqrt.

    q_mu is a matrix, each column contains a mean.
    q_sqrt is a matrix, each column holds the diagonal of a square-root
        matrix of the covariance of q.
    K is a positive definite matrix: the covariance of p.
    num_latent is an integer: the number of independent distributions
        (equal to the number of columns of q_mu and q_sqrt).
    """
    chol_K = tf.cholesky(K)
    whitened_mean = tf.matrix_triangular_solve(chol_K, q_mu, lower=True)

    # Mahalanobis term.
    mahalanobis = 0.5 * tf.reduce_sum(tf.square(whitened_mean))
    # Prior log-det term, once per latent distribution.
    prior_logdet = num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(chol_K))))
    # Constant: -0.5 per entry of q_sqrt's first axis and per latent.
    constant = -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)
    # Log-det of the covariance of q.
    q_logdet = -0.5 * tf.reduce_sum(tf.log(tf.square(q_sqrt)))

    # Trace term needs diag(K^-1); build K^-1 = L^-T L^-1 by two solves.
    chol_K_inv = tf.matrix_triangular_solve(
        chol_K, eye(tf.shape(chol_K)[0]), lower=True)
    K_inv = tf.matrix_triangular_solve(
        tf.transpose(chol_K), chol_K_inv, lower=False)
    trace = 0.5 * tf.reduce_sum(
        tf.expand_dims(tf.diag_part(K_inv), 1) * tf.square(q_sqrt))

    return mahalanobis + prior_logdet + constant + q_logdet + trace
def build_predict(self, Xnew, full_cov=False):
    """
    Compute p(F* | Y), where F* are points on the GP at Xnew and Y are
    noisy observations at X.

    Xnew is a data matrix: the points at which we want to predict.

    With full_cov=True the covariance over Xnew is returned, tiled along a
    trailing axis of length tf.shape(self.Y)[1]; otherwise the marginal
    variances are returned, tiled self.Y.shape[1] times along a second axis.
    """
    K_cross = self.kern.K(self.X, Xnew)
    K_noisy = self.kern.K(self.X) + eye(self.num_data) * self.likelihood.variance
    chol = tf.cholesky(K_noisy)
    A = tf.matrix_triangular_solve(chol, K_cross, lower=True)
    residual = self.Y - self.mean_function(self.X)
    V = tf.matrix_triangular_solve(chol, residual, lower=True)
    fmean = tf.matmul(tf.transpose(A), V) + self.mean_function(Xnew)

    if not full_cov:
        fvar = self.kern.Kdiag(Xnew) - tf.reduce_sum(
            tf.square(A), reduction_indices=0)
        return fmean, tf.tile(tf.reshape(fvar, (-1, 1)),
                              [1, self.Y.shape[1]])

    fvar = self.kern.K(Xnew) - tf.matmul(tf.transpose(A), A)
    multiples = tf.pack([1, 1, tf.shape(self.Y)[1]])
    return fmean, tf.tile(tf.expand_dims(fvar, 2), multiples)
def build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.

    The bound is assembled from the psi statistics of the kernel under the
    Gaussian q(X) described by X_mean / X_var, minus KL[q(x) || p(x)]
    against the Gaussian prior described by X_prior_mean / X_prior_var.
    """
    num_inducing = tf.shape(self.Z)[0]
    psi0, psi1, psi2 = ke.build_psi_stats(
        self.Z, self.kern, self.X_mean, self.X_var)
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    L_uu = tf.cholesky(Kuu)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    # Intermediate matrices.
    A = tf.matrix_triangular_solve(L_uu, tf.transpose(psi1), lower=True) / sigma
    half_solved = tf.matrix_triangular_solve(L_uu, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(
        L_uu, tf.transpose(half_solved), lower=True) / sigma2
    B = AAT + eye(num_inducing)
    L_B = tf.cholesky(B)
    log_det_B = 2. * tf.reduce_sum(tf.log(tf.diag_part(L_B)))
    c = tf.matrix_triangular_solve(
        L_B, tf.matmul(A, self.Y), lower=True) / sigma

    # KL[q(x) || p(x)] between diagonal Gaussians.
    NQ = tf.cast(tf.size(self.X_mean), tf.float64)
    D = tf.cast(tf.shape(self.Y)[1], tf.float64)
    kl_q_logvar = -0.5 * tf.reduce_sum(tf.log(self.X_var))
    kl_prior_logvar = 0.5 * tf.reduce_sum(tf.log(self.X_prior_var))
    kl_quadratic = 0.5 * tf.reduce_sum(
        (tf.square(self.X_mean - self.X_prior_mean) + self.X_var)
        / self.X_prior_var)
    KL = kl_q_logvar + kl_prior_logvar - 0.5 * NQ + kl_quadratic

    # Log marginal bound, term by term.
    ND = tf.cast(tf.size(self.Y), tf.float64)
    normaliser = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    logdet_term = -0.5 * D * log_det_B
    data_term = -0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2
    projection_term = 0.5 * tf.reduce_sum(tf.square(c))
    trace_term = -0.5 * D * (tf.reduce_sum(psi0) / sigma2
                             - tf.reduce_sum(tf.diag_part(AAT)))
    return (normaliser + logdet_term + data_term + projection_term
            + trace_term - KL)
def build_likelihood(self):
    r"""
    Construct a tensorflow function to compute the likelihood.

        \log p(Y, V | theta).

    The density is evaluated by multivariate_normal from the data Y, the
    mean function at X and the Cholesky factor of the kernel matrix plus
    likelihood variance on the diagonal.
    """
    prior_cov = self.kern.K(self.X)
    noisy_cov = prior_cov + eye(tf.shape(self.X)[0]) * self.likelihood.variance
    chol = tf.cholesky(noisy_cov)
    mean = self.mean_function(self.X)
    return multivariate_normal(self.Y, mean, chol)
def build_likelihood(self):
    r"""
    Construct a tensorflow function to compute the likelihood of a general
    GP model.

        \log p(Y, V | theta).

    The density is evaluated by multivariate_normal from the data Y, the
    mean function at X and the Cholesky factor of the kernel matrix plus
    likelihood variance on the diagonal.
    """
    prior_cov = self.kern.K(self.X)
    noisy_cov = prior_cov + eye(self.num_data) * self.likelihood.variance
    chol = tf.cholesky(noisy_cov)
    mean = self.mean_function(self.X)
    return multivariate_normal(self.Y, mean, chol)
def build_prior_KL(self):
    """
    Return the KL term for (q_mu, q_sqrt), picking the helper that matches
    the whiten / q_diag settings.

    In the non-whitened case the kernel matrix at the inducing inputs Z is
    built with 1e-6 jitter on the diagonal and passed to the helper.
    """
    if self.whiten:
        if self.q_diag:
            return kullback_leiblers.gauss_kl_white_diag(
                self.q_mu, self.q_sqrt, self.num_latent)
        return kullback_leiblers.gauss_kl_white(
            self.q_mu, self.q_sqrt, self.num_latent)

    K = self.kern.K(self.Z) + eye(self.num_inducing) * 1e-6
    if self.q_diag:
        return kullback_leiblers.gauss_kl_diag(
            self.q_mu, self.q_sqrt, K, self.num_latent)
    return kullback_leiblers.gauss_kl(
        self.q_mu, self.q_sqrt, K, self.num_latent)
def predict_f_samples(self, Xnew, num_samples):
    """
    Produce samples from the posterior latent function(s) at the points
    Xnew.

    For every latent function, the full predictive covariance (plus 1e-6
    jitter) is Cholesky-factorised and used to colour num_samples standard
    normal draws, which are then shifted by the predictive mean.  The
    per-latent results are packed and transposed before being returned.
    """
    mu, var = self.build_predict(Xnew, full_cov=True)
    jitter = tf_hacks.eye(tf.shape(mu)[0]) * 1e-6

    draws = []
    for latent in range(self.num_latent):
        chol = tf.cholesky(var[:, :, latent] + jitter)
        noise_shape = tf.pack([tf.shape(chol)[0], num_samples])
        white_noise = tf.random_normal(noise_shape, dtype=tf.float64)
        # Keep the mean as a column so it broadcasts across the samples.
        mean_column = mu[:, latent:latent + 1]
        draws.append(mean_column + tf.matmul(chol, white_noise))
    return tf.transpose(tf.pack(draws))
def build_prior_KL(self):
    """
    Return the KL term for (q_mu, q_sqrt), selected by the whiten and
    q_diag flags.

    Whitened variants take only (q_mu, q_sqrt, num_latent); the others also
    receive the kernel matrix at Z, with 1e-6 jitter on the diagonal.
    """
    if self.whiten:
        kl_fn = (kullback_leiblers.gauss_kl_white_diag if self.q_diag
                 else kullback_leiblers.gauss_kl_white)
        return kl_fn(self.q_mu, self.q_sqrt, self.num_latent)

    K = self.kern.K(self.Z) + eye(self.num_inducing) * 1e-6
    kl_fn = (kullback_leiblers.gauss_kl_diag if self.q_diag
             else kullback_leiblers.gauss_kl)
    return kl_fn(self.q_mu, self.q_sqrt, K, self.num_latent)
def build_likelihood(self):
    r"""
    Construct a tensorflow function to compute the likelihood of a general
    GP model.

        \log p(Y, V | theta).

    The graph is grouped into the name scopes 'kernel', 'mean_function' and
    'mvn_density'; an image summary named 'k' of the (float32-cast) kernel
    matrix is also registered.
    """
    with tf.name_scope('kernel'):
        noisy_cov = (self.kern.K(self.X)
                     + eye(self.num_data) * self.likelihood.variance)
        # image_summary is given a 4-D tensor: add a trailing and a leading
        # axis of length one around the matrix.
        cov_as_image = tf.expand_dims(
            tf.expand_dims(tf.cast(noisy_cov, tf.float32), 2), 0)
        _ = tf.image_summary('k', cov_as_image)
        # NOTE(review): the Cholesky is kept under the 'kernel' scope here;
        # its scope only affects op naming -- confirm the intended grouping.
        chol = tf.cholesky(noisy_cov)
    with tf.name_scope('mean_function'):
        mean = self.mean_function(self.X)
    with tf.name_scope('mvn_density'):
        log_lik = multivariate_normal(self.Y, mean, chol)
    return log_lik
def build_predict(self, Xnew):
    """
    Compute p(F* | Y), where F* are points on the GP at Xnew and Y are
    noisy observations at X.

    Xnew is a data matrix: the points at which we want to predict.

    Returns the predictive mean and the marginal variances; the variances
    are reshaped to a column and tiled self.Y.shape[1] times.
    """
    prior_var = self.kern.Kdiag(Xnew)
    K_cross = self.kern.K(self.X, Xnew)
    K_noisy = self.kern.K(self.X) + eye(self.num_data) * self.likelihood.variance
    chol = tf.cholesky(K_noisy)

    A = tf.user_ops.triangular_solve(chol, K_cross, 'lower')
    residual = self.Y - self.mean_function(self.X)
    V = tf.user_ops.triangular_solve(chol, residual, 'lower')

    fmean = tf.matmul(tf.transpose(A), V) + self.mean_function(Xnew)
    fvar = prior_var - tf.reduce_sum(tf.square(A), reduction_indices=0)
    fvar_tiled = tf.tile(tf.reshape(fvar, (-1, 1)), [1, self.Y.shape[1]])
    return fmean, fvar_tiled
def gp_predict(Xnew, X, kern, F):
    """
    Given F, representing the GP at the points X, produce the mean and
    variance of the GP at the points Xnew.

    K independent GPs are assumed, represented by the columns of F.  This
    function computes the Gaussian conditional

        p(F* | F)

    Xnew is a data matrix, size N x D
    X are inducing points, size M x D
    F are function values, size M x K

    See also:
        gp_predict_whitened -- where F is rotated into V (F = LV)
        gaussian_gp_predict -- similar, but with uncertainty in F
    """
    # Kernel quantities.
    num_points = tf.shape(X)[0]
    prior_var = kern.Kdiag(Xnew)
    K_cross = kern.K(X, Xnew)
    K_xx = kern.K(X) + eye(num_points) * 1e-6
    chol = tf.cholesky(K_xx)

    # This is O(N M^2).
    half_solved = tf.user_ops.triangular_solve(chol, K_cross, 'lower')
    # K_xx^{-1} K_cross, from the second (transposed) solve.
    projection = tf.user_ops.triangular_solve(
        tf.transpose(chol), half_solved, 'upper')

    # Mean and variance of q(f*).
    fmean = tf.matmul(tf.transpose(projection), F)
    fvar = tf.expand_dims(
        prior_var - tf.reduce_sum(tf.square(half_solved), 0), 1)
    return fmean, fvar
def gp_predict(Xnew, X, kern, F):
    """
    Given F, representing the GP at the points X, produce the mean and
    variance of the GP at the points Xnew.

    K independent GPs are assumed, represented by the columns of F.  This
    function computes the Gaussian conditional

        p(F* | F)

    Xnew is a data matrix, size N x D
    X are inducing points, size M x D
    F are function values, size M x K

    See also:
        gp_predict_whitened -- where F is rotated into V (F = LV)
        gaussian_gp_predict -- similar, but with uncertainty in F
    """
    # Kernel quantities.
    num_points = tf.shape(X)[0]
    prior_var = kern.Kdiag(Xnew)
    K_cross = kern.K(X, Xnew)
    K_xx = kern.K(X) + eye(num_points) * 1e-6
    chol = tf.cholesky(K_xx)

    # This is O(N M^2).
    half_solved = tf.user_ops.triangular_solve(chol, K_cross, 'lower')
    # K_xx^{-1} K_cross, from the second (transposed) solve.
    projection = tf.user_ops.triangular_solve(
        tf.transpose(chol), half_solved, 'upper')

    # Mean and variance of q(f*).
    fmean = tf.matmul(tf.transpose(projection), F)
    explained = tf.reduce_sum(tf.square(half_solved), 0)
    fvar = tf.expand_dims(prior_var - explained, 1)
    return fmean, fvar
def K(self, X, X2=None):
    """
    Return the covariance matrix of this kernel.

    With a single input X the result is self.variance times the identity of
    size tf.shape(X)[0]; when a second input X2 is given, the result is a
    float64 matrix of zeros with one row per row of X and one column per
    row of X2.
    """
    if X2 is not None:
        cross_shape = tf.pack([tf.shape(X)[0], tf.shape(X2)[0]])
        return tf.zeros(cross_shape, tf.float64)
    return self.variance * eye(tf.shape(X)[0])
def gaussian_gp_predict_whitened(Xnew, X, kern, q_mu, q_sqrt, num_columns):
    r"""
    Given an (approximate) posterior (via q_mu, q_sqrt) to the GP at the
    points X, produce the mean and variance of the GP at the points Xnew.

    Additionally, the GP has been centered (whitened) so that

        p(v) = N( 0, I)
        f = L v

    thus

        p(f) = N(0, LL^T) = N(0, K).

    K independent GPs are assumed, represented by the columns of q_mu (and
    the last axis of q_sqrt).  q_mu and q_sqrt are variational posteriors
    for v, so

        q(v[:,i]) = N( q_mu[:,i], diag(q_sqrt[:,i]**2) )
        q(f[:,i]) = N( L q_mu[:,i], L diag(q_sqrt**2) L^T )
    or
        q(f[:,i]) = N( L q_mu, L [W W^T] L^T )

    where W is the lower triangle of q_sqrt[:,:,i].

    This function computes the Gaussian integral

        q(f*) = \int p(f*|(f=Lv)) q(v) df.

    Xnew is a data matrix, size N x D
    X are data points, size M x D
    q_mu are variational means, size M x K
    q_sqrt are variational standard-deviations or Cholesky matrices,
        size M x K or M x M x K

    Note (and TODO): at the moment, num_columns only gets used for the
    q_sqrt.ndim==3 case, where it gives the value of q_sqrt.shape()[2].  We
    need to find a way to get this from the tf graph.

    Raises ValueError if q_sqrt is neither 2- nor 3-dimensional.

    See also:
        gp_predict_whitened -- where there is no uncertainty in V
        gaussian_gp_predict -- same without the whitening
    """
    # Kernel quantities.
    num_data = tf.shape(X)[0]
    Kdiag = kern.Kdiag(Xnew)
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * 1e-6
    Lm = tf.cholesky(Kmm)

    # This is O(N M^2).
    A = tf.user_ops.triangular_solve(Lm, Kmn, 'lower')

    # Mean of q(f*).
    fmean = tf.matmul(tf.transpose(A), q_mu)

    ndims = q_sqrt.get_shape().ndims
    if ndims == 2:
        # Diagonal form for q(v).  tf.square (not np.square) keeps the
        # computation inside the TensorFlow graph.
        q_var = tf.square(q_sqrt)
        # In index notation:
        #   Kdiag[:, None] + sum_m A.T[:, m, None]**2 * (q_var[None, m, :] - 1)
        fvar = tf.reshape(Kdiag, (-1, 1)) + tf.reduce_sum(
            tf.expand_dims(tf.square(tf.transpose(A)), 2)
            * (tf.expand_dims(q_var, 0) - 1.0), 1)
        return fmean, fvar
    elif ndims == 3:
        # Cholesky form for q(v).
        fvar = Kdiag - tf.reduce_sum(tf.square(A), 0)
        projected_var = []
        for d in range(num_columns):
            L = tf.user_ops.triangle(q_sqrt[:, :, d], 'lower')
            LTA = tf.matmul(tf.transpose(L), A)
            projected_var.append(fvar + tf.reduce_sum(tf.square(LTA), 0))
        fvar = tf.transpose(tf.pack(projected_var))
        return fmean, fvar
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Bad dimension for q_sqrt: %s" % str(ndims))
def gaussian_gp_predict(Xnew, X, kern, q_mu, q_sqrt, num_columns):
    r"""
    Given an (approximate) posterior (via q_mu, q_sqrt) to the GP at the
    points X, produce the mean and variance of the GP at the points Xnew.

    K independent GPs are assumed, represented by the columns of q_mu (and
    the last axis of q_sqrt).  q_mu and q_sqrt are variational posteriors
    for f, so

        q(f[:,i]) = N( q_mu[:,i], diag(q_sqrt[:,i]**2) )
    or
        q(f[:,i]) = N( q_mu, W W^T )

    where W is the lower triangle of q_sqrt[:,:,i].

    This function computes the Gaussian integral

        q(f*) = \int p(f*|f) q(f) df.

    Xnew is a data matrix, size N x D
    X are inducing points, size M x D
    q_mu are variational means, size M x K
    q_sqrt are variational standard-deviations or Cholesky matrices,
        size M x K or M x M x K
    num_columns is the number of columns in q_mu.

    Note (and TODO): at the moment, num_columns only gets used for the
    q_sqrt.ndim==3 case, where it gives the value of q_sqrt.shape()[2].  We
    need to find a way to get this from the tf graph.

    Raises ValueError if q_sqrt is neither 2- nor 3-dimensional.

    See also:
        gp_predict -- where there is no uncertainty in F
        gaussian_gp_predict_whitened -- the same, but with whitening
            (centering) the F variables
    """
    # Kernel quantities.
    num_data = tf.shape(X)[0]
    Kdiag = kern.Kdiag(Xnew)
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * 1e-6
    Lm = tf.cholesky(Kmm)

    # This is O(N M^2).
    A = tf.user_ops.triangular_solve(Lm, Kmn, 'lower')
    # B is Kmm^{-1} Kmn.
    B = tf.user_ops.triangular_solve(tf.transpose(Lm), A, 'upper')

    # Mean and variance of q(f*).
    fmean = tf.matmul(tf.transpose(B), q_mu)
    fvar = Kdiag - tf.reduce_sum(tf.square(A), 0)
    fvar = tf.expand_dims(fvar, 1)

    ndims = q_sqrt.get_shape().ndims
    if ndims == 2:
        # Diagonal form for q(f).
        fvar = fvar + tf.reduce_sum(
            tf.square(tf.expand_dims(tf.transpose(B), 2)
                      * tf.expand_dims(q_sqrt, 0)), 1)
    elif ndims == 3:
        # Cholesky form for q(f).
        projected_var = []
        for d in range(num_columns):
            L = tf.user_ops.triangle(q_sqrt[:, :, d], 'lower')
            LTB = tf.matmul(tf.transpose(L), B)
            projected_var.append(tf.reduce_sum(tf.square(LTB), 0))
        fvar = fvar + tf.transpose(tf.pack(projected_var))
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Bad dimension for q_sqrt: %s" % str(ndims))

    return fmean, fvar
def conditional(Xnew, X, kern, f, num_columns, full_cov=False, q_sqrt=None, whiten=False):
    """
    Given F, representing the GP at the points X, produce the mean and
    (co-)variance of the GP at the points Xnew.

    Additionally, there may be Gaussian uncertainty about F as represented
    by q_sqrt.  In this case `f` represents the mean of the distribution
    and q_sqrt the square-root of the covariance.

    Additionally, the GP may have been centered (whitened) so that

        p(v) = N( 0, I)
        f = L v

    thus

        p(f) = N(0, LL^T) = N(0, K).

    In this case `f` represents the values taken by v.

    The method can either return the diagonals of the covariance matrix for
    each output or the full covariance matrix (full_cov).

    K independent GPs are assumed, represented by the columns of f (and the
    last axis of q_sqrt).

     - Xnew is a data matrix, size N x D
     - X are data points, size M x D
     - kern is a GPflow kernel
     - f is a data matrix, M x K, representing the function values at X
     - num_columns is an integer number of columns in the f matrix (must
       match q_sqrt's last dimension)
     - (optional) q_sqrt is a matrix of standard-deviations or Cholesky
       matrices, size M x K or M x M x K
     - (optional) whiten is a boolean: whether to whiten the representation
       as described above

    Raises ValueError if q_sqrt is given but is neither 2- nor
    3-dimensional (and num_columns > 0).

    These functions are now considered deprecated, subsumed into this one
    function:
        gp_predict
        gaussian_gp_predict
        gp_predict_whitened
        gaussian_gp_predict_whitened
    """
    # Kernel quantities.
    num_data = tf.shape(X)[0]
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * 1e-6
    Lm = tf.cholesky(Kmm)

    # Projection matrix A.
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # Covariance due to the conditioning, tiled once per column of f.
    if full_cov:
        fvar = kern.K(Xnew) - tf.matmul(tf.transpose(A), A)
        fvar = tf.tile(tf.expand_dims(fvar, 2), [1, 1, num_columns])
    else:
        fvar = kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.expand_dims(fvar, 1), [1, num_columns])

    # Another back-substitution in the unwhitened case.
    if not whiten:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # Conditional mean.
    fmean = tf.matmul(tf.transpose(A), f)

    # Extra projected variance from q(f), if needed.
    if q_sqrt is not None:
        # The rank of q_sqrt does not change between columns; look it up once.
        ndims = q_sqrt.get_shape().ndims
        projected_var = []
        for d in range(num_columns):
            if ndims == 2:
                LTA = A * q_sqrt[:, d:d + 1]
            elif ndims == 3:
                L = tf.user_ops.triangle(q_sqrt[:, :, d], 'lower')
                LTA = tf.matmul(tf.transpose(L), A)
            else:  # pragma no cover
                # Call form of raise is valid on Python 2 and Python 3.
                raise ValueError("Bad dimension for q_sqrt: %s" % str(ndims))
            if full_cov:
                projected_var.append(tf.matmul(tf.transpose(LTA), LTA))
            else:
                projected_var.append(tf.reduce_sum(tf.square(LTA), 0))
        fvar = fvar + tf.transpose(tf.pack(projected_var))

    return fmean, fvar
def gaussian_gp_predict(Xnew, X, kern, q_mu, q_sqrt, num_columns):
    r"""
    Given an (approximate) posterior (via q_mu, q_sqrt) to the GP at the
    points X, produce the mean and variance of the GP at the points Xnew.

    K independent GPs are assumed, represented by the columns of q_mu (and
    the last axis of q_sqrt).  q_mu and q_sqrt are variational posteriors
    for f, so

        q(f[:,i]) = N( q_mu[:,i], diag(q_sqrt[:,i]**2) )
    or
        q(f[:,i]) = N( q_mu, W W^T )

    where W is the lower triangle of q_sqrt[:,:,i].

    This function computes the Gaussian integral

        q(f*) = \int p(f*|f) q(f) df.

    Xnew is a data matrix, size N x D
    X are inducing points, size M x D
    q_mu are variational means, size M x K
    q_sqrt are variational standard-deviations or Cholesky matrices,
        size M x K or M x M x K
    num_columns is the number of columns in q_mu.

    Note (and TODO): at the moment, num_columns only gets used for the
    q_sqrt.ndim==3 case, where it gives the value of q_sqrt.shape()[2].  We
    need to find a way to get this from the tf graph.

    Raises ValueError if q_sqrt is neither 2- nor 3-dimensional.

    See also:
        gp_predict -- where there is no uncertainty in F
        gaussian_gp_predict_whitened -- the same, but with whitening
            (centering) the F variables
    """
    # Kernel quantities.
    num_data = tf.shape(X)[0]
    Kdiag = kern.Kdiag(Xnew)
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * 1e-6
    Lm = tf.cholesky(Kmm)

    # This is O(N M^2).
    A = tf.user_ops.triangular_solve(Lm, Kmn, 'lower')
    # B is Kmm^{-1} Kmn.
    B = tf.user_ops.triangular_solve(tf.transpose(Lm), A, 'upper')

    # Mean and variance of q(f*).
    fmean = tf.matmul(tf.transpose(B), q_mu)
    fvar = Kdiag - tf.reduce_sum(tf.square(A), 0)
    fvar = tf.expand_dims(fvar, 1)

    ndims = q_sqrt.get_shape().ndims
    if ndims == 2:
        # Diagonal form for q(f).
        fvar = fvar + tf.reduce_sum(
            tf.square(tf.expand_dims(tf.transpose(B), 2)
                      * tf.expand_dims(q_sqrt, 0)), 1)
    elif ndims == 3:
        # Cholesky form for q(f).
        projected_var = []
        for d in range(num_columns):
            L = tf.user_ops.triangle(q_sqrt[:, :, d], 'lower')
            LTB = tf.matmul(tf.transpose(L), B)
            projected_var.append(tf.reduce_sum(tf.square(LTB), 0))
        fvar = fvar + tf.transpose(tf.pack(projected_var))
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Bad dimension for q_sqrt: %s" % str(ndims))

    return fmean, fvar
def gaussian_gp_predict_whitened(Xnew, X, kern, q_mu, q_sqrt, num_columns):
    r"""
    Given an (approximate) posterior (via q_mu, q_sqrt) to the GP at the
    points X, produce the mean and variance of the GP at the points Xnew.

    Additionally, the GP has been centered (whitened) so that

        p(v) = N( 0, I)
        f = L v

    thus

        p(f) = N(0, LL^T) = N(0, K).

    K independent GPs are assumed, represented by the columns of q_mu (and
    the last axis of q_sqrt).  q_mu and q_sqrt are variational posteriors
    for v, so

        q(v[:,i]) = N( q_mu[:,i], diag(q_sqrt[:,i]**2) )
        q(f[:,i]) = N( L q_mu[:,i], L diag(q_sqrt**2) L^T )
    or
        q(f[:,i]) = N( L q_mu, L [W W^T] L^T )

    where W is the lower triangle of q_sqrt[:,:,i].

    This function computes the Gaussian integral

        q(f*) = \int p(f*|(f=Lv)) q(v) df.

    Xnew is a data matrix, size N x D
    X are data points, size M x D
    q_mu are variational means, size M x K
    q_sqrt are variational standard-deviations or Cholesky matrices,
        size M x K or M x M x K

    Note (and TODO): at the moment, num_columns only gets used for the
    q_sqrt.ndim==3 case, where it gives the value of q_sqrt.shape()[2].  We
    need to find a way to get this from the tf graph.

    Raises ValueError if q_sqrt is neither 2- nor 3-dimensional.

    See also:
        gp_predict_whitened -- where there is no uncertainty in V
        gaussian_gp_predict -- same without the whitening
    """
    # Kernel quantities.
    num_data = tf.shape(X)[0]
    Kdiag = kern.Kdiag(Xnew)
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * 1e-6
    Lm = tf.cholesky(Kmm)

    # This is O(N M^2).
    A = tf.user_ops.triangular_solve(Lm, Kmn, 'lower')

    # Mean of q(f*).
    fmean = tf.matmul(tf.transpose(A), q_mu)

    ndims = q_sqrt.get_shape().ndims
    if ndims == 2:
        # Diagonal form for q(v).  tf.square (not np.square) keeps the
        # computation inside the TensorFlow graph.
        q_var = tf.square(q_sqrt)
        # In index notation:
        #   Kdiag[:, None] + sum_m A.T[:, m, None]**2 * (q_var[None, m, :] - 1)
        fvar = tf.reshape(Kdiag, (-1, 1)) + tf.reduce_sum(
            tf.expand_dims(tf.square(tf.transpose(A)), 2)
            * (tf.expand_dims(q_var, 0) - 1.0), 1)
        return fmean, fvar
    elif ndims == 3:
        # Cholesky form for q(v).
        fvar = Kdiag - tf.reduce_sum(tf.square(A), 0)
        projected_var = []
        for d in range(num_columns):
            L = tf.user_ops.triangle(q_sqrt[:, :, d], 'lower')
            LTA = tf.matmul(tf.transpose(L), A)
            projected_var.append(fvar + tf.reduce_sum(tf.square(LTA), 0))
        fvar = tf.transpose(tf.pack(projected_var))
        return fmean, fvar
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Bad dimension for q_sqrt: %s" % str(ndims))
def conditional(Xnew, X, kern, f, num_columns, full_cov=False, q_sqrt=None, whiten=False):
    """
    Given F, representing the GP at the points X, produce the mean and
    (co-)variance of the GP at the points Xnew.

    Additionally, there may be Gaussian uncertainty about F as represented
    by q_sqrt.  In this case `f` represents the mean of the distribution
    and q_sqrt the square-root of the covariance.

    Additionally, the GP may have been centered (whitened) so that

        p(v) = N( 0, I)
        f = L v

    thus

        p(f) = N(0, LL^T) = N(0, K).

    In this case `f` represents the values taken by v.

    The method can either return the diagonals of the covariance matrix for
    each output or the full covariance matrix (full_cov).

    K independent GPs are assumed, represented by the columns of f (and the
    last axis of q_sqrt).

     - Xnew is a data matrix, size N x D
     - X are data points, size M x D
     - kern is a GPflow kernel
     - f is a data matrix, M x K, representing the function values at X
     - num_columns is an integer number of columns in the f matrix (must
       match q_sqrt's last dimension)
     - (optional) q_sqrt is a matrix of standard-deviations or Cholesky
       matrices, size M x K or M x M x K
     - (optional) whiten is a boolean: whether to whiten the representation
       as described above

    Raises ValueError if q_sqrt is given but is neither 2- nor
    3-dimensional (and num_columns > 0).

    These functions are now considered deprecated, subsumed into this one
    function:
        gp_predict
        gaussian_gp_predict
        gp_predict_whitened
        gaussian_gp_predict_whitened
    """
    # Kernel quantities.
    num_data = tf.shape(X)[0]
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * 1e-6
    Lm = tf.cholesky(Kmm)

    # Projection matrix A.
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # Covariance due to the conditioning, tiled once per column of f.
    if full_cov:
        fvar = kern.K(Xnew) - tf.matmul(tf.transpose(A), A)
        fvar = tf.tile(tf.expand_dims(fvar, 2), [1, 1, num_columns])
    else:
        fvar = kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.expand_dims(fvar, 1), [1, num_columns])

    # Another back-substitution in the unwhitened case.
    if not whiten:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # Conditional mean.
    fmean = tf.matmul(tf.transpose(A), f)

    # Extra projected variance from q(f), if needed.
    if q_sqrt is not None:
        # The rank of q_sqrt does not change between columns; look it up once.
        ndims = q_sqrt.get_shape().ndims
        projected_var = []
        for d in range(num_columns):
            if ndims == 2:
                LTA = A * q_sqrt[:, d:d + 1]
            elif ndims == 3:
                # Keep only the lower triangle of the d-th slice.
                L = tf.batch_matrix_band_part(q_sqrt[:, :, d], -1, 0)
                LTA = tf.matmul(tf.transpose(L), A)
            else:  # pragma no cover
                # Call form of raise is valid on Python 2 and Python 3.
                raise ValueError("Bad dimension for q_sqrt: %s" % str(ndims))
            if full_cov:
                projected_var.append(tf.matmul(tf.transpose(LTA), LTA))
            else:
                projected_var.append(tf.reduce_sum(tf.square(LTA), 0))
        fvar = fvar + tf.transpose(tf.pack(projected_var))

    return fmean, fvar