def gauss_kl(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)

    to

        p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular
    square-root matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    num_latent = tf.cast(tf.shape(q_sqrt)[2], tf.float64)
    KL += num_latent * 0.5 * tf.reduce_sum(tf.log(tf.square(
        tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]),
                         tf.float64)  # constant term
    Lq = tf.batch_matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)),
                                   -1, 0)  # force lower triangle
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(
        tf.batch_matrix_diag_part(Lq))))  # logdet
    L_tiled = tf.tile(tf.expand_dims(L, 0), tf.pack([tf.shape(Lq)[0], 1, 1]))
    LiLq = tf.batch_matrix_triangular_solve(L_tiled, Lq, lower=True)
    KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term
    return KL

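# A minimal NumPy sketch for cross-checking gauss_kl above (illustrative only;
# gauss_kl_np is not part of the library). It evaluates the closed form
#   KL[N(mu, S) || N(0, K)] = 0.5 * (tr(K^-1 S) + mu^T K^-1 mu - n + log|K| - log|S|)
# summed over the independent columns, with S = Lq Lq^T.
import numpy as np

def gauss_kl_np(q_mu, q_sqrt, K):
    n, num_latent = q_mu.shape
    Kinv = np.linalg.inv(K)
    _, logdet_K = np.linalg.slogdet(K)
    kl = 0.0
    for d in range(num_latent):
        Lq = np.tril(q_sqrt[:, :, d])     # lower-triangular square root of S
        S = Lq.dot(Lq.T)                  # covariance of q for this column
        mu = q_mu[:, d]
        logdet_S = 2.0 * np.sum(np.log(np.abs(np.diag(Lq))))
        kl += 0.5 * (np.trace(Kinv.dot(S)) + mu.dot(Kinv).dot(mu)
                     - n + logdet_K - logdet_S)
    return kl
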
def gauss_kl(q_mu, q_sqrt, K, num_latent):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)

    to

        p(x) = N(0, K)

    We assume num_latent independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular
    square-root matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.

    num_latent is an integer: the number of independent distributions (equal
    to the columns of q_mu and the last dim of q_sqrt).
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)
    for d in range(num_latent):
        Lq = tf.batch_matrix_band_part(q_sqrt[:, :, d], -1, 0)
        # Log determinant of q covariance:
        KL += -0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(Lq))))
        LiLq = tf.matrix_triangular_solve(L, Lq, lower=True)
        KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term
    return KL

def gauss_kl_white(q_mu, q_sqrt):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)

    to

        p(x) = N(0, I)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular
    square-root matrix of the covariance.
    """
    KL = 0.5 * tf.reduce_sum(tf.square(q_mu))  # Mahalanobis term
    KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]),
                         tf.float64)  # constant term
    L = tf.batch_matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)),
                                  -1, 0)  # force lower triangle
    KL -= 0.5 * tf.reduce_sum(tf.log(tf.square(
        tf.batch_matrix_diag_part(L))))  # logdet
    KL += 0.5 * tf.reduce_sum(tf.square(L))  # Trace term.
    return KL

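# For reference: with K = I the closed form used above reduces to
#   KL[N(mu, S) || N(0, I)] = 0.5 * (tr(S) + mu^T mu - n - log|S|),
# which is exactly the trace, Mahalanobis, constant and log-determinant terms
# accumulated in gauss_kl_white.
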
def _random_cholesky_array(self, shape):
    mat = self._rng.rand(*shape)
    chol = distributions.batch_matrix_diag_transform(
        mat, transform=tf.nn.softplus)
    # Zero the upper triangle because we're using this as a true Cholesky factor
    # in our tests.
    return tf.batch_matrix_band_part(chol, -1, 0).eval()

def gauss_kl_white(q_mu, q_sqrt, num_latent):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)

    to

        p(x) = N(0, I)

    We assume num_latent independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular
    square-root matrix of the covariance.

    num_latent is an integer: the number of independent distributions (equal
    to the columns of q_mu and the last dim of q_sqrt).
    """
    KL = 0.5 * tf.reduce_sum(tf.square(q_mu))  # Mahalanobis term
    KL += -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)
    for d in range(num_latent):
        Lq = tf.batch_matrix_band_part(q_sqrt[:, :, d], -1, 0)
        # Log determinant of q covariance:
        KL -= 0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(Lq))))
        KL += 0.5 * tf.reduce_sum(tf.square(Lq))  # Trace term.
    return KL

def _random_chol(self, *shape):
    mat = self._rng.rand(*shape)
    chol = distributions.batch_matrix_diag_transform(
        mat, transform=tf.nn.softplus)
    chol = tf.batch_matrix_band_part(chol, -1, 0)
    sigma = tf.batch_matmul(chol, chol, adj_y=True)
    return chol.eval(), sigma.eval()

def CheckUnitary(self, x):
    # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
    xx = tf.batch_matmul(x, x, adj_x=True)
    identity = tf.batch_matrix_band_part(tf.ones_like(xx), 0, 0)
    if dtype_ == np.float32:
        tol = 1e-5
    else:
        tol = 1e-14
    self.assertAllClose(identity.eval(), xx.eval(), atol=tol)

def CheckUnitary(self, x):
    # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
    xx = tf.batch_matmul(x, x, adj_x=True)
    identity = tf.batch_matrix_band_part(tf.ones_like(xx), 0, 0)
    if is_single:
        tol = 1e-5
    else:
        tol = 1e-14
    self.assertAllClose(identity.eval(), xx.eval(), atol=tol)

def CheckUnitary(self, x):
    # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
    xx = tf.batch_matmul(x, x, adj_x=True)
    identity = tf.batch_matrix_band_part(tf.ones_like(xx), 0, 0)
    if dtype_ in (np.float32, np.complex64):
        tol = 1e-5
    else:
        tol = 1e-14
    self.assertAllClose(np.real(identity.eval()), np.real(xx.eval()),
                        atol=tol)
    self.assertAllClose(np.imag(identity.eval()), np.imag(xx.eval()),
                        atol=tol)

def Test(self):
    shape = batch_shape_ + shape_
    x = tf.constant(np.random.rand(*shape), dtype=dtype_)
    with self.test_session(use_gpu=use_gpu_):
        for lower in -1, 0, 1, shape_[-2] - 1:
            for upper in -1, 0, 1, shape_[-1] - 1:
                y = tf.batch_matrix_band_part(x, lower, upper)
                error = tf.test.compute_gradient_error(
                    x, x.get_shape().as_list(), y, y.get_shape().as_list())
                self.assertLess(error, 1e-4)

def CheckUnitary(self, x):
    # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
    xx = tf.batch_matmul(x, x, adj_x=True)
    identity = tf.batch_matrix_band_part(tf.ones_like(xx), 0, 0)
    # Any decent SVD code should produce singular vectors that are
    # orthonormal to (almost) full machine precision.
    if dtype_ == np.float32:
        atol = 5e-6
    else:
        atol = 1e-14
    self.assertAllClose(identity.eval(), xx.eval(), atol=atol)

def CheckUnitary(self, x):
    # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
    xx = tf.batch_matmul(x, x, adj_x=True)
    identity = tf.batch_matrix_band_part(tf.ones_like(xx), 0, 0)
    if is_single:
        tol = 1e-5
    else:
        tol = 1e-14
    self.assertAllClose(np.real(identity.eval()), np.real(xx.eval()),
                        atol=tol)
    self.assertAllClose(np.imag(identity.eval()), np.imag(xx.eval()),
                        atol=tol)

def Test(self):
    mat = np.ones(shape_).astype(dtype_)
    batch_mat = np.tile(mat, batch_shape + (1, 1))
    with self.test_session(use_gpu=use_gpu_):
        for lower in -1, 0, 1, shape_[-2] - 1:
            for upper in -1, 0, 1, shape_[-1] - 1:
                band_np = mat
                if lower >= 0:
                    band_np = np.triu(band_np, -lower)
                if upper >= 0:
                    band_np = np.tril(band_np, upper)
                if batch_shape != ():
                    band_np = np.tile(band_np, batch_shape + (1, 1))
                band = tf.batch_matrix_band_part(batch_mat, lower, upper)
                self.assertAllEqual(band_np, band.eval())

def _multi_head(self, queries, keys, query_mask, key_mask, num_heads,
                block_feature=False, scope='multihead', reuse=None):
    with vs.variable_scope(scope, reuse=reuse):
        # batch_size * seq_size_q * num_units
        Q = rnn_cell._linear(tf.reshape(queries, [-1, self.num_units]),
                             self.num_units, True, 1.0, scope='Q')
        Q = tf.reshape(Q, tf.shape(queries))
        # batch_size * seq_size_k * num_units
        K = rnn_cell._linear(tf.reshape(keys, [-1, self.num_units]),
                             self.num_units, True, 1.0, scope='K')
        K = tf.reshape(K, tf.shape(keys))
        V = rnn_cell._linear(tf.reshape(keys, [-1, self.num_units]),
                             self.num_units, True, 1.0, scope='V')
        V = tf.reshape(V, tf.shape(keys))

        Q_ = tf.pack(tf.split(2, num_heads, Q))  # num_heads * batch_size * seq_size_q * num_units/num_heads
        K_ = tf.pack(tf.split(2, num_heads, K))  # num_heads * batch_size * seq_size_k * num_units/num_heads
        V_ = tf.pack(tf.split(2, num_heads, V))  # num_heads * batch_size * seq_size_k * num_units/num_heads

        len_q = tf.shape(queries)[1]
        len_k = tf.shape(keys)[1]

        # Compute weights
        weights = tf.batch_matmul(Q_, tf.transpose(K_, [0, 1, 3, 2])) \
            / ((self.num_units / num_heads) ** 0.5)  # num_heads * batch_size * seq_size_q * seq_size_k
        key_mask = tf.tile(tf.reshape(key_mask, [1, -1, 1, len_k]),
                           [num_heads, 1, len_q, 1])
        weights = tf.select(key_mask, weights,
                            tf.ones_like(weights) * (-2 ** 32 + 1))
        if block_feature:
            diag_vals = tf.ones_like(weights[0, 0, :, :])  # seq_size_q * seq_size_k
            mask = tf.cast(tf.batch_matrix_band_part(diag_vals, -1, 0), tf.bool)
            mask = tf.tile(tf.reshape(mask, [1, 1, len_q, len_k]),
                           [num_heads, tf.shape(queries)[0], 1, 1])
            weights = tf.select(mask, weights,
                                tf.ones_like(weights) * (-2 ** 32 + 1))
        weights = tf.reshape(tf.nn.softmax(tf.reshape(weights, [-1, len_k])),
                             [num_heads, -1, len_q, len_k])

        # num_heads * batch_size * seq_size_q * num_units/num_heads
        ctx = tf.batch_matmul(weights, V_)
        ctx *= tf.reshape(tf.cast(query_mask, tf.float32), [-1, len_q, 1])
        # num_heads * batch_size * seq_size_q * num_units/num_heads
        ctx = tf.concat(2, tf.unpack(ctx))  # batch_size * seq_size_q * num_units
        ctx = rnn_cell._linear(tf.reshape(ctx, [-1, self.num_units]),
                               self.num_units, True, 1.0, scope='context')
        ctx = tf.reshape(ctx, [-1, len_q, self.num_units])
        drop_ctx = tf.nn.dropout(ctx, keep_prob=self.keep_prob)
        # Add and normalization
        res = layer_normalization(drop_ctx + queries)
        return res, weights

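# A small NumPy illustration (for this example only; names are hypothetical) of the
# causal mask built above with tf.batch_matrix_band_part(diag_vals, -1, 0): keeping
# only the lower triangle means query position i may attend to key positions j <= i,
# with masked-out scores pushed to a large negative value before the softmax.
import numpy as np

len_q, len_k = 4, 4
causal_mask = np.tril(np.ones((len_q, len_k), dtype=bool))
scores = np.random.randn(len_q, len_k)
masked_scores = np.where(causal_mask, scores, -2.0 ** 32 + 1)  # same fill value as above
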
def vec2lower_triangle(vec, dim):
    """
    Convert a vector M of size (n * m) into a matrix of shape (n, m):

    [[e^M[0],    0,           0,          ..., 0]
     [M[n-1],    e^M[n],      0,          ..., 0]
     [M[2n-1],   M[2n],       e^M[2n+1],  ..., 0]
     ...
     [M[m(n-1)], M[m(n-1)+1], ..., M[mn-2], e^M[mn-1]]]
    """
    L = tf.reshape(vec, [-1, dim, dim])
    if int(tf.__version__.split('.')[1]) >= 10:
        L = tf.matrix_band_part(L, -1, 0) - tf.matrix_diag(
            tf.matrix_diag_part(L)) + tf.matrix_diag(
            tf.exp(tf.matrix_diag_part(L)))
    else:
        L = tf.batch_matrix_band_part(L, -1, 0) - tf.batch_matrix_diag(
            tf.batch_matrix_diag_part(L)) + tf.batch_matrix_diag(
            tf.exp(tf.batch_matrix_diag_part(L)))
    return L

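# A NumPy sketch of the same transform (illustrative only; vec2lower_triangle_np is a
# hypothetical helper): reshape the vector into square matrices, keep the lower
# triangle, and exponentiate the diagonal so each matrix is a valid Cholesky factor
# with a strictly positive diagonal.
import numpy as np

def vec2lower_triangle_np(vec, dim):
    L = np.asarray(vec, dtype=float).reshape(-1, dim, dim)
    L = np.tril(L)                       # zero the upper triangle
    idx = np.arange(dim)
    L[:, idx, idx] = np.exp(L[:, idx, idx])  # positive diagonal via exp
    return L
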
def _sample(self, N):
    """
    :param integer N: number of samples
    :Returns: samples picked from the variational posterior.
    The Kullback-Leibler divergence is stored as self._KL
    """
    n = self.num_data
    R = self.num_latent
    # Match dimension of the posterior variance to the data.
    if self.q_diag:
        sqrt = tf.batch_matrix_diag(tf.transpose(self.q_sqrt))  # [R,n,n]
    else:
        sqrt = tf.batch_matrix_band_part(
            tf.transpose(self.q_sqrt, [2, 0, 1]), -1, 0)  # [R,n,n]
    # Log determinant of matrix S = q_sqrt * q_sqrt^T
    logdet_S = tf.cast(N, float_type) * tf.reduce_sum(
        tf.log(tf.square(tf.batch_matrix_diag_part(sqrt))))
    sqrt = tf.tile(tf.expand_dims(sqrt, 1), [1, N, 1, 1])  # [R,N,n,n]
    # Normal random samples, [R,N,n,1]
    v_samples = tf.random_normal([R, N, n, 1], dtype=float_type)
    # Match dimension of the posterior mean, [R,N,n,1]
    mu = tf.tile(tf.expand_dims(tf.expand_dims(
        tf.transpose(self.q_mu), 1), -1), [1, N, 1, 1])
    u_samples = mu + tf.batch_matmul(sqrt, v_samples)
    # Stochastic approximation of the Kullback-Leibler divergence KL[q(f)||p(f)]
    self._KL = - 0.5 * logdet_S \
               - 0.5 * tf.reduce_sum(tf.square(v_samples)) \
               + 0.5 * tf.reduce_sum(tf.square(u_samples))
    # Cholesky factor of kernel [R,N,n,n]
    L = tf.tile(tf.expand_dims(
        tf.transpose(self.kern.Cholesky(self.X), [2, 0, 1]), 1), [1, N, 1, 1])
    # Mean, sized [N,n,R]
    mean = tf.tile(tf.expand_dims(
        self.mean_function(self.X), 0), [N, 1, 1])
    # Sample from the posterior, [N,n,R]
    f_samples = tf.transpose(
        tf.squeeze(tf.batch_matmul(L, u_samples), [-1]),  # [R,N,n]
        [1, 2, 0]) + mean
    return f_samples

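# For reference, a sketch of the estimator behind self._KL above: with u = mu + sqrt*v,
# v ~ N(0, I), and a whitened prior p(u) = N(0, I), a single-sample Monte Carlo estimate
# of KL[q || p] = E_q[log q(u) - log p(u)] is
#   -0.5 * log|S| - 0.5 * v^T v + 0.5 * u^T u      (the normalising constants cancel),
# and the code accumulates this over all N samples with tf.reduce_sum.
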
def conditional(Xnew, X, kern, f, num_columns, full_cov=False, q_sqrt=None,
                whiten=False):
    """
    Given F, representing the GP at the points X, produce the mean and
    (co-)variance of the GP at the points Xnew.

    Additionally, there may be Gaussian uncertainty about F as represented by
    q_sqrt. In this case `f` represents the mean of the distribution and
    q_sqrt the square-root of the covariance.

    Additionally, the GP may have been centered (whitened) so that

        p(v) = N(0, I)
        f = L v

    thus

        p(f) = N(0, LL^T) = N(0, K).

    In this case 'f' represents the values taken by v.

    The method can either return the diagonals of the covariance matrix for
    each output, or the full covariance matrix (full_cov).

    We assume K independent GPs, represented by the columns of f (and the
    last dimension of q_sqrt).

    - Xnew is a data matrix, size N x D
    - X are data points, size M x D
    - kern is a GPflow kernel
    - f is a data matrix, M x K, representing the function values at X.
    - num_columns is an integer number of columns in the f matrix (must match
      q_sqrt's last dimension)
    - q_sqrt (optional) is a matrix of standard-deviations or Cholesky
      matrices, size M x K or M x M x K
    - whiten (optional) is a boolean: whether to whiten the representation
      as described above.

    These functions are now considered deprecated, subsumed into this one:
        gp_predict
        gaussian_gp_predict
        gp_predict_whitened
        gaussian_gp_predict_whitened
    """
    # compute kernel stuff
    num_data = tf.shape(X)[0]
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * 1e-6
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = kern.K(Xnew) - tf.matmul(tf.transpose(A), A)
        fvar = tf.tile(tf.expand_dims(fvar, 2), [1, 1, num_columns])
    else:
        fvar = kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.expand_dims(fvar, 1), [1, num_columns])

    # another backsubstitution in the unwhitened case
    if not whiten:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(tf.transpose(A), f)

    # add extra projected variance from q(f) if needed
    if q_sqrt is not None:
        projected_var = []
        for d in range(num_columns):
            if q_sqrt.get_shape().ndims == 2:
                LTA = A * q_sqrt[:, d:d + 1]
            elif q_sqrt.get_shape().ndims == 3:
                L = tf.batch_matrix_band_part(q_sqrt[:, :, d], -1, 0)
                LTA = tf.matmul(tf.transpose(L), A)
            else:  # pragma: no cover
                raise ValueError("Bad dimension for q_sqrt: %s" %
                                 str(q_sqrt.get_shape().ndims))
            if full_cov:
                projected_var.append(tf.matmul(tf.transpose(LTA), LTA))
            else:
                projected_var.append(tf.reduce_sum(tf.square(LTA), 0))
        fvar = fvar + tf.transpose(tf.pack(projected_var))

    return fmean, fvar

def vec2trimat(vec, dim):
    L = tf.reshape(vec, [-1, dim, dim])
    L = tf.batch_matrix_band_part(L, -1, 0) \
        - tf.batch_matrix_diag(tf.batch_matrix_diag_part(L)) \
        + tf.batch_matrix_diag(tf.exp(tf.batch_matrix_diag_part(L)))
    return L

def conditional(Xnew, X, kern, f, full_cov=False, q_sqrt=None, whiten=False):
    """
    Given F, representing the GP at the points X, produce the mean and
    (co-)variance of the GP at the points Xnew.

    Additionally, there may be Gaussian uncertainty about F as represented by
    q_sqrt. In this case `f` represents the mean of the distribution and
    q_sqrt the square-root of the covariance.

    Additionally, the GP may have been centered (whitened) so that

        p(v) = N(0, I)
        f = L v

    thus

        p(f) = N(0, LL^T) = N(0, K).

    In this case 'f' represents the values taken by v.

    The method can either return the diagonals of the covariance matrix for
    each output, or the full covariance matrix (full_cov).

    We assume K independent GPs, represented by the columns of f (and the
    last dimension of q_sqrt).

    - Xnew is a data matrix, size N x D
    - X are data points, size M x D
    - kern is a GPflow kernel
    - f is a data matrix, M x K, representing the function values at X, for
      K functions.
    - q_sqrt (optional) is a matrix of standard-deviations or Cholesky
      matrices, size M x K or M x M x K
    - whiten (optional) is a boolean: whether to whiten the representation
      as described above.

    These functions are now considered deprecated, subsumed into this one:
        gp_predict
        gaussian_gp_predict
        gp_predict_whitened
        gaussian_gp_predict_whitened
    """
    # compute kernel stuff
    num_data = tf.shape(X)[0]
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * settings.numerics.jitter_level
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = kern.K(Xnew) - tf.matmul(A, A, transpose_a=True)
        shape = tf.pack([tf.shape(f)[1], 1, 1])
    else:
        fvar = kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        shape = tf.pack([tf.shape(f)[1], 1])
    fvar = tf.tile(tf.expand_dims(fvar, 0), shape)  # D x N x N or D x N

    # another backsubstitution in the unwhitened case
    if not whiten:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(tf.transpose(A), f)

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # D x M x N
        elif q_sqrt.get_shape().ndims == 3:
            L = tf.batch_matrix_band_part(
                tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # D x M x M
            A_tiled = tf.tile(tf.expand_dims(A, 0),
                              tf.pack([tf.shape(f)[1], 1, 1]))
            LTA = tf.batch_matmul(L, A_tiled, adj_x=True)  # D x M x N
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.batch_matmul(LTA, LTA, adj_x=True)  # D x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # D x N
    fvar = tf.transpose(fvar)  # N x D or N x N x D

    return fmean, fvar

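# For reference, the standard GP conditioning identities that conditional() implements
# (a summary sketch, not additional code): with A = Lm^{-1} Kmn,
#   mean(Xnew) = A^T f     (after the extra backsubstitution, A = Kmm^{-1} Kmn in the
#                           unwhitened case)
#   cov(Xnew)  = Knn - A^T A
# and when q(f) = N(f, q_sqrt q_sqrt^T), the projected term A^T q_sqrt q_sqrt^T A is
# added to the covariance, which is what the LTA terms compute column by column.
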
def conditional(Xnew, X, kern, f, num_columns, full_cov=False, q_sqrt=None,
                whiten=False):
    """
    Given F, representing the GP at the points X, produce the mean and
    (co-)variance of the GP at the points Xnew.

    Additionally, there may be Gaussian uncertainty about F as represented by
    q_sqrt. In this case `f` represents the mean of the distribution and
    q_sqrt the square-root of the covariance.

    Additionally, the GP may have been centered (whitened) so that

        p(v) = N(0, I)
        f = L v

    thus

        p(f) = N(0, LL^T) = N(0, K).

    In this case 'f' represents the values taken by v.

    The method can either return the diagonals of the covariance matrix for
    each output, or the full covariance matrix (full_cov).

    We assume K independent GPs, represented by the columns of f (and the
    last axis of q_sqrt).

    Xnew is a data matrix, size N x D
    X are data points, size M x D
    kern is a GPflow kernel
    f is a data matrix, M x K, representing the function values at X.
    num_columns is an integer number of columns in the f matrix (must match
        q_sqrt's last dimension)
    (optional) q_sqrt is a matrix of standard-deviations or Cholesky
        matrices, size M x K or M x M x K
    (optional) whiten is a boolean: whether to whiten the representation
        as described above.

    These functions are now considered deprecated, subsumed into this one
    function:
        gp_predict
        gaussian_gp_predict
        gp_predict_whitened
        gaussian_gp_predict_whitened
    """
    # compute kernel stuff
    num_data = tf.shape(X)[0]
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + eye(num_data) * 1e-6
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = kern.K(Xnew) - tf.matmul(tf.transpose(A), A)
        fvar = tf.tile(tf.expand_dims(fvar, 2), [1, 1, num_columns])
    else:
        fvar = kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.expand_dims(fvar, 1), [1, num_columns])

    # another backsubstitution in the unwhitened case
    if not whiten:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(tf.transpose(A), f)

    # add extra projected variance from q(f) if needed
    if q_sqrt is not None:
        projected_var = []
        for d in range(num_columns):
            if q_sqrt.get_shape().ndims == 2:
                LTA = A * q_sqrt[:, d:d + 1]
            elif q_sqrt.get_shape().ndims == 3:
                L = tf.batch_matrix_band_part(q_sqrt[:, :, d], -1, 0)
                LTA = tf.matmul(tf.transpose(L), A)
            else:  # pragma: no cover
                raise ValueError("Bad dimension for q_sqrt: %s" %
                                 str(q_sqrt.get_shape().ndims))
            if full_cov:
                projected_var.append(tf.matmul(tf.transpose(LTA), LTA))
            else:
                projected_var.append(tf.reduce_sum(tf.square(LTA), 0))
        fvar = fvar + tf.transpose(tf.pack(projected_var))

    return fmean, fvar

def conditional(Xnew, X, kern, f, full_cov=False, q_sqrt=None, whiten=False):
    """
    Given F, representing the GP at the points X, produce the mean and
    (co-)variance of the GP at the points Xnew.

    Additionally, there may be Gaussian uncertainty about F as represented by
    q_sqrt. In this case `f` represents the mean of the distribution and
    q_sqrt the square-root of the covariance.

    Additionally, the GP may have been centered (whitened) so that

        p(v) = N(0, I)
        f = L v

    thus

        p(f) = N(0, LL^T) = N(0, K).

    In this case 'f' represents the values taken by v.

    The method can either return the diagonals of the covariance matrix for
    each output, or the full covariance matrix (full_cov).

    We assume K independent GPs, represented by the columns of f (and the
    last dimension of q_sqrt).

    - Xnew is a data matrix, size n x D
    - X are data points, size m x D
    - kern is a GPinv kernel
    - f is a data matrix, m x R, representing the function values at X, for
      R functions.
    - q_sqrt (optional) is a matrix of standard-deviations or Cholesky
      matrices, size m x R or m x m x R
    - whiten (optional) is a boolean: whether to whiten the representation
      as described above.

    These functions are now considered deprecated, subsumed into this one:
        gp_predict
        gaussian_gp_predict
        gp_predict_whitened
        gaussian_gp_predict_whitened
    """
    # compute kernel stuff
    num_data = tf.shape(X)[0]
    Kmn = tf.transpose(kern.K(X, Xnew), [2, 0, 1])  # [R,n,n2]
    Lm = tf.transpose(kern.Cholesky(X), [2, 0, 1])  # [R,n,n]

    # Compute the projection matrix A
    A = tf.batch_matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        # shape [R,n,n]
        fvar = tf.transpose(kern.K(Xnew), [2, 0, 1]) - tf.matmul(
            A, A, transpose_a=True)
    else:
        # shape [R,n]
        fvar = tf.transpose(kern.Kdiag(Xnew)) - tf.reduce_sum(tf.square(A), 1)

    # another backsubstitution in the unwhitened case
    if not whiten:
        A = tf.batch_matrix_triangular_solve(tf.transpose(Lm, [0, 2, 1]),
                                             A, lower=False)

    # change shape of f: [m,R] -> [R,m,1]
    f = tf.expand_dims(tf.transpose(f), -1)
    # construct the conditional mean, sized [m,R]
    fmean = tf.transpose(
        tf.squeeze(tf.batch_matmul(tf.transpose(A, [0, 2, 1]), f), [-1]))

    if q_sqrt is not None:
        # diagonal case.
        if q_sqrt.get_shape().ndims == 2:
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # R x m x n
        # full covariance case.
        elif q_sqrt.get_shape().ndims == 3:
            L = tf.batch_matrix_band_part(
                tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # D x M x M
            LTA = tf.batch_matmul(L, A, adj_x=True)  # R x m x n
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.batch_matmul(LTA, LTA, adj_x=True)  # R x n x n
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # R x n
    fvar = tf.transpose(fvar)  # n x R or n x n x R

    return fmean, fvar