# Shared imports for the snippets below.
import numpy as np
import tensorflow as tf

import gpflow
from gpflow import Parameter, covariances
from gpflow import covariances as covs
from gpflow.base import TensorType
from gpflow.config import default_jitter
from gpflow.inducing_variables import InducingPoints
from gpflow.kullback_leiblers import gauss_kl as gauss_kl_gpflow
from gpflow.utilities import triangular


def reparameterize(mean, var, z, full_cov=False):
    """
    Implements the 'reparameterization trick' for the Gaussian, either full rank or diagonal.

    If z is a sample from N(0, 1), the output is a sample from N(mean, var).
    If full_cov=True then var must be of shape S,N,N,D and the full covariance is used.
    Otherwise var must be S,N,D and the operation is elementwise.

    :param mean: mean of shape S,N,D
    :param var: covariance of shape S,N,D or S,N,N,D
    :param z: samples from unit Gaussian of shape S,N,D
    :param full_cov: bool to indicate whether var is of shape S,N,N,D or S,N,D
    :return: sample from N(mean, var) of shape S,N,D
    """
    if var is None:
        return mean

    if not full_cov:
        return mean + z * (var + gpflow.default_jitter()) ** 0.5
    else:
        S, N, D = tf.shape(mean)[0], tf.shape(mean)[1], tf.shape(mean)[2]  # var is SNND
        mean = tf.transpose(mean, (0, 2, 1))  # SND -> SDN
        var = tf.transpose(var, (0, 3, 1, 2))  # SNND -> SDNN
        I = gpflow.default_jitter() * tf.eye(N, dtype=gpflow.default_float())[None, None, :, :]  # 11NN
        chol = tf.linalg.cholesky(var + I)  # SDNN
        z_SDN1 = tf.transpose(z, [0, 2, 1])[:, :, :, None]  # SND -> SDN1
        f = mean + tf.matmul(chol, z_SDN1)[:, :, :, 0]  # SDN(1)
        return tf.transpose(f, (0, 2, 1))  # SND
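# A minimal usage sketch of `reparameterize` in the diagonal case: turning
# unit-Gaussian draws z into draws from N(mean, var). Shapes follow the
# docstring; the concrete sizes here are illustrative only.
S, N, D = 3, 5, 2
mean_demo = tf.zeros((S, N, D), dtype=gpflow.default_float())
var_demo = tf.ones((S, N, D), dtype=gpflow.default_float())
z_demo = tf.random.normal((S, N, D), dtype=gpflow.default_float())
samples = reparameterize(mean_demo, var_demo, z_demo)  # [S, N, D], here approximately N(0, 1)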
def get_pred_Y_approx(m, by_K=False):
    pred_Y = np.zeros((m.N, m.D))
    if by_K:
        pred_Y_k = np.zeros((m.N, m.D, m.K))

    # fs(xk): shared-space contribution
    Kmm_s = gpflow.covariances.Kuu(m.Zs, m.kernel_s, jitter=gpflow.default_jitter())
    Kmn_s = gpflow.covariances.Kuf(m.Zs, m.kernel_s, m.Xs_mean)
    pred_s = (tf.transpose(Kmn_s) @ tf.linalg.inv(Kmm_s) @ m.q_mu_s).numpy()

    # fk(xk): per-component contributions, weighted by the assignment probabilities
    for k in range(m.K):
        kernel = m.kernel_K[k]
        Kmm = gpflow.covariances.Kuu(m.Zp, kernel, jitter=gpflow.default_jitter())
        Kmn = gpflow.covariances.Kuf(m.Zp, kernel, m.Xp_mean)
        pred = tf.transpose(Kmn) @ tf.linalg.inv(Kmm) @ m.q_mu[k]  # [N, D]
        if by_K:
            pred_Y_k[..., k] = pred.numpy()
        assignment = m.pi.numpy()[:, k]
        pred_Y += pred.numpy() * np.stack([assignment for _ in range(m.D)], axis=1)

    pred_Y += pred_s
    if by_K:
        return pred_Y, pred_Y_k, pred_s
    else:
        return pred_Y
def klu(m):
    KL_u = 0
    prior_Kuu = np.zeros((m.M, m.M))
    if m.split_space:
        prior_Kuu += gpflow.covariances.Kuu(m.Zs, m.kernel_s, jitter=gpflow.default_jitter())
    for k in range(2):  # hardcoded for two mixture components
        prior_Kuu_k = gpflow.covariances.Kuu(m.Zp, m.kernel_K[k], jitter=gpflow.default_jitter())
        KL_u += gpflow.kullback_leiblers.gauss_kl(q_mu=m.q_mu[k], q_sqrt=m.q_sqrt[k],
                                                  K=prior_Kuu + prior_Kuu_k)
    return KL_u
def predict_f(self, Xnew, full_cov=False):
    M = tf.shape(self.X)[0]
    K = self.kernel.K(self.X)
    Phi = tf.nn.softmax(self.logPhi)
    # squash Phi away from 0 and 1 to avoid numerical errors
    Phi = (1 - 2e-6) * Phi + 1e-6
    sigma2 = self.likelihood.variance
    # jitter the diagonal before factorizing, for numerical stability
    L = tf.linalg.cholesky(K + tf.eye(M, dtype=gpflow.default_float()) * gpflow.default_jitter())
    W = tf.transpose(L) * tf.sqrt(tf.math.reduce_sum(Phi, 0)) / tf.sqrt(sigma2)
    P = tf.linalg.matmul(W, tf.transpose(W)) + tf.eye(M, dtype=gpflow.default_float())
    R = tf.linalg.cholesky(P)
    PhiY = tf.linalg.matmul(tf.transpose(Phi), self.Y)
    LPhiY = tf.linalg.matmul(tf.transpose(L), PhiY)
    c = tf.linalg.triangular_solve(R, LPhiY, lower=True) / sigma2
    Kus = self.kernel.K(self.X, Xnew)
    tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)
    tmp2 = tf.linalg.triangular_solve(R, tmp1, lower=True)
    mean = tf.linalg.matmul(tf.transpose(tmp2), c)
    if full_cov:
        var = (self.kernel.K(Xnew)
               + tf.linalg.matmul(tf.transpose(tmp2), tmp2)
               - tf.linalg.matmul(tf.transpose(tmp1), tmp1))
        shape = tf.stack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        var = (self.kernel.K_diag(Xnew)
               + tf.math.reduce_sum(tf.math.square(tmp2), 0)
               - tf.math.reduce_sum(tf.math.square(tmp1), 0))
        shape = tf.stack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean, var
def build_cholesky_if_needed(self):
    # make sure we only compute this once
    if self.needs_build_cholesky:
        self.Ku = covs.Kuu(self.feature, self.kern, jitter=gpflow.default_jitter())
        self.Lu = tf.linalg.cholesky(self.Ku)
        self.Ku_tiled = tf.tile(self.Ku[None, :, :], [self.num_outputs, 1, 1])
        self.Lu_tiled = tf.tile(self.Lu[None, :, :], [self.num_outputs, 1, 1])
        self.needs_build_cholesky = False
def __call__(self, Xnew, full_cov=False, full_output_cov=False):
    q_mu = self.q_mu  # M x K x O
    q_sqrt = self.q_sqrt  # K x O x M x M
    Kuu = covariances.Kuu(self.inducing_variables, self.kernel,
                          jitter=default_jitter())  # K x M x M
    Kuf = covariances.Kuf(self.inducing_variables, self.kernel, Xnew)  # K x M x N
    Knn = self.kernel.K(Xnew, full_output_cov=False)
def __init__(self, kern, Z, num_outputs, mean_function,
             white=False, input_prop_dim=None, **kwargs):
    """
    A sparse variational GP layer in whitened representation. This layer holds the
    kernel, variational parameters, inducing points and mean function.

    The underlying model at inputs X is
        f = L v + mean_function(X), where v ~ N(0, I) and L L^T = kern.K(X)

    The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)

    The layer holds D_out independent GPs with the same kernel and inducing points.

    :param kern: the kernel for the layer (input_dim = D_in)
    :param Z: inducing points (M, D_in)
    :param num_outputs: the number of GP outputs (q_mu is shape (M, num_outputs))
    :param mean_function: the mean function
    """
    super().__init__(input_prop_dim=input_prop_dim, **kwargs)
    self.num_inducing = Z.shape[0]

    # variational mean over the inducing outputs
    q_mu = np.zeros((self.num_inducing, num_outputs))
    self.q_mu = Parameter(q_mu, name="q_mu")

    # square root of the variational covariance over the inducing outputs
    q_sqrt = np.tile(np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
    self.q_sqrt = Parameter(q_sqrt, transform=triangular(), name="q_sqrt")

    self.feature = InducingPoints(Z)
    self.kern = kern
    self.mean_function = mean_function
    self.num_outputs = num_outputs
    self.white = white

    if not self.white:  # initialize q_sqrt to the prior Cholesky
        Ku = self.kern.K(Z)
        Lu = np.linalg.cholesky(Ku + np.eye(Z.shape[0]) * gpflow.default_jitter())
        self.q_sqrt = Parameter(np.tile(Lu[None, :, :], [num_outputs, 1, 1]),
                                transform=triangular(), name="q_sqrt")

    self.Ku, self.Lu, self.Ku_tiled, self.Lu_tiled = None, None, None, None
    self.needs_build_cholesky = True
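# Numerical sanity check of the whitened parameterization in the docstring
# above: if v ~ N(0, I) and L L^T = K, then f = L v has covariance K. A
# minimal sketch; the toy kernel matrix and sample count are illustrative.
def _whitening_demo(num_samples=100000):
    demo_rng = np.random.default_rng(0)
    X = demo_rng.standard_normal((4, 1))
    K = np.exp(-0.5 * (X - X.T) ** 2) + 1e-8 * np.eye(4)  # toy SE kernel matrix
    L = np.linalg.cholesky(K)
    v = demo_rng.standard_normal((4, num_samples))
    f = L @ v  # whitened samples mapped through L: f ~ N(0, K)
    emp_cov = f @ f.T / num_samples
    return np.max(np.abs(emp_cov - K))  # small, up to Monte Carlo error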
# Test fixture; `rng`, `make_sqrt` and `make_K_batch` are helpers defined
# elsewhere in the test module.
class Datum:
    M, N = 5, 4
    mu = rng.randn(M, N)  # [M, N]
    A = rng.randn(M, M)
    I = np.eye(M)  # [M, M]
    K = A @ A.T + default_jitter() * I  # [M, M]
    sqrt = make_sqrt(N, M)  # [N, M, M]
    sqrt_diag = rng.randn(M, N)  # [M, N]
    K_batch = make_K_batch(N, M)
    K_cholesky = np.linalg.cholesky(K)
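# Sketch of how such a fixture is typically consumed in a test: a
# shape/symmetry consistency check. This test is illustrative, not part of
# the original suite.
def test_datum_shapes():
    assert Datum.mu.shape == (Datum.M, Datum.N)
    assert Datum.K.shape == (Datum.M, Datum.M)
    assert np.allclose(Datum.K, Datum.K.T)  # K is symmetric PD by construction
    assert np.allclose(Datum.K_cholesky @ Datum.K_cholesky.T, Datum.K)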
def _cholesky_with_jitter(cov: TensorType) -> tf.Tensor:
    """
    Compute the Cholesky of the covariance, adding jitter (determined by
    :func:`gpflow.default_jitter`) to the diagonal to improve stability.

    :param cov: full covariance with shape ``[..., N, D, D]``.
    """
    cov_shape = tf.shape(cov)
    batch_shape = cov_shape[:-2]
    D = cov_shape[-2]
    jittermat = default_jitter() * tf.eye(D, batch_shape=batch_shape, dtype=cov.dtype)  # [..., N, D, D]
    return tf.linalg.cholesky(cov + jittermat)  # [..., N, D, D]
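# A minimal usage sketch of `_cholesky_with_jitter` on a batch of D x D
# covariances; the batch shape [2, 4] and D = 3 are illustrative only.
covs_batch = tf.eye(3, batch_shape=[2, 4], dtype=tf.float64)  # [2, 4, 3, 3]
chol = _cholesky_with_jitter(covs_batch)  # [2, 4, 3, 3] lower-triangular factors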
def maximum_log_likelihood_objective(self):
    print("assignegp_dense compiling model (build_likelihood)")
    N = tf.cast(tf.shape(self.Y)[0], dtype=gpflow.default_float())
    M = tf.shape(self.X)[0]
    D = tf.cast(tf.shape(self.Y)[1], dtype=gpflow.default_float())
    if self.KConst is not None:
        K = tf.cast(self.KConst, gpflow.default_float())
    else:
        K = self.kernel.K(self.X)
    Phi = tf.nn.softmax(self.logPhi)
    # squash Phi away from 0 and 1 to avoid numerical errors
    Phi = (1 - 2e-6) * Phi + 1e-6
    sigma2 = self.likelihood.variance
    tau = 1.0 / self.likelihood.variance
    # jitter the diagonal before factorizing, for numerical stability
    L = tf.linalg.cholesky(K + tf.eye(M, dtype=gpflow.default_float()) * gpflow.default_jitter())
    W = tf.transpose(L) * tf.sqrt(tf.reduce_sum(Phi, 0)) / tf.sqrt(sigma2)
    P = tf.linalg.matmul(W, tf.transpose(W)) + tf.eye(M, dtype=gpflow.default_float())
    R = tf.linalg.cholesky(P)
    PhiY = tf.linalg.matmul(tf.transpose(Phi), self.Y)
    LPhiY = tf.linalg.matmul(tf.transpose(L), PhiY)
    if self.fDebug:
        tf.print(Phi, [tf.shape(P), P], name="P", summarize=10)
        tf.print(Phi, [tf.shape(LPhiY), LPhiY], name="LPhiY", summarize=10)
        tf.print(Phi, [tf.shape(K), K], name="K", summarize=10)
        tf.print(Phi, [tau], name="tau", summarize=10)
    c = tf.linalg.triangular_solve(R, LPhiY, lower=True) / sigma2

    # compute KL
    KL = self.build_KL(Phi)

    a1 = -0.5 * N * D * tf.math.log(2.0 * np.pi / tau)
    a2 = -0.5 * D * tf.math.reduce_sum(tf.math.log(tf.math.square(tf.linalg.diag_part(R))))
    a3 = -0.5 * tf.math.reduce_sum(tf.math.square(self.Y)) / sigma2
    a4 = +0.5 * tf.math.reduce_sum(tf.math.square(c))
    a5 = -KL
    if self.fDebug:
        tf.print(a1, [a1], name="a1=")
        tf.print(a2, [a2], name="a2=")
        tf.print(a3, [a3], name="a3=")
        tf.print(a4, [a4], name="a4=")
        tf.print(a5, [a5, Phi], name="a5 and Phi=", summarize=10)
    return a1 + a2 + a3 + a4 + a5
def maximum_log_likelihood_objective(self):
    if self.fDebug:
        print("assignegp_denseSparse compiling model (build_likelihood)")
    N = tf.cast(tf.shape(self.Y)[0], dtype=gpflow.default_float())
    M = tf.shape(self.ZExpanded)[0]
    D = tf.cast(tf.shape(self.Y)[1], dtype=gpflow.default_float())
    Phi = tf.nn.softmax(self.logPhi)
    # try squashing Phi to avoid numerical errors
    Phi = (1 - 2e-6) * Phi + 1e-6
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(self.likelihood.variance)
    Kuu = (self.kernel.K(self.ZExpanded)
           + tf.eye(M, dtype=gpflow.default_float()) * gpflow.default_jitter())
    Kuf = self.kernel.K(self.ZExpanded, self.X)
    Kdiag = self.kernel.K_diag(self.X)
    L = tf.linalg.cholesky(Kuu)
    A = tf.math.reduce_sum(Phi, 0)
    LiKuf = tf.linalg.triangular_solve(L, Kuf)
    W = LiKuf * tf.sqrt(A) / sigma
    P = tf.linalg.matmul(W, tf.transpose(W)) + tf.eye(M, dtype=gpflow.default_float())
    traceTerm = (-0.5 * tf.math.reduce_sum(Kdiag * A) / sigma2
                 + 0.5 * tf.math.reduce_sum(tf.math.square(W)))
    R = tf.linalg.cholesky(P)
    tmp = tf.linalg.matmul(LiKuf, tf.linalg.matmul(tf.transpose(Phi), self.Y))
    c = tf.linalg.triangular_solve(R, tmp, lower=True) / sigma2
    if self.fDebug:
        # trace term should be 0 for Z=X (full data)
        tf.print([traceTerm], name="traceTerm", summarize=10)
    self.bound = (traceTerm
                  - 0.5 * N * D * tf.math.log(2 * np.pi * sigma2)
                  - 0.5 * D * tf.math.reduce_sum(tf.math.log(tf.math.square(tf.linalg.diag_part(R))))
                  - 0.5 * tf.math.reduce_sum(tf.math.square(self.Y)) / sigma2
                  + 0.5 * tf.math.reduce_sum(tf.math.square(c))
                  - self.build_KL(Phi))
    return self.bound
def gauss_kl(q_mu, q_sqrt, K=None):
    """
    Wrapper for gauss_kl from gpflow that returns the negative log prob if q_sqrt
    is None. This can be used in HMC: all that is required is to set q_sqrt to
    None, and this function substitutes the negative log prob for the KL (so
    there is no need to set q_mu.prior = gpflow.priors.Gaussian(0, 1)). It also
    allows the use of HMC in the unwhitened case.
    """
    if q_sqrt is None:
        # negative log prob of q_mu as 'x', with mean 0 and covariance K (or I if K is None)
        M, D = tf.shape(q_mu)[0], tf.shape(q_mu)[1]
        I = tf.eye(M, dtype=q_mu.dtype)
        if K is None:
            L = I
        else:
            L = tf.linalg.cholesky(K + I * gpflow.default_jitter())
        return -tf.reduce_sum(
            gpflow.logdensities.multivariate_normal(q_mu, tf.zeros_like(q_mu), L))
    else:
        # return the KL
        return gauss_kl_gpflow(q_mu, q_sqrt, K=K)
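# A minimal usage sketch of the wrapper above; the sizes are illustrative.
# With q_sqrt=None the returned value is the negative log density of q_mu
# under N(0, K); with a proper q_sqrt it falls through to gpflow's gauss_kl.
M_demo, D_demo = 6, 2
q_mu_demo = tf.zeros((M_demo, D_demo), dtype=gpflow.default_float())
K_demo = tf.eye(M_demo, dtype=gpflow.default_float())
neg_log_prob = gauss_kl(q_mu_demo, None, K=K_demo)  # scalar, HMC-style usage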
def K(self, X, Y=None):
    if Y is None:
        Y = X  # hack to avoid duplicating code below

    if self.fDebug:
        print("Compiling kernel")
    t1s = tf.expand_dims(X[:, 0], 1)  # N X 1
    t2s = tf.expand_dims(Y[:, 0], 1)
    i1s_r = tf.expand_dims(X[:, 1], 1)
    i2s_r = tf.expand_dims(Y[:, 1], 1)
    if self.fDebug:
        snl = 10  # how many entries to print
        i1s = i1s_r
        i2s = i2s_r
        tf.print([tf.shape(i1s_r), i1s_r], name="i1sdebug", summarize=snl)
        tf.print([tf.shape(i2s_r), i2s_r], name="i2sdebug", summarize=snl)
    else:
        i1s = i1s_r
        i2s = i2s_r

    i1s_matrix = tf.tile(i1s, tf.reverse(tf.shape(i2s), [0]))
    i2s_matrix = tf.tile(i2s, tf.reverse(tf.shape(i1s), [0]))
    i2s_matrixT = tf.transpose(i2s_matrix)

    Ktts = self.kern.K(t1s, t2s)  # N*M X N*M
    with tf.name_scope("kttscope"):
        same_functions = tf.equal(i1s_matrix, tf.transpose(i2s_matrix), name="FiEQFj")
        # just setup matrix with block diagonal
        K_s = tf.where(same_functions, Ktts, Ktts, name="selectFiEQFj")

    m = self.fm.shape[0]
    for fi in range(m):
        for fj in range(m):
            if fi != fj:
                with tf.name_scope("f" + str(fi) + "f" + str(fj)):
                    # much easier to remove nans before tensorflow
                    bnan = self.fm[fi, fj, ~np.isnan(self.fm[fi, fj, :])]
                    fi_s = tf.constant(fi + 1, tf.int32, name="function" + str(fi))
                    fj_s = tf.constant(fj + 1, tf.int32, name="function" + str(fj))

                    i1s_matrixInt = tf.cast(i1s_matrix, tf.int32, name="casti1s")
                    i2s_matrixTInt = tf.cast(i2s_matrixT, tf.int32, name="casti2s")

                    fiFilter = fi_s * tf.ones_like(i1s_matrixInt, tf.int32, name="fiFilter")
                    # must be transpose
                    fjFilter = fj_s * tf.ones_like(i2s_matrixTInt, tf.int32, name="fjFilter")

                    f1F = tf.equal(i1s_matrixInt, fiFilter, name="indexF" + str(fi))
                    f2F = tf.equal(i2s_matrixTInt, fjFilter, name="indexF" + str(fj))
                    t12F = tf.logical_and(f1F, f2F, name="F" + str(fi) + "andF" + str(fj))

                    # Get the actual values of the Bs = B[index of relevant branching points]
                    bint = bnan.astype(int)  # convert to int - set of indexes
                    if self.fDebug:
                        Br = self.Bv
                        tf.print([tf.shape(self.Bv), self.Bv], name="Bv", summarize=3)
                    else:
                        Br = self.Bv
                    Bs = tf.concat([tf.slice(Br, [i - 1, 0], [1, 1]) for i in bint], 0)

                    kbb = (self.kern.K(Bs)
                           + tf.linalg.diag(tf.ones(tf.shape(Bs)[:1], dtype=gpflow.default_float()))
                           * gpflow.default_jitter())
                    if self.fDebug:
                        tf.print([tf.shape(kbb), kbb], name="kbb", summarize=10)
                        tf.print([self.kern.lengthscales.numpy()], name="lengthscales", summarize=10)
                        tf.print([self.kern.variance.numpy()], name="variance", summarize=10)
                        tf.print([Bs], name="Bs", summarize=10)

                    Kbbs_inv = tf.linalg.inv(kbb, name="invKbb")  # B X B
                    Kb1s = self.kern.K(t1s, Bs)  # N*m X B
                    Kb2s = self.kern.K(t2s, Bs)  # N*m X B

                    a = tf.linalg.matmul(Kb1s, Kbbs_inv)
                    K_crosss = tf.linalg.matmul(a, tf.transpose(Kb2s), name="Kt1_Bi_invBB_KBt2")
                    K_s = tf.where(t12F, K_crosss, K_s, name="selectIndex")
    return K_s
def vaele_jitter():
    return gpflow.default_jitter()