def predict_f(self,
              Xnew: InputData,
              full_cov: bool = False,
              full_output_cov: bool = False) -> MeanAndVariance:
    """
    Compute the mean and variance of the latent function at some new points.
    Note that this is very similar to the SGPR prediction, for which there
    are notes in the SGPR notebook.

    Note: This model does not allow full output covariances.

    :param Xnew: points at which to predict
    """
    if full_output_cov:
        raise NotImplementedError

    pX = DiagonalGaussian(self.X_data_mean, self.X_data_var)
    Y_data = self.data
    num_inducing = self.inducing_variable.num_inducing

    psi1 = expectation(pX, (self.kernel, self.inducing_variable))
    psi2 = tf.reduce_sum(
        expectation(pX, (self.kernel, self.inducing_variable),
                    (self.kernel, self.inducing_variable)),
        axis=0,
    )
    jitter = default_jitter()
    Kus = covariances.Kuf(self.inducing_variable, self.kernel, Xnew)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)
    L = tf.linalg.cholesky(
        covariances.Kuu(self.inducing_variable, self.kernel, jitter=jitter))

    A = tf.linalg.triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    tmp = tf.linalg.triangular_solve(L, psi2, lower=True)
    AAT = tf.linalg.triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
    B = AAT + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    c = tf.linalg.triangular_solve(LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma
    tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)
    tmp2 = tf.linalg.triangular_solve(LB, tmp1, lower=True)
    mean = tf.linalg.matmul(tmp2, c, transpose_a=True)
    if full_cov:
        var = (self.kernel(Xnew)
               + tf.linalg.matmul(tmp2, tmp2, transpose_a=True)
               - tf.linalg.matmul(tmp1, tmp1, transpose_a=True))
        shape = tf.stack([1, 1, tf.shape(Y_data)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        var = (self.kernel(Xnew, full_cov=False)
               + tf.reduce_sum(tf.square(tmp2), axis=0)
               - tf.reduce_sum(tf.square(tmp1), axis=0))
        shape = tf.stack([1, tf.shape(Y_data)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean + self.mean_function(Xnew), var
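For reference, the chain of triangular solves above evaluates the collapsed SGPR-style predictive equations with the kernel statistics replaced by their expectations under q(X). Written out from the code (a summary, not quoted from an external derivation), with Psi1 = E_q(X)[K_fu] and Psi2 = sum_n E_q(x_n)[K_uf K_fu]:

```latex
\mu_* = \sigma^{-2}\, K_{*u}\left(K_{uu} + \sigma^{-2}\Psi_2\right)^{-1}\Psi_1^\top Y,
\qquad
\operatorname{var}(f_*) = k(x_*, x_*)
  - K_{*u}\left(K_{uu}^{-1} - \left(K_{uu} + \sigma^{-2}\Psi_2\right)^{-1}\right)K_{u*}.
```

The intermediate tensors `L`, `LB`, `c`, `tmp1`, `tmp2` are just a Cholesky-based evaluation of these two expressions.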
def compute_qu(self, full_cov: bool = True) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Computes the mean and variance of q(u) = N(mu, cov), the variational
    distribution on inducing outputs. SVGP with this q(u) should predict
    identically to SGPR.

    The derivation is as follows, writing beta = sigma^{-2}:

        q(u) = N(u | m, S), with
        S^{-1} = Kuu^{-1} + beta * Kuu^{-1} Kuf Kfu Kuu^{-1}
        m      = beta * S Kuu^{-1} Kuf y

    :return: mu, cov
    """
    Y_data = self.data
    X_data_mean, X_data_var = self.encoder(Y_data)
    pX = DiagonalGaussian(X_data_mean, X_data_var)
    # num_inducing = self.inducing_variable.num_inducing

    # E_q(x)[Kfu]
    psi1 = expectation(pX, (self.kernel, self.inducing_variable))
    # E_q(x)[Kuf @ Kfu]
    psi2 = tf.reduce_sum(
        expectation(pX, (self.kernel, self.inducing_variable),
                    (self.kernel, self.inducing_variable)),
        axis=0)

    kuu = covariances.Kuu(self.inducing_variable, self.kernel, jitter=default_jitter())
    kuf = tf.transpose(psi1)

    sig = kuu + psi2 * (self.likelihood.variance ** -1)
    sig_sqrt = tf.linalg.cholesky(sig)
    sig_sqrt_kuu = tf.linalg.triangular_solve(sig_sqrt, kuu)

    # cov is a full [M, M] matrix (not a packed [M(M+1)//2] triangle or an [M, D] array),
    # shared across the D output dimensions
    cov = tf.linalg.matmul(sig_sqrt_kuu, sig_sqrt_kuu, transpose_a=True)

    err = Y_data - self.mean_function(X_data_mean)
    mu = (tf.linalg.matmul(sig_sqrt_kuu,
                           tf.linalg.triangular_solve(sig_sqrt, tf.linalg.matmul(kuf, err)),
                           transpose_a=True)
          / self.likelihood.variance)

    if not full_cov:
        return mu, cov
    else:
        return mu, tf.tile(cov[None, :, :], [mu.shape[-1], 1, 1])
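As a sanity check of the docstring's claim, the computed q(u) can be loaded into a standard `gpflow.models.SVGP` and its predictive mean compared against `custom_predict_f` below. This is a minimal sketch, not part of the model: it assumes an already-trained instance `model` of the encoder-based class these methods belong to, and that the latent dimension can be read off the inducing points.

```python
import numpy as np
import tensorflow as tf
import gpflow

# Sketch only: `model` is an assumed, trained instance of the class above.
q_mu, q_cov = model.compute_qu(full_cov=False)            # [M, D], [M, M]
q_sqrt = tf.tile(tf.linalg.cholesky(q_cov)[None, :, :],
                 [q_mu.shape[-1], 1, 1])                   # [D, M, M], one copy per output

svgp = gpflow.models.SVGP(
    kernel=model.kernel,
    likelihood=model.likelihood,
    inducing_variable=model.inducing_variable,
    mean_function=model.mean_function,
    num_latent_gps=q_mu.shape[-1],
    q_mu=q_mu.numpy(),
    q_sqrt=q_sqrt.numpy(),
    whiten=False,
)

Q = model.inducing_variable.Z.shape[1]                     # latent dimensionality
Xnew = np.random.randn(7, Q)
mean_svgp, _ = svgp.predict_f(Xnew)
mean_custom, _ = model.custom_predict_f(Xnew)
np.testing.assert_allclose(mean_svgp.numpy(), mean_custom.numpy(), rtol=1e-5, atol=1e-5)
```

Only the means are compared here: `custom_predict_f` returns the covariance of q(u) rather than the predictive variance at `Xnew`.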
def custom_predict_f(self,
                     Xnew: InputData,
                     full_cov: bool = False,
                     full_output_cov: bool = False) -> MeanAndVariance:
    """
    Compute the mean of the latent function at some new points, using the
    q(u) returned by `compute_qu`. Note that this is very similar to the
    SGPR prediction, for which there are notes in the SGPR notebook.

    Note: This model does not allow full output covariances.

    :param Xnew: points at which to predict
    """
    if full_output_cov:
        raise NotImplementedError

    Y_data = self.data
    X_data_mean, X_data_var = self.encoder(Y_data)
    pX = DiagonalGaussian(X_data_mean, X_data_var)

    mu, cov = self.compute_qu()

    jitter = default_jitter()
    Kus = covariances.Kuf(self.inducing_variable, self.kernel, Xnew)
    L = tf.linalg.cholesky(
        covariances.Kuu(self.inducing_variable, self.kernel, jitter=jitter))

    # the covariance of q(u) is returned unchanged as the variance
    var = cov
    tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)  # L^{-1} K_{us}
    tmp2 = tf.linalg.triangular_solve(L, mu, lower=True)   # L^{-1} m
    # K_{su} L^{-T} L^{-1} m = K_{su} Kuu^{-1} m
    mean = tf.linalg.matmul(tmp1, tmp2, transpose_a=True)
    return mean + self.mean_function(Xnew), var
def dirac_diag():
    return DiagonalGaussian(
        tf.convert_to_tensor(Data.Xmu),
        tf.convert_to_tensor(np.zeros((Data.num_data, Data.D_in))))
def gauss_diag():
    return DiagonalGaussian(
        tf.convert_to_tensor(Data.Xmu),
        tf.convert_to_tensor(rng.rand(Data.num_data, Data.D_in)))
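These fixtures pair naturally with `expectation`: with the zero-variance ("Dirac") distribution the psi statistics collapse to plain kernel evaluations, which is what the tests exploit. A minimal sketch, assuming the `Data` class defined further below and GPflow's analytic expectations for the squared-exponential kernel:

```python
import numpy as np
import gpflow
from gpflow.expectations import expectation

kernel = gpflow.kernels.SquaredExponential()
Z = np.random.randn(3, Data.D_in)
iv = gpflow.inducing_variables.InducingPoints(Z)

psi1_dirac = expectation(dirac_diag(), (kernel, iv))   # [num_data, 3]
psi1_gauss = expectation(gauss_diag(), (kernel, iv))   # smoothed by the input variances

# with zero input variance the expectation is just the kernel matrix
np.testing.assert_allclose(psi1_dirac, kernel(Data.Xmu, Z), rtol=1e-6)
```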
def uncertain_conditional_diag(
    Xnew_mu: tf.Tensor,
    Xnew_var: tf.Tensor,
    inducing_variable: InducingVariables,
    kernel: Kernel,
    q_mu,
    q_sqrt,
    *,
    mean_function=None,
    full_output_cov=False,
    full_cov=False,
    white=False,
):
    """
    Calculates the conditional for uncertain inputs Xnew, p(Xnew) = N(Xnew_mu, Xnew_var).
    See ``conditional`` documentation for further reference.

    :param Xnew_mu: mean of the inputs, size [N, D_in]
    :param Xnew_var: diagonal of the covariance matrices of the inputs, size [N, D_in]
    :param inducing_variable: gpflow.InducingVariable object, only InducingPoints is supported
    :param kernel: gpflow kernel object.
    :param q_mu: mean inducing points, size [M, Dout]
    :param q_sqrt: cholesky of the covariance matrix of the inducing points, size [t, M, M]
    :param full_output_cov: boolean whether to compute covariances between output dimensions.
        Influences the shape of return value ``fvar``. Default is False
    :param white: boolean whether to use whitened representation. Default is False.

    :return fmean, fvar: mean and covariance of the conditional; ``fmean`` is [N, Dout],
        the size of ``fvar`` depends on ``full_output_cov``: if True ``fvar`` is
        [N, t, t], if False then ``fvar`` is [N, Dout]
    """
    if not isinstance(inducing_variable, InducingPoints):
        raise NotImplementedError

    if full_cov:
        raise NotImplementedError(
            "uncertain_conditional() currently does not support full_cov=True")

    pXnew = DiagonalGaussian(Xnew_mu, Xnew_var)

    num_data = tf.shape(Xnew_mu)[0]  # number of new inputs (N)
    num_ind, num_func = tf.unstack(
        tf.shape(q_mu), num=2, axis=0)  # number of inducing points (M), output dimension (D)

    q_sqrt_r = tf.linalg.band_part(q_sqrt, -1, 0)  # [D, M, M], lower-triangular part

    eKuf = tf.transpose(expectation(pXnew, (kernel, inducing_variable)))  # [M, N] (psi1)
    Kuu = covariances.Kuu(inducing_variable, kernel, jitter=default_jitter())  # [M, M]
    Luu = tf.linalg.cholesky(Kuu)  # [M, M]

    if not white:
        q_mu = tf.linalg.triangular_solve(Luu, q_mu, lower=True)
        Luu_tiled = tf.tile(Luu[None, :, :], [num_func, 1, 1])  # remove line once issue 216 is fixed
        q_sqrt_r = tf.linalg.triangular_solve(Luu_tiled, q_sqrt_r, lower=True)

    Li_eKuf = tf.linalg.triangular_solve(Luu, eKuf, lower=True)  # [M, N]
    fmean = tf.linalg.matmul(Li_eKuf, q_mu, transpose_a=True)

    eKff = expectation(pXnew, kernel)  # [N] (psi0)
    eKuffu = expectation(pXnew, (kernel, inducing_variable),
                         (kernel, inducing_variable))  # [N, M, M] (psi2)
    Luu_tiled = tf.tile(Luu[None, :, :], [num_data, 1, 1])  # remove this line, once issue 216 is fixed
    Li_eKuffu = tf.linalg.triangular_solve(Luu_tiled, eKuffu, lower=True)
    Li_eKuffu_Lit = tf.linalg.triangular_solve(Luu_tiled, tf.linalg.adjoint(Li_eKuffu),
                                               lower=True)  # [N, M, M]
    cov = tf.linalg.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True)  # [D, M, M]

    if mean_function is None or isinstance(mean_function, mean_functions.Zero):
        e_related_to_mean = tf.zeros((num_data, num_func, num_func), dtype=default_float())
    else:
        # Update mean: \mu(x) + m(x)
        fmean = fmean + expectation(pXnew, mean_function)

        # Calculate: m(x) m(x)^T + m(x) \mu(x)^T + \mu(x) m(x)^T,
        # where m(x) is the mean_function and \mu(x) is fmean
        e_mean_mean = expectation(pXnew, mean_function, mean_function)  # [N, D, D]
        Lit_q_mu = tf.linalg.triangular_solve(Luu, q_mu, adjoint=True)
        e_mean_Kuf = expectation(pXnew, mean_function, (kernel, inducing_variable))  # [N, D, M]
        # einsum isn't able to infer the rank of e_mean_Kuf, hence we explicitly set the rank of the tensor:
        e_mean_Kuf = tf.reshape(e_mean_Kuf, [num_data, num_func, num_ind])
        e_fmean_mean = tf.einsum("nqm,mz->nqz", e_mean_Kuf, Lit_q_mu)  # [N, D, D]
        e_related_to_mean = e_fmean_mean + tf.linalg.adjoint(e_fmean_mean) + e_mean_mean

    if full_output_cov:
        fvar = (
            tf.linalg.diag(
                tf.tile((eKff - tf.linalg.trace(Li_eKuffu_Lit))[:, None], [1, num_func]))
            + tf.linalg.diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov))
            # tf.linalg.diag(tf.linalg.trace(tf.linalg.matmul(Li_eKuffu_Lit, cov))) +
            + tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu)
            # tf.linalg.matmul(q_mu, tf.linalg.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            - fmean[:, :, None] * fmean[:, None, :]
            + e_related_to_mean)
    else:
        fvar = (
            (eKff - tf.linalg.trace(Li_eKuffu_Lit))[:, None]
            + tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)
            # tf.linalg.diag(tf.linalg.trace(tf.linalg.matmul(Li_eKuffu_Lit, cov))) +
            + tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu)
            # tf.linalg.matmul(q_mu, tf.linalg.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            - fmean ** 2
            + tf.linalg.diag_part(e_related_to_mean))

    return fmean, fvar
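A minimal, self-contained usage sketch of `uncertain_conditional_diag` with made-up shapes (all names and sizes here are illustrative only):

```python
import numpy as np
import tensorflow as tf
import gpflow

N, D_in, M, D_out = 6, 2, 4, 3
rng = np.random.RandomState(0)
dtype = gpflow.default_float()

kernel = gpflow.kernels.SquaredExponential()
iv = gpflow.inducing_variables.InducingPoints(rng.randn(M, D_in))

Xnew_mu = tf.constant(rng.randn(N, D_in), dtype=dtype)
Xnew_var = tf.constant(0.1 + rng.rand(N, D_in), dtype=dtype)   # diagonal input variances
q_mu = tf.constant(rng.randn(M, D_out), dtype=dtype)
q_sqrt = 0.1 * tf.eye(M, batch_shape=[D_out], dtype=dtype)     # [D_out, M, M]

fmean, fvar = uncertain_conditional_diag(
    Xnew_mu, Xnew_var, iv, kernel, q_mu, q_sqrt,
    mean_function=None, full_output_cov=False, white=False)
print(fmean.shape, fvar.shape)   # (6, 3) (6, 3)
```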
_means = {
    "lin": mf.Linear(A=rng.randn(D_in, D_out), b=rng.randn(D_out)),
    "identity": mf.Identity(input_dim=D_in),
    "const": mf.Constant(c=rng.randn(D_out)),
    "zero": mf.Zero(output_dim=D_out),
}

_distrs = {
    "gauss": Gaussian(Xmu, Xcov),
    "dirac_gauss": Gaussian(Xmu, np.zeros((num_data, D_in, D_in))),
    "gauss_diag": DiagonalGaussian(Xmu, rng.rand(num_data, D_in)),
    "dirac_diag": DiagonalGaussian(Xmu, np.zeros((num_data, D_in))),
    "dirac_markov_gauss": MarkovGaussian(Xmu_markov, np.zeros((2, num_data + 1, D_in, D_in))),
    "markov_gauss": markov_gauss(),
}

_kerns = {
    "rbf": kernels.SquaredExponential(variance=rng.rand(), lengthscales=rng.rand() + 1.0),
    "lin": kernels.Linear(variance=rng.rand()),
    "matern":
def gplvm_build_predict(self, Xnew, X_mean, X_var, Y, variance, full_cov=False):
    if X_var is None:
        # SGPR
        num_inducing = len(self.feature)
        err = Y - self.mean_function(X_mean)
        Kuf = self.feature.Kuf(self.kern, X_mean)
        Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
        Kus = self.feature.Kuf(self.kern, Xnew)
        sigma = tf.sqrt(variance)
        L = tf.cholesky(Kuu)

        A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
        B = tf.matmul(A, A, transpose_b=True) + tf.eye(num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        Aerr = tf.matmul(A, err)
        c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
        tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
        tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
        mean = tf.matmul(tmp2, c, transpose_a=True)
        if full_cov:
            var = self.kern.K(Xnew) + tf.matmul(tmp2, tmp2, transpose_a=True) \
                  - tf.matmul(tmp1, tmp1, transpose_a=True)
            shape = tf.stack([1, 1, tf.shape(Y)[1]])
            var = tf.tile(tf.expand_dims(var, 2), shape)
        else:
            var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
                  - tf.reduce_sum(tf.square(tmp1), 0)
            shape = tf.stack([1, tf.shape(Y)[1]])
            var = tf.tile(tf.expand_dims(var, 1), shape)
        return mean + self.mean_function(Xnew), var
    else:
        # GPLVM
        pX = DiagonalGaussian(X_mean, X_var)
        num_inducing = len(self.feature)
        X_cov = tf.matrix_diag(X_var)

        if hasattr(self.kern, 'X_input_dim'):
            psi1 = self.kern.eKxz(self.feature.Z, X_mean, X_cov)
            psi2 = tf.reduce_sum(self.kern.eKzxKxz(self.feature.Z, X_mean, X_cov), 0)
        else:
            psi1 = expectation(pX, (self.kern, self.feature))
            psi2 = tf.reduce_sum(expectation(pX, (self.kern, self.feature),
                                             (self.kern, self.feature)), axis=0)

        Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
        Kus = self.feature.Kuf(self.kern, Xnew)
        sigma2 = variance
        sigma = tf.sqrt(sigma2)
        L = tf.cholesky(Kuu)

        A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
        tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
        AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
        B = AAT + tf.eye(num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        c = tf.matrix_triangular_solve(LB, tf.matmul(A, Y), lower=True) / sigma
        tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
        tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
        mean = tf.matmul(tmp2, c, transpose_a=True)
        if full_cov:
            var = self.kern.K(Xnew) + tf.matmul(tmp2, tmp2, transpose_a=True) \
                  - tf.matmul(tmp1, tmp1, transpose_a=True)
            shape = tf.stack([1, 1, tf.shape(Y)[1]])
            var = tf.tile(tf.expand_dims(var, 2), shape)
        else:
            var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
                  - tf.reduce_sum(tf.square(tmp1), 0)
            shape = tf.stack([1, tf.shape(Y)[1]])
            var = tf.tile(tf.expand_dims(var, 1), shape)
        return mean + self.mean_function(Xnew), var
def gplvm_build_likelihood(self, X_mean, X_var, Y, variance):
    if X_var is None:
        # SGPR
        num_inducing = len(self.feature)
        num_data = tf.cast(tf.shape(Y)[0], settings.float_type)
        output_dim = tf.cast(tf.shape(Y)[1], settings.float_type)

        err = Y - self.mean_function(X_mean)
        Kdiag = self.kern.Kdiag(X_mean)
        Kuf = self.feature.Kuf(self.kern, X_mean)
        Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
        L = tf.cholesky(Kuu)
        sigma = tf.sqrt(variance)

        # Compute intermediate matrices
        A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
        AAT = tf.matmul(A, A, transpose_b=True)
        B = AAT + tf.eye(num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        Aerr = tf.matmul(A, err)
        c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma

        # compute log marginal bound
        bound = -0.5 * num_data * output_dim * np.log(2 * np.pi)
        bound += tf.negative(output_dim) * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
        bound -= 0.5 * num_data * output_dim * tf.log(variance)
        bound += -0.5 * tf.reduce_sum(tf.square(err)) / variance
        bound += 0.5 * tf.reduce_sum(tf.square(c))
        bound += -0.5 * output_dim * tf.reduce_sum(Kdiag) / variance
        bound += 0.5 * output_dim * tf.reduce_sum(tf.matrix_diag_part(AAT))

        return bound
    else:
        X_cov = tf.matrix_diag(X_var)
        pX = DiagonalGaussian(X_mean, X_var)
        num_inducing = len(self.feature)

        if hasattr(self.kern, 'X_input_dim'):
            psi0 = tf.reduce_sum(self.kern.eKdiag(X_mean, X_cov))
            psi1 = self.kern.eKxz(self.feature.Z, X_mean, X_cov)
            psi2 = tf.reduce_sum(self.kern.eKzxKxz(self.feature.Z, X_mean, X_cov), 0)
        else:
            psi0 = tf.reduce_sum(expectation(pX, self.kern))
            psi1 = expectation(pX, (self.kern, self.feature))
            psi2 = tf.reduce_sum(expectation(pX, (self.kern, self.feature),
                                             (self.kern, self.feature)), axis=0)

        Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
        L = tf.cholesky(Kuu)
        sigma2 = variance
        sigma = tf.sqrt(sigma2)

        # Compute intermediate matrices
        A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
        tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
        AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
        B = AAT + tf.eye(num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        log_det_B = 2. * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
        c = tf.matrix_triangular_solve(LB, tf.matmul(A, Y), lower=True) / sigma

        # KL[q(x) || p(x)]
        # dX_var = self.X_var if len(self.X_var.get_shape()) == 2 else tf.matrix_diag_part(self.X_var)
        # NQ = tf.cast(tf.size(self.X_mean), settings.float_type)
        D = tf.cast(tf.shape(Y)[1], settings.float_type)
        # KL = -0.5 * tf.reduce_sum(tf.log(dX_var)) \
        #      + 0.5 * tf.reduce_sum(tf.log(self.X_prior_var)) \
        #      - 0.5 * NQ \
        #      + 0.5 * tf.reduce_sum((tf.square(self.X_mean - self.X_prior_mean) + dX_var) / self.X_prior_var)

        # compute log marginal bound
        ND = tf.cast(tf.size(Y), settings.float_type)
        bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
        bound += -0.5 * D * log_det_B
        bound += -0.5 * tf.reduce_sum(tf.square(Y)) / sigma2
        bound += 0.5 * tf.reduce_sum(tf.square(c))
        bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 - tf.reduce_sum(tf.matrix_diag_part(AAT)))
        # bound -= KL  # don't need this term

        return bound
def elbo(self) -> tf.Tensor:
    """
    Construct a tensorflow function to compute the bound on the marginal likelihood.
    """

    # Define a set of vectorized helper functions for use with `tf.vectorized_map`.

    # take the outer product of a pair of rows
    @tf.function
    def row_outer_product(args):
        a, b = args
        a = tf.expand_dims(a, -1)
        b = tf.expand_dims(b, -1)
        return a @ tf.transpose(b)

    # repeat matrix A N times on a newly created first axis,
    # so the new shape is [N, A.shape]
    @tf.function
    def repeat_N(A):
        return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

    @tf.function
    def triang_solve(args):
        L, rhs = args
        return tf.linalg.triangular_solve(L, rhs)

    @tf.function
    def triang_solve_transpose(args):
        L, rhs = args
        return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

    @tf.function
    def matmul_vectorized(args):
        A, B = args
        return tf.matmul(A, B)

    # [N, D, M, M] --> [N]
    # each term is sum_{d=1}^D Tr[M, M]
    # arg: [D, M, M], needs to be squared
    @tf.function
    def sum_d_trace(arg):
        trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), arg)
        return tf.reduce_sum(trace_D)

    # trace of a matrix
    @tf.function
    def trace_tf(A):
        return tf.reduce_sum(tf.linalg.diag_part(A))

    Y = self.data

    qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
    psi0s = expectation(qXs, self.kernel_s)
    psi1s = expectation(qXs, (self.kernel_s, self.Zs))
    psi2s = expectation(qXs, (self.kernel_s, self.Zs), (self.kernel_s, self.Zs))
    cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
    Ls = tf.linalg.cholesky(cov_uu_s)
    Ls = repeat_N(Ls)  # [N, M, M]

    # Loop over k; for each k use kernel_K[k] and qXp to compute psi0k, psi1k, psi2k,
    # then store the psi statistics for all k together:
    #   psi0[:, k] = psi0k, psi1[:, :, k] = psi1k, psi2[:, :, :, k] = psi2k
    # psi0 is [N, K], so psi0[n, k] gives a scalar
    # psi1 is [N, M, K], so psi1[n, :, k] gives an M-vector
    # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives an [M, M] matrix
    qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)
    psi0k = []
    psi1k = []
    psi2k = []
    psi2ks = []
    psi2sk = []
    for k, kernel_k in enumerate(self.kernel_K):
        psi0 = expectation(qXp, kernel_k)
        psi1 = expectation(qXp, (kernel_k, self.Zp))
        psi2 = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
        psi0k.append(psi0)
        psi1k.append(psi1)
        psi2k.append(psi2)
        # add the cross-covariance terms, which require computation separately for each n
        psi2sk.append(tf.vectorized_map(row_outer_product, (psi1s, psi1)))
        # psi2ks.append(tf.vectorized_map(row_outer_product, (psi1, psi1s)))
    psi0k = tf.stack(psi0k, axis=-1)
    psi1k = tf.stack(psi1k, axis=-1)
    psi2k = tf.stack(psi2k, axis=-1)
    psi2sk = tf.stack(psi2sk, axis=-1)
    # psi2ks = tf.stack(psi2ks, axis=-1)

    # make K cov_uu_k matrices from Zp and kernel_k, take their choleskys,
    # and repeat N times for later use; Lk is [N, M, M, K] (these are the Kuu matrices)
    Lk = []
    for k, kernel_k in enumerate(self.kernel_K):
        cov_uu_k = covariances.Kuu(self.Zp, kernel_k, jitter=default_jitter())
        Lk.append(tf.linalg.cholesky(cov_uu_k))
    Lk = tf.stack(Lk, axis=-1)
    Lk = repeat_N(Lk)

    sigma2 = self.likelihood.variance
    jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())

    tmp = tf.vectorized_map(triang_solve, (Ls, psi2s))
    As = tf.vectorized_map(triang_solve_transpose, (Ls, tmp))  # \inv{Kuu^s} * Psi2s: [N, M, M]
    LBs = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2s)  # [N, M, M]
    tmp1 = tf.vectorized_map(triang_solve, (Ls, LBs))  # [N, M, M]
    Cs = tf.vectorized_map(triang_solve_transpose, (Ls, tmp1))
    # Cs = sqrt(\inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, M, M]
    Ds = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), Cs))
    # Ds = sqrt(Ms^T * \inv{Kuu^s} * Psi2s * \inv{Kuu^s} * Ms): [N, D, M]
    Fs = tf.vectorized_map(matmul_vectorized,
                           (repeat_N(tf.transpose(self.q_sqrt_s, perm=[0, 2, 1])), Cs))
    # Fs = sqrt(Ss * \inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, D, M, M]
    tmp2 = tf.vectorized_map(triang_solve, (Ls, repeat_N(self.q_mu_s)))
    Es = tf.vectorized_map(triang_solve_transpose, (Ls, tmp2))  # \inv{Kuu^s} * Ms: [N, M, D]
    tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1s))  # Y^T * Psi1s: [N, D, M]
    Gs = tf.vectorized_map(matmul_vectorized, (tmp3, Es))  # Y^T * Psi1s * \inv{Kuu^s} * Ms: [N, D, D]

    Fq = []
    Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
    for k in range(self.K):
        tmp = tf.vectorized_map(triang_solve, (Lk[..., k], psi2k[..., k]))  # [N, M, M]
        Ak = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp))
        # Ak = \inv{Kuu^k} * Psi2k: [N, M, M]
        LBk = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2k[..., k])  # [N, M, M]
        tmp1k = tf.vectorized_map(triang_solve, (Lk[..., k], LBk))  # [N, M, M]
        Ck = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp1k))
        # Ck = sqrt(\inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, M, M]
        Dk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), Ck))
        # Dk = sqrt(Mk^T * \inv{Kuu^k} * Psi2k * \inv{Kuu^k} * Mk): [N, D, M]
        # q_sqrt is already the cholesky
        Fk = tf.vectorized_map(matmul_vectorized,
                               (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), Ck))
        # Fk = sqrt(Sk * \inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, D, M, M]
        tmp2 = tf.vectorized_map(triang_solve, (Lk[..., k], repeat_N(self.q_mu[k])))
        Ek = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp2))  # \inv{Kuu^k} * Mk: [N, M, D]
        tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1k[..., k]))  # Y^T * Psi1k: [N, D, M]
        Gk = tf.vectorized_map(matmul_vectorized, (tmp3, Ek))  # Y^T * Psi1k * \inv{Kuu^k} * Mk: [N, D, D]

        # compute the cross terms
        tmp1sk = tf.vectorized_map(triang_solve, (Ls, psi2sk[..., k]))
        tmp2sk = tf.vectorized_map(triang_solve_transpose, (Ls, tmp1sk))
        # tmp2sk = \inv{Kuu^s} * Psi2sk: [N, M, M]
        tmp3sk = tf.vectorized_map(matmul_vectorized, (tmp2sk, Ek))
        # tmp3sk = \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, M, D]
        Dsk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), tmp3sk))
        # Dsk = Ms^T * \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, D, D]

        # compute the lower bound
        # each term added here is a length-N vector, each entry representing
        # \sum_{d=1}^D F_{dnk} for a particular n, k
        Fnk = -0.5 * Yn2 / sigma2
        Fnk += tf.vectorized_map(trace_tf, Gs + Gk) / sigma2
        Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), Ds) / sigma2
        Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), Dk) / sigma2
        # the sum of the traces of the two cross terms is twice the trace of one,
        # since they are transposes of one another
        Fnk += -tf.vectorized_map(trace_tf, Dsk) / sigma2
        Fnk += 0.5 * self.D * tf.vectorized_map(trace_tf, As + Ak) / sigma2
        Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fs) / sigma2
        Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fk) / sigma2

        Fq.append(Fnk)

    Fq = tf.stack(Fq, axis=-1)  # [N, K]
    # psi0 is already [N, K]
    Fq += -0.5 * self.D * (tf.repeat(tf.expand_dims(psi0s, -1), self.K, axis=1) + psi0k) / sigma2
    Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)

    # weight each entry by the mixture responsibility, then sum over N, K
    bound = tf.reduce_sum(Fq * self.pi)

    # compute KL terms
    KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean, self.Xp_prior_var)
    KL_c = self.kl_categorical(self.pi, self.pi_prior)
    KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean, self.Xs_prior_var)

    prior_Kuu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
    KL_us = kullback_leiblers.gauss_kl(q_mu=self.q_mu_s, q_sqrt=self.q_sqrt_s, K=prior_Kuu_s)
    KL_uk = 0
    for k in range(self.K):
        prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k], jitter=default_jitter())
        KL_uk += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k], q_sqrt=self.q_sqrt[k], K=prior_Kuu_k)

    bound += -KL_s - KL_p - KL_us - KL_uk - KL_c

    return bound
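The helpers `kl_mvn` and `kl_categorical` belong to the model and are not shown in this section. For the categorical term, a sketch of a definition consistent with how it is called above (hypothetical; the actual helper may differ, e.g. in how it broadcasts the prior):

```python
import tensorflow as tf

def kl_categorical(pi, pi_prior):
    """KL[q(c) || p(c)] summed over data points.

    pi: responsibilities, shape [N, K]; pi_prior: prior probabilities, shape [K]
    (or anything broadcastable against pi). Assumes strictly positive entries.
    """
    return tf.reduce_sum(pi * (tf.math.log(pi) - tf.math.log(pi_prior)))
```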
def elbo(self) -> tf.Tensor:
    """
    Construct a tensorflow function to compute the bound on the marginal likelihood.
    """

    # Define a set of vectorized helper functions for use with `tf.vectorized_map`.

    # take the outer product of a pair of rows
    @tf.function
    def row_outer_product(args):
        a, b = args
        a = tf.expand_dims(a, -1)
        b = tf.expand_dims(b, -1)
        return a @ tf.transpose(b)

    # repeat matrix A N times on a newly created first axis,
    # so the new shape is [N, A.shape]
    @tf.function
    def repeat_N(A):
        return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

    @tf.function
    def triang_solve(args):
        L, rhs = args
        return tf.linalg.triangular_solve(L, rhs)

    @tf.function
    def triang_solve_transpose(args):
        L, rhs = args
        return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

    @tf.function
    def matmul_vectorized(args):
        A, B = args
        return tf.matmul(A, B)

    # [N, D, M, M] --> [N]
    # each term is sum_{d=1}^D Tr[M, M]
    # arg: [D, M, M], needs to be squared
    @tf.function
    def sum_d_trace(arg):
        trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), arg)
        return tf.reduce_sum(trace_D)

    # trace of a matrix
    @tf.function
    def trace_tf(A):
        return tf.reduce_sum(tf.linalg.diag_part(A))

    Y = self.data

    # Specify qXp, the variational distribution q(X): each x_n is independent with
    # N(x_n | \mu_n, S_n), where \mu_n \in R^q is given by each row of `Xp_mean`
    # and S_n \in R^{q x q} is diagonal, so equivalently given by each row of `Xp_var`.
    qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)

    # If the space is split, also specify qXs and compute the psi statistics for the
    # shared space (keeping their original shapes), using qXs and kernel_s:
    #   psi0s is an N-vector
    #   psi1s is [N, M]
    #   psi2s is [N, M, M]
    # Also compute the covariance matrix Kuu for the shared space.
    if self.split_space:
        qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
        psi0s = expectation(qXs, self.kernel_s)
        psi1s = expectation(qXs, (self.kernel_s, self.Zs))
        psi2s = expectation(qXs, (self.kernel_s, self.Zs), (self.kernel_s, self.Zs))
        cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())

    # Loop over k; for each k use kernel_K[k] and qXp to compute psi0k, psi1k, psi2k,
    # then store the psi statistics for all k together.
    # For each k: if there is no shared space, psi0[:, k] = psi0k, psi1[:, :, k] = psi1k,
    # psi2[:, :, :, k] = psi2k.
    # If there is a shared space, psi0[:, k] = psi0s + psi0k, psi1[:, :, k] = psi1s + psi1k,
    # psi2[:, :, :, k] = psi2s + psi2k plus the cross terms added below:
    # for each n, psi2[n, :, :, k] += psi1s[n, :]^T psi1k[n, :] + psi1k[n, :]^T psi1s[n, :]
    # (both are [M, M]).
    # psi0 is [N, K], so psi0[n, k] gives a scalar
    # psi1 is [N, M, K], so psi1[n, :, k] gives an M-vector
    # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives an [M, M] matrix
    psi0 = []
    psi1 = []
    psi2 = []
    for k, kernel_k in enumerate(self.kernel_K):
        psi0k = expectation(qXp, kernel_k)
        psi1k = expectation(qXp, (kernel_k, self.Zp))
        psi2k = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
        if self.split_space:
            psi0.append(psi0s + psi0k)
            psi1.append(psi1s + psi1k)
            # add the cross-covariance terms, which require computation separately for each n
            sxk = tf.vectorized_map(row_outer_product, (psi1s, psi1k))
            kxs = tf.vectorized_map(row_outer_product, (psi1k, psi1s))
            psi2.append(psi2s + psi2k + sxk + kxs)
        else:
            psi0.append(psi0k)
            psi1.append(psi1k)
            psi2.append(psi2k)
    psi0 = tf.stack(psi0, axis=-1)
    psi1 = tf.stack(psi1, axis=-1)
    psi2 = tf.stack(psi2, axis=-1)

    # make K cov_uu_k matrices from Zp and kernel_k, take their choleskys,
    # and repeat N times for later use; L is [N, M, M, K] (these are the Kuu matrices)
    L = []
    for k, kernel_k in enumerate(self.kernel_K):
        cov_uu_k = covariances.Kuu(self.Zp, kernel_k, jitter=default_jitter())
        if self.split_space:
            L.append(tf.linalg.cholesky(cov_uu_s + cov_uu_k))
        else:
            L.append(tf.linalg.cholesky(cov_uu_k))
    L = tf.stack(L, axis=-1)
    L = repeat_N(L)

    sigma2 = self.likelihood.variance
    # self.pred_Y = []

    # Use `tf.vectorized_map` to avoid writing a loop over N, but it requires every matrix
    # to have N on axis 0, so we need to repeat certain matrices that are the same for all N
    # (e.g. L). Note we can use `tf.vectorized_map` because the computations decompose over n,
    # i.e. they can be computed in any order over n.
    Fq = []
    Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
    for k in range(self.K):
        # compute intermediate matrices for easier computation involving \inv{Kuu}
        # A is the same as AAT in gplvm; transposing L is the correct thing to do,
        # but the two end up being the same since we only care about the trace
        tmp = tf.vectorized_map(triang_solve, (L[..., k], psi2[..., k]))  # [N, M, M]
        A = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp))  # \inv{Kuu} * Psi2: [N, M, M]

        # pos_def = tf.vectorized_map(lambda x: is_pos_def(x), psi2[..., k])
        # print(np.all(pos_def))
        # psi2 is not produced with `covariances.Kuu`, but it should still be PD;
        # we should add jitter before taking the cholesky
        # jitter_mtx = default_jitter() * tf.eye(self.M, dtype=default_float())
        jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())
        LB = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2[..., k])  # [N, M, M]
        tmp1 = tf.vectorized_map(triang_solve, (L[..., k], LB))  # [N, M, M]
        C = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp1))
        # C = sqrt(\inv{Kuu} * Psi2 * \inv{Kuu}): [N, M, M]
        D = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), C))
        # D = sqrt(M^T * \inv{Kuu} * Psi2 * \inv{Kuu} * M): [N, D, M]
        tmp2 = tf.vectorized_map(triang_solve, (L[..., k], repeat_N(self.q_mu[k])))
        E = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp2))  # \inv{Kuu} * M: [N, M, D]
        # q_sqrt is already the cholesky
        F = tf.vectorized_map(matmul_vectorized,
                              (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), C))
        # F = sqrt(S * \inv{Kuu} * Psi2 * \inv{Kuu}): [N, D, M, M]
        tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1[..., k]))  # Y^T * Psi1: [N, D, M]
        G = tf.vectorized_map(matmul_vectorized, (tmp3, E))  # Y^T * Psi1 * \inv{Kuu} * M: [N, D, D]

        # for debugging
        # self.pred_Y.append(tf.reshape(tf.vectorized_map(matmul_vectorized, (tf.expand_dims(psi1[..., k], 1), E)), (self.N, self.D)))  # Psi1 * \inv{Kuu} * M: [N, D]

        # compute the lower bound
        # each term added here is a length-N vector, each entry representing
        # \sum_{d=1}^D F_{dnk} for a particular n, k
        Fnk = -0.5 * Yn2 / sigma2
        Fnk += tf.vectorized_map(lambda x: trace_tf(x), G) / sigma2
        Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), D) / sigma2
        Fnk += 0.5 * self.D * tf.vectorized_map(lambda x: trace_tf(x), A) / sigma2
        Fnk += -0.5 * tf.vectorized_map(lambda x: sum_d_trace(x), F) / sigma2

        Fq.append(Fnk)

    Fq = tf.stack(Fq, axis=-1)  # [N, K]
    # psi0 is already [N, K]
    Fq += -0.5 * self.D * psi0 / sigma2
    Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)

    # for debugging
    # self.Fq = Fq
    # self.pred_Y = tf.stack(self.pred_Y, axis=-1)  # [N, D, K]

    # weight each entry by the mixture responsibility, then sum over N, K
    bound = tf.reduce_sum(Fq * self.pi)

    # compute KL terms
    KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean, self.Xp_prior_var)
    KL_c = self.kl_categorical(self.pi, self.pi_prior)
    KL_u = 0
    prior_Kuu = np.zeros((self.M, self.M))
    if self.split_space:
        KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean, self.Xs_prior_var)
        bound += -KL_s
        prior_Kuu += covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
    for k in range(self.K):
        prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k], jitter=default_jitter())
        KL_u += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k], q_sqrt=self.q_sqrt[k],
                                           K=prior_Kuu + prior_Kuu_k)
    bound += -KL_p - KL_u - KL_c

    return bound
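Similarly, a sketch of a `kl_mvn` consistent with the diagonal q(X) used above, i.e. KL between factorized Gaussians given as [N, Q] means and variances (hypothetical; the model's own helper is not shown in this section):

```python
import tensorflow as tf

def kl_mvn(q_mean, q_var, p_mean, p_var):
    """KL[q || p] for factorized Gaussians; all arguments are [N, Q] (or broadcastable)."""
    return 0.5 * tf.reduce_sum(
        tf.math.log(p_var) - tf.math.log(q_var)
        + (q_var + tf.square(q_mean - p_mean)) / p_var
        - 1.0
    )
```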
def elbo(self) -> tf.Tensor:
    """
    Construct a tensorflow function to compute the bound on the marginal likelihood.
    """
    Y_data = self.data

    X_data_mean, X_data_var = self.encoder(Y_data)
    pX = DiagonalGaussian(X_data_mean, X_data_var)

    num_inducing = self.inducing_variable.num_inducing
    psi0 = tf.reduce_sum(expectation(pX, self.kernel))
    psi1 = expectation(pX, (self.kernel, self.inducing_variable))
    psi2 = tf.reduce_sum(
        expectation(pX, (self.kernel, self.inducing_variable),
                    (self.kernel, self.inducing_variable)),
        axis=0)
    cov_uu = covariances.Kuu(self.inducing_variable, self.kernel, jitter=default_jitter())
    L = tf.linalg.cholesky(cov_uu)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    # Compute intermediate matrices
    A = tf.linalg.triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    tmp = tf.linalg.triangular_solve(L, psi2, lower=True)
    AAT = tf.linalg.triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
    B = AAT + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    log_det_B = 2.0 * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))
    c = tf.linalg.triangular_solve(LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma

    # KL[q(x) || p(x)]
    dX_data_var = (X_data_var
                   if X_data_var.shape.ndims == 2
                   else tf.linalg.diag_part(X_data_var))
    NQ = to_default_float(tf.size(X_data_mean))
    D = to_default_float(tf.shape(Y_data)[1])
    KL = -0.5 * tf.reduce_sum(tf.math.log(dX_data_var))
    KL += 0.5 * tf.reduce_sum(tf.math.log(self.X_prior_var))
    KL -= 0.5 * NQ
    KL += 0.5 * tf.reduce_sum(
        (tf.square(X_data_mean - self.X_prior_mean) + dX_data_var) / self.X_prior_var)
    self.loss_placeholder["KL_x"].append(KL.numpy())

    # compute log marginal bound
    ND = to_default_float(tf.size(Y_data))
    bound = -0.5 * ND * tf.math.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_B
    bound += -0.5 * tf.reduce_sum(tf.square(Y_data)) / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 - tf.reduce_sum(tf.linalg.diag_part(AAT)))
    bound -= KL
    self.loss_placeholder["ELBO"].append(bound.numpy())

    return bound
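The `loss_placeholder` bookkeeping calls `.numpy()`, so this `elbo` is evidently evaluated eagerly. A minimal training-loop sketch under that assumption (`model` is an assumed instance of the class this method belongs to; hyperparameters are illustrative):

```python
import tensorflow as tf

optimizer = tf.optimizers.Adam(learning_rate=0.01)

for step in range(2000):
    with tf.GradientTape() as tape:
        loss = -model.elbo()                       # maximize the bound
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    if step % 100 == 0:
        print(step, float(loss))
```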
class Data:
    rng = np.random.RandomState(1)
    num_data = 5
    num_ind = 4
    D_in = 2
    D_out = 2

    Xmu = rng.randn(num_data, D_in)
    L = gen_L(rng, num_data, D_in, D_in)
    Xvar = np.array([l @ l.T for l in L])
    Z = rng.randn(num_ind, D_in)

    # distributions don't need to be compiled (no Parameter objects)
    # but the members should be Tensors created in the same graph
    graph = tf.Graph()
    with test_util.session_context(graph) as sess:
        gauss = Gaussian(tf.constant(Xmu), tf.constant(Xvar))
        dirac = Gaussian(tf.constant(Xmu), tf.constant(np.zeros((num_data, D_in, D_in))))
        gauss_diag = DiagonalGaussian(tf.constant(Xmu), tf.constant(rng.rand(num_data, D_in)))
        dirac_diag = DiagonalGaussian(tf.constant(Xmu), tf.constant(np.zeros((num_data, D_in))))
        dirac_markov_gauss = MarkovGaussian(
            tf.constant(Xmu), tf.constant(np.zeros((2, num_data, D_in, D_in))))

        # create the covariance for the pairwise markov-gaussian
        dummy_gen = lambda rng, n, *shape: np.array([rng.randn(*shape) for _ in range(n)])
        L_mg = dummy_gen(rng, num_data, D_in, 2 * D_in)  # N+1 x D x 2D
        LL = np.concatenate((L_mg[:-1], L_mg[1:]), 1)  # N x 2D x 2D
        Xcov = LL @ np.transpose(LL, (0, 2, 1))
        Xc = np.concatenate((Xcov[:, :D_in, :D_in], Xcov[-1:, D_in:, D_in:]), 0)  # N+1 x D x D
        Xcross = np.concatenate((Xcov[:, :D_in, D_in:], np.zeros((1, D_in, D_in))), 0)  # N+1 x D x D
        Xcc = np.stack([Xc, Xcross])  # 2 x N+1 x D x D
        markov_gauss = MarkovGaussian(Xmu, Xcc)

    with gpflow.decors.defer_build():
        # features
        ip = features.InducingPoints(Z)

        # kernels
        rbf_prod_seperate_dims = kernels.Product([
            kernels.RBF(1, variance=rng.rand(), lengthscales=rng.rand(), active_dims=[0]),
            kernels.RBF(1, variance=rng.rand(), lengthscales=rng.rand(), active_dims=[1])
        ])
        rbf_lin_sum = kernels.Sum([
            kernels.RBF(D_in, variance=rng.rand(), lengthscales=rng.rand()),
            kernels.RBF(D_in, variance=rng.rand(), lengthscales=rng.rand()),
            kernels.Linear(D_in, variance=rng.rand())
        ])
        rbf = kernels.RBF(D_in, variance=rng.rand(), lengthscales=rng.rand())
        lin_kern = kernels.Linear(D_in, variance=rng.rand())

        # mean functions
        lin = mean_functions.Linear(rng.rand(D_in, D_out), rng.rand(D_out))
        iden = mean_functions.Identity(D_in)  # Note: Identity can only be used if D_in == D_out
        zero = mean_functions.Zero(output_dim=D_out)
        const = mean_functions.Constant(rng.rand(D_out))