def _elbo_data_term(self, events, Kuu=None):
    mean, var = self.predict_f(events, full_cov=False, Kuu=Kuu)
    expect_log_fn_sqr = integrate_log_fn_sqr(mean, var)
    if self.num_events is None:
        scale = 1.0
    else:
        minibatch_size = tf.shape(events)[0]
        scale = to_default_float(self.num_events) / to_default_float(minibatch_size)
    return scale * tf.reduce_sum(expect_log_fn_sqr)
def build_model(data):
    kernel = gpflow.kernels.Matern52(lengthscales=0.3)
    meanf = gpflow.mean_functions.Linear(1.0, 0.0)
    model = gpflow.models.GPR(data, kernel, meanf, noise_variance=0.01)
    for p in model.parameters:
        p.prior = Gamma(to_default_float(1.0), to_default_float(1.0))
    return model
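# --- Illustrative usage sketch (not part of the original source). Assumes build_model and its
# dependencies (gpflow, Gamma, to_default_float) are in scope; the synthetic data is a placeholder.
import numpy as np
import gpflow

X = np.linspace(0.0, 1.0, 50)[:, None]
Y = np.sin(6.0 * X) + 0.1 * np.random.randn(50, 1)
model = build_model((X, Y))
# With priors set on all parameters, minimising training_loss gives a MAP estimate.
gpflow.optimizers.Scipy().minimize(model.training_loss, model.trainable_variables)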
def optimised_background_model(X, Y):
    k = gpflow.kernels.SquaredExponential()
    m = gpflow.models.GPR(data=(X, Y), kernel=k, mean_function=None)
    m.kernel.lengthscales = gpflow.Parameter(
        to_default_float(7.1),
        transform=tfp.bijectors.Softplus(low=to_default_float(7.0)),
    )
    opt = gpflow.optimizers.Scipy()
    opt_logs = opt.minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=100))
    return m
def __init__(self, with_transform):
    super().__init__()
    prior = tfp.distributions.Normal(to_default_float(1.0), to_default_float(1.0))
    scale = np.exp(self.log_scale)
    if with_transform:
        transform = tfp.bijectors.Shift(to_default_float(0.0))(
            tfp.bijectors.Scale(to_default_float(scale))
        )
    else:
        transform = None
    self.theta = gpflow.Parameter(self.value, prior=prior, transform=transform)
def Kuu_matern32_fourierfeatures1d(inducing_variable, kernel, jitter=None):
    a, b, ms = (lambda u: (u.a, u.b, u.ms))(inducing_variable)
    omegas = 2.0 * np.pi * ms / (b - a)

    # Cosine block: eq. (114)
    lamb = np.sqrt(3.0) / kernel.lengthscales
    four_or_eight = to_default_float(tf.where(omegas == 0, 4.0, 8.0))
    d_cos = (
        (b - a)
        * tf.square(tf.square(lamb) + tf.square(omegas))
        / tf.pow(lamb, 3)
        / kernel.variance
        / four_or_eight
    )
    v_cos = tf.ones_like(d_cos) / tf.sqrt(kernel.variance)
    cosine_block = LowRank(Diag(d_cos, is_positive_definite=True), v_cos[:, None])

    # Sine block: eq. (115)
    omegas = omegas[tf.not_equal(omegas, 0)]  # don't compute omega=0
    d_sin = (
        (b - a)
        * tf.square(tf.square(lamb) + tf.square(omegas))
        / tf.pow(lamb, 3)
        / kernel.variance
        / 8.0
    )
    v_sin = omegas / lamb / tf.sqrt(kernel.variance)
    sine_block = LowRank(Diag(d_sin, is_positive_definite=True), v_sin[:, None])

    return BlockDiag([cosine_block, sine_block])  # eq. (116)
def _elbo_data_term(self, events, Kuu=None):
    # E_q[log f_n^2]: expectation of log f_n^2 under the variational posterior
    mean, var = self.predict_f(events, full_cov=False, Kuu=Kuu)
    expect_log_fn_sqr = integrate_log_fn_sqr(mean, var)
    if self.num_events is None:
        scale = 1.0
    else:
        minibatch_size = tf.shape(events)[0]
        scale = to_default_float(self.num_events) / to_default_float(minibatch_size)
    # reduce_sum sums over all elements of the tensor by default
    return scale * tf.reduce_sum(expect_log_fn_sqr)
def test_sgpr_qu():
    rng = Datum().rng
    X = to_default_float(rng.randn(100, 2))
    Z = to_default_float(rng.randn(20, 2))
    Y = to_default_float(np.sin(X @ np.array([[-1.4], [0.5]])) + 0.5 * rng.randn(len(X), 1))
    model = gpflow.models.SGPR(
        (X, Y), kernel=gpflow.kernels.SquaredExponential(), inducing_variable=Z
    )

    gpflow.optimizers.Scipy().minimize(model.training_loss, variables=model.trainable_variables)

    qu_mean, qu_cov = model.compute_qu()
    f_at_Z_mean, f_at_Z_cov = model.predict_f(model.inducing_variable.Z, full_cov=True)

    np.testing.assert_allclose(qu_mean, f_at_Z_mean, rtol=1e-5, atol=1e-5)
    np.testing.assert_allclose(tf.reshape(qu_cov, (1, 20, 20)), f_at_Z_cov, rtol=1e-5, atol=1e-5)
def upper_bound(self) -> tf.Tensor:
    """
    Upper bound for the sparse GP regression marginal likelihood. Note that
    the same inducing points are used for calculating the upper bound as are
    used for computing the likelihood approximation. This may not lead to the
    best upper bound. The upper bound can be tightened by optimising Z, just
    like the lower bound. This is especially important in FITC, as FITC is
    known to produce poor inducing point locations. An optimisable upper bound
    can be found in https://github.com/markvdw/gp_upper.

    The key reference is ::

      @misc{titsias_2014,
        title={Variational Inference for Gaussian and Determinantal Point Processes},
        url={http://www2.aueb.gr/users/mtitsias/papers/titsiasNipsVar14.pdf},
        publisher={Workshop on Advances in Variational Inference (NIPS 2014)},
        author={Titsias, Michalis K.},
        year={2014},
        month={Dec}
      }

    The key quantity, the trace term, can be computed via

    >>> _, v = conditionals.conditional(X, model.inducing_variable.Z, model.kernel,
    ...                                 np.zeros((len(model.inducing_variable), 1)))

    which computes each individual element of the trace term.
    """
    X_data, Y_data = self.data
    num_data = to_default_float(tf.shape(Y_data)[0])

    Kdiag = self.kernel(X_data, full_cov=False)
    kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
    kuf = Kuf(self.inducing_variable, self.kernel, X_data)

    I = tf.eye(tf.shape(kuu)[0], dtype=default_float())

    L = tf.linalg.cholesky(kuu)
    A = tf.linalg.triangular_solve(L, kuf, lower=True)
    AAT = tf.linalg.matmul(A, A, transpose_b=True)
    B = I + AAT / self.likelihood.variance
    LB = tf.linalg.cholesky(B)

    # Using the Trace bound, from Titsias' presentation
    c = tf.maximum(tf.reduce_sum(Kdiag) - tf.reduce_sum(tf.square(A)), 0)

    # Alternative bound on max eigenval:
    corrected_noise = self.likelihood.variance + c

    const = -0.5 * num_data * tf.math.log(2 * np.pi * self.likelihood.variance)
    logdet = -tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))

    LC = tf.linalg.cholesky(I + AAT / corrected_noise)
    v = tf.linalg.triangular_solve(LC, tf.linalg.matmul(A, Y_data) / corrected_noise, lower=True)
    quad = -0.5 * tf.reduce_sum(tf.square(Y_data)) / corrected_noise + 0.5 * tf.reduce_sum(
        tf.square(v)
    )

    return const + logdet + quad
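# --- Hedged sanity-check sketch (not from the original source). GPflow's stock SGPR exposes an
# analogous elbo()/upper_bound() pair; the two should bracket the true log marginal likelihood.
# Data shapes mirror test_sgpr_qu above and are purely illustrative.
import numpy as np
import gpflow

rng = np.random.RandomState(0)
X = rng.randn(100, 2)
Y = np.sin(X @ np.array([[-1.4], [0.5]])) + 0.5 * rng.randn(100, 1)
Z = rng.randn(20, 2)
model = gpflow.models.SGPR((X, Y), kernel=gpflow.kernels.SquaredExponential(), inducing_variable=Z)
gpflow.optimizers.Scipy().minimize(model.training_loss, model.trainable_variables)
# lower bound <= log marginal likelihood <= upper bound (up to numerical error)
assert model.elbo() <= model.upper_bound()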
def _compute_robust_maximum_log_likelihood_objective(self) -> tf.Tensor: r""" Computes the log marginal likelihood, with some slack caused by the jitter. Adding the jitter ensures numerical stability. .. math:: \log p(Y | \theta). """ X, Y = self.data num_data = X.shape[0] output_dim = tf.shape(Y)[1] K = self.kernel(X) k_diag = tf.linalg.diag_part(K) noiseK_L, L = tf.cond( self.likelihood.variance > self.jitter_variance, lambda: ( tf.linalg.cholesky(tf.linalg.set_diag(K, k_diag + self.likelihood.variance)), tf.linalg.cholesky(tf.linalg.set_diag(K, k_diag + self.jitter_variance)), ), lambda: (tf.linalg.cholesky(tf.linalg.set_diag(K, k_diag + self.jitter_variance)),) * 2, ) err = Y - self.mean_function(X) sigma = tf.sqrt(self.likelihood.variance) # Compute intermediate matrices A = tf.linalg.triangular_solve(L, K, lower=True) / sigma AAT = tf.linalg.matmul(A, A, transpose_b=True) B = tf.linalg.set_diag(AAT, tf.linalg.diag_part(AAT) + 1) # B = AAT + tf.eye(num_data, dtype=default_float()) # B = AAT + tf.eye(num_data, dtype=default_float()) LB = tf.linalg.cholesky(B) Aerr = tf.linalg.matmul(A, err) c = tf.linalg.triangular_solve(LB, Aerr, lower=True) / sigma # compute log marginal bound bound = -0.5 * to_default_float(num_data) * to_default_float(output_dim) * np.log(2 * np.pi) bound -= to_default_float(output_dim) * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(noiseK_L))) bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance bound += 0.5 * tf.reduce_sum(tf.square(c)) return bound
def gauss_kl_vff(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from q(x) = N(q_mu, q_sqrt^2) to p(x) = N(0, K).

    q_mu is a vector [N, 1] that contains the mean.
    q_sqrt is a matrix that is the lower triangular square-root matrix of the covariance of q.
    K is a positive definite matrix: the covariance of p.
    NOTE: K is a LinearOperator that provides efficient methods
    for solve(), log_abs_determinant(), and trace().
    """
    # KL(N₀ || N₁) = ½ [tr(Σ₁⁻¹ Σ₀) + (μ₁ - μ₀)ᵀ Σ₁⁻¹ (μ₁ - μ₀) - k + ln(det(Σ₁)/det(Σ₀))]
    # N₀ = q; μ₀ = q_mu, Σ₀ = q_sqrt q_sqrtᵀ
    # N₁ = p; μ₁ = 0, Σ₁ = K
    # KL(q || p) =
    #     ½ [tr(K⁻¹ q_sqrt q_sqrtᵀ) + q_muᵀ K⁻¹ q_mu - k + logdet(K) - logdet(q_sqrt q_sqrtᵀ)]
    # k = number of dimensions, if q_sqrt is m x m this is m²
    Kinv_q_mu = K.solve(q_mu)

    mahalanobis_term = tf.squeeze(tf.matmul(q_mu, Kinv_q_mu, transpose_a=True))

    # GPflow: q_sqrt is num_latent_gps x N x N
    num_latent_gps = to_default_float(tf.shape(q_mu)[1])
    logdet_prior = num_latent_gps * K.log_abs_determinant()

    product_of_dimensions__int = tf.reduce_prod(tf.shape(q_sqrt)[:-1])  # dimensions are integers
    constant_term = to_default_float(product_of_dimensions__int)

    Lq = tf.linalg.band_part(q_sqrt, -1, 0)  # force lower triangle
    logdet_q = tf.reduce_sum(tf.math.log(tf.square(tf.linalg.diag_part(Lq))))

    # S = tf.matmul(q_sqrt, q_sqrt, transpose_b=True)
    # trace_term = tf.trace(K.solve(S))
    trace_term = tf.squeeze(
        tf.reduce_sum(Lq * K.solve(Lq), axis=[-1, -2])
    )  # O(N²) instead of O(N³)

    twoKL = trace_term + mahalanobis_term - constant_term + logdet_prior - logdet_q

    return 0.5 * twoKL
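# --- Hedged sanity-check sketch (not from the original source): compares gauss_kl_vff against
# tensorflow_probability's Gaussian KL for a small dense prior covariance. All names below
# (M, L_np, K_dense, ...) are illustrative and gauss_kl_vff is assumed to be in scope.
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

M = 4
rng = np.random.RandomState(0)
q_mu = tf.constant(rng.randn(M, 1))                    # [M, 1] variational mean
L_np = np.tril(rng.randn(M, M))
np.fill_diagonal(L_np, np.abs(np.diag(L_np)) + 0.5)    # well-conditioned lower triangle
q_sqrt = tf.constant(L_np)                             # [M, M] lower-triangular square root
A = rng.randn(M, M)
K_dense = A @ A.T + M * np.eye(M)                      # positive-definite prior covariance
K = tf.linalg.LinearOperatorFullMatrix(tf.constant(K_dense))

kl_ours = gauss_kl_vff(q_mu, q_sqrt, K)
q_dist = tfp.distributions.MultivariateNormalTriL(loc=tf.squeeze(q_mu, -1), scale_tril=q_sqrt)
p_dist = tfp.distributions.MultivariateNormalTriL(
    loc=tf.zeros(M, dtype=tf.float64), scale_tril=tf.linalg.cholesky(tf.constant(K_dense))
)
kl_ref = tfp.distributions.kl_divergence(q_dist, p_dist)
np.testing.assert_allclose(kl_ours.numpy(), kl_ref.numpy(), rtol=1e-6)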
def kl_div_x(self, X_data_mean, X_data_var) -> tf.Tensor:
    # KL[q(x) || p(x)]
    dX_data_var = (
        X_data_var
        if X_data_var.shape.ndims == 2
        else tf.linalg.diag_part(X_data_var)
    )
    nq = to_default_float(tf.size(X_data_mean))
    kl_div = -0.5 * tf.reduce_sum(tf.math.log(dX_data_var))
    kl_div -= 0.5 * nq
    kl_div += 0.5 * tf.reduce_sum(tf.square(X_data_mean) + dX_data_var)
    return kl_div
def _compute_robust_maximum_log_likelihood_objective(self) -> tf.Tensor: """ Construct a tensorflow function to compute the bound on the marginal likelihood. For a derivation of the terms in here, see the associated SGPR notebook. """ X_data, Y_data = self.data num_inducing = len(self.inducing_variable) num_data = to_default_float(tf.shape(Y_data)[0]) output_dim = to_default_float(tf.shape(Y_data)[1]) err = Y_data - self.mean_function(X_data) Kdiag = self.kernel(X_data, full_cov=False) kuf = Kuf(self.inducing_variable, self.kernel, X_data) kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance) L = tf.linalg.cholesky(kuu) sigma = tf.sqrt(self.likelihood.variance) # Compute intermediate matrices A = tf.linalg.triangular_solve(L, kuf, lower=True) / sigma AAT = tf.linalg.matmul(A, A, transpose_b=True) B = AAT + tf.eye(num_inducing, dtype=default_float()) LB = tf.linalg.cholesky(B) Aerr = tf.linalg.matmul(A, err) c = tf.linalg.triangular_solve(LB, Aerr, lower=True) / sigma trace_term = 0.5 * output_dim * tf.reduce_sum(Kdiag) / self.likelihood.variance trace_term -= 0.5 * output_dim * tf.reduce_sum(tf.linalg.diag_part(AAT)) # tr(Kff - Qff) should be positive, numerical issues can arise here assert trace_term > 0.0, f"Trace term negative, should be positive ({trace_term:.4e})." # compute log marginal bound bound = -0.5 * num_data * output_dim * np.log(2 * np.pi) bound += tf.negative(output_dim) * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB))) bound -= 0.5 * num_data * output_dim * tf.math.log(self.likelihood.variance) bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance bound += 0.5 * tf.reduce_sum(tf.square(c)) bound -= trace_term return bound
def detrend_cell(X, Y, detrend_lengthscale):
    k_trend = gpflow.kernels.SquaredExponential()
    m = gpflow.models.GPR(data=(X, Y), kernel=k_trend, mean_function=None)
    m.kernel.lengthscales = gpflow.Parameter(
        to_default_float(detrend_lengthscale + 0.1),
        transform=tfp.bijectors.Softplus(low=to_default_float(detrend_lengthscale)),
    )
    opt = gpflow.optimizers.Scipy()
    opt_logs = opt.minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=100))

    mean, var = m.predict_f(X)
    Y_detrended = Y - mean
    Y_detrended = Y_detrended - np.mean(Y_detrended)

    return k_trend, mean, var, Y_detrended
def create_models(self, data):
    self.models = []
    for i in range(self.num_outputs):
        kern = gpflow.kernels.SquaredExponential(
            lengthscales=tf.ones([data[0].shape[1]], dtype=gpflow.config.default_float())
        )
        # priors have to be included before the model gets compiled
        kern.lengthscales.prior = tfd.Gamma(to_default_float(1.1), to_default_float(1 / 10.0))
        kern.variance.prior = tfd.Gamma(to_default_float(1.5), to_default_float(1 / 2.0))
        self.models.append(gpflow.models.GPR((data[0], data[1][:, i : i + 1]), kernel=kern))
        self.models[-1].likelihood.prior = tfd.Gamma(to_default_float(1.2), to_default_float(1 / 0.05))
def _create_kernel(self):
    """Creates a kernel from a list of strings stored in _kernel_split."""
    k = None
    for i, prod_kern in enumerate(self.kernel_split):
        sub_k = None
        for j, kern in enumerate(prod_kern):
            new_k = getattr(gpflow.kernels, kern)(**self.kernel_params[i + j])
            if hasattr(new_k, 'lengthscales') and self.length_scale_prior:
                new_k.lengthscales.prior = tfp.distributions.InverseGamma(
                    to_default_float(1), to_default_float(1))
            if j == 0:
                sub_k = new_k
                if self.variance_prior:
                    new_k.variance.prior = tfp.distributions.Gamma(
                        to_default_float(1), to_default_float(1))
            else:
                # only the first kernel in each product keeps a free variance
                set_trainable(new_k.variance, False)
                sub_k *= new_k
        if i == 0:
            k = sub_k
        else:
            k += sub_k
    return k
def kl_mvn(self, X_mean, X_var, X_prior_mean, X_prior_var):
    dX_var = (
        X_var
        if X_var.shape.ndims == 2
        else tf.transpose(tf.linalg.diag_part(X_var))
    )
    NQ = to_default_float(tf.size(X_mean))
    # log of determinant of diagonal matrix = log of product of entries = sum of logs of entries
    KL = -0.5 * tf.reduce_sum(tf.math.log(dX_var))
    KL += 0.5 * tf.reduce_sum(tf.math.log(X_prior_var))
    KL -= 0.5 * NQ
    # KL is additive for independent distributions (sums over N); the trace sums over Q
    # (see https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence#Multivariate_normal_distributions)
    KL += 0.5 * tf.reduce_sum(
        (tf.square(X_mean - X_prior_mean) + dX_var) / X_prior_var
    )
    return KL
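# --- Illustrative check (not from the original source): for diagonal covariances the KL
# factorises over dimensions, so kl_mvn should agree with summed univariate KLs.
# Shapes are placeholders; `model` is a hypothetical object exposing the kl_mvn method above.
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

N, Q = 5, 2
X_mean = tf.constant(np.random.randn(N, Q))
X_var = tf.constant(np.random.rand(N, Q) + 0.1)        # per-dimension variances (2D case)
X_prior_mean = tf.zeros((N, Q), dtype=tf.float64)
X_prior_var = tf.ones((N, Q), dtype=tf.float64)
kl_ref = tf.reduce_sum(tfp.distributions.kl_divergence(
    tfp.distributions.Normal(X_mean, tf.sqrt(X_var)),
    tfp.distributions.Normal(X_prior_mean, tf.sqrt(X_prior_var))))
# np.testing.assert_allclose(model.kl_mvn(X_mean, X_var, X_prior_mean, X_prior_var).numpy(),
#                            kl_ref.numpy())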
def test_softmax_bernoulli_equivalence(num, dimF, dimY):
    dF = np.vstack(
        (np.random.randn(num - 3, dimF), np.array([[-3.0, 0.0], [3, 0.0], [0.0, 0.0]]))
    )
    dY = np.vstack((np.random.randn(num - 3, dimY), np.ones((3, dimY)))) > 0
    F = to_default_float(dF)
    Fvar = tf.exp(
        tf.stack([F[:, 1], -10.0 + tf.zeros(F.shape[0], dtype=F.dtype)], axis=1)
    )
    F = tf.stack([F[:, 0], tf.zeros(F.shape[0], dtype=F.dtype)], axis=1)
    Y = to_default_int(dY)
    Ylabel = 1 - Y

    softmax_likelihood = Softmax(dimF)
    bernoulli_likelihood = Bernoulli(invlink=tf.sigmoid)
    softmax_likelihood.num_monte_carlo_points = int(
        0.3e7
    )  # Minimum number of points to pass the test on CircleCI
    bernoulli_likelihood.num_gauss_hermite_points = 40

    assert_allclose(
        softmax_likelihood.conditional_mean(F)[:, :1],
        bernoulli_likelihood.conditional_mean(F[:, :1]),
    )
    assert_allclose(
        softmax_likelihood.conditional_variance(F)[:, :1],
        bernoulli_likelihood.conditional_variance(F[:, :1]),
    )
    assert_allclose(
        softmax_likelihood.log_prob(F, Ylabel),
        bernoulli_likelihood.log_prob(F[:, :1], Y.numpy()),
    )

    mean1, var1 = softmax_likelihood.predict_mean_and_var(F, Fvar)
    mean2, var2 = bernoulli_likelihood.predict_mean_and_var(F[:, :1], Fvar[:, :1])

    assert_allclose(mean1[:, 0, None], mean2, rtol=2e-3)
    assert_allclose(var1[:, 0, None], var2, rtol=2e-3)

    ls_ve = softmax_likelihood.variational_expectations(F, Fvar, Ylabel)
    lb_ve = bernoulli_likelihood.variational_expectations(F[:, :1], Fvar[:, :1], Y.numpy())
    assert_allclose(ls_ve, lb_ve, rtol=5e-3)
def Kuu_matern12_fourierfeatures1d(inducing_variable, kernel, jitter=None):
    a, b, ms = (lambda u: (u.a, u.b, u.ms))(inducing_variable)
    omegas = 2.0 * np.pi * ms / (b - a)

    # Cosine block:
    lamb = 1.0 / kernel.lengthscales
    two_or_four = to_default_float(tf.where(omegas == 0, 2.0, 4.0))
    d_cos = (
        (b - a) * (tf.square(lamb) + tf.square(omegas)) / lamb / kernel.variance / two_or_four
    )  # eq. (111)
    v_cos = tf.ones_like(d_cos) / tf.sqrt(kernel.variance)  # eq. (110)
    cosine_block = LowRank(Diag(d_cos), v_cos[:, None])

    # Sine block:
    omegas = omegas[tf.not_equal(omegas, 0)]  # the sine block does not include omega=0
    d_sin = (
        (b - a) * (tf.square(lamb) + tf.square(omegas)) / lamb / kernel.variance / 4.0
    )  # eq. (113)
    sine_block = Diag(d_sin)

    return BlockDiag([cosine_block, sine_block]).to_dense()
def elbo(self) -> tf.Tensor:
    """
    Construct a tensorflow function to compute the bound on the marginal likelihood.
    """
    Y_data = self.data

    X_data_mean, X_data_var = self.encoder(Y_data)

    pX = DiagonalGaussian(X_data_mean, X_data_var)

    num_inducing = self.inducing_variable.num_inducing
    psi0 = tf.reduce_sum(expectation(pX, self.kernel))
    psi1 = expectation(pX, (self.kernel, self.inducing_variable))
    psi2 = tf.reduce_sum(
        expectation(
            pX, (self.kernel, self.inducing_variable), (self.kernel, self.inducing_variable)
        ),
        axis=0,
    )
    cov_uu = covariances.Kuu(self.inducing_variable, self.kernel, jitter=default_jitter())
    L = tf.linalg.cholesky(cov_uu)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    # Compute intermediate matrices
    A = tf.linalg.triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    tmp = tf.linalg.triangular_solve(L, psi2, lower=True)
    AAT = tf.linalg.triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
    B = AAT + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    log_det_B = 2.0 * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))
    c = tf.linalg.triangular_solve(LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma

    # KL[q(x) || p(x)]
    dX_data_var = (
        X_data_var
        if X_data_var.shape.ndims == 2
        else tf.linalg.diag_part(X_data_var)
    )
    NQ = to_default_float(tf.size(X_data_mean))
    D = to_default_float(tf.shape(Y_data)[1])
    KL = -0.5 * tf.reduce_sum(tf.math.log(dX_data_var))
    KL += 0.5 * tf.reduce_sum(tf.math.log(self.X_prior_var))
    KL -= 0.5 * NQ
    KL += 0.5 * tf.reduce_sum(
        (tf.square(X_data_mean - self.X_prior_mean) + dX_data_var) / self.X_prior_var
    )
    self.loss_placeholder["KL_x"].append(KL.numpy())

    # compute log marginal bound
    ND = to_default_float(tf.size(Y_data))
    bound = -0.5 * ND * tf.math.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_B
    bound += -0.5 * tf.reduce_sum(tf.square(Y_data)) / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 - tf.reduce_sum(tf.linalg.diag_part(AAT)))
    bound -= KL
    self.loss_placeholder["ELBO"].append(bound.numpy())

    return bound
def map_fn(image, label):
    image = to_default_float(image) / 255.0
    label = to_default_float(label)
    return tf.reshape(image, [-1, image_size]), label
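# --- Illustrative tf.data usage (not from the original source): map_fn above matches the
# (image, label) tuples produced by tensorflow_datasets with as_supervised=True.
# `image_size` and `batch_size` are assumed to be defined elsewhere.
# import tensorflow_datasets as tfds
# dataset = tfds.load("mnist", split="train", as_supervised=True)
# dataset = dataset.map(map_fn).batch(batch_size)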
def map_fn(input_slice: Dict[str, tf.Tensor]):
    updated = input_slice
    image = to_default_float(updated["image"]) / 255.0
    label = to_default_float(updated["label"])
    return tf.reshape(image, [-1, image_size]), label
def gauss_kl_ldf(q_mu: tf.Tensor, q_sqrt: tf.Tensor, K: tf.linalg.LinearOperatorDiag):
    """
    Compute the KL divergence KL[q || p] between

        q(x) = N(m, L Lᵀ),  with  m = Kuu @ q_mu  and  L = Kuu @ q_sqrt

    and

        p(x) = N(0, K)  where K is a Diag linear operator
        p(x) = N(0, I)  if K is None

    We assume L multiple independent distributions, given by the columns of
    q_mu and the first or last dimension of q_sqrt. Returns the *sum* of the
    divergences.

    q_mu is a matrix ([M, L]), each column contains a mean.

    q_sqrt can be a 3D tensor ([L, M, M]), each matrix within is a lower
    triangular square-root matrix of the covariance of q.

    q_sqrt can be a matrix ([M, L]), each column represents the diagonal of a
    square-root matrix of the covariance of q.

    K is the covariance of p (positive-definite matrix). In this case it must
    always be a tf.linalg.LinearOperatorDiag instance, as the type hint suggests.
    """
    if K is None:
        is_white = True
        is_batched_prior = False
    else:
        is_white = False
        is_batched_prior = len(K.shape) == 3
    is_diag = len(tf.shape(q_sqrt)) == 2

    M, L = tf.shape(q_mu)[0], tf.shape(q_mu)[1]

    if is_white:
        alpha = q_mu  # [M, L], implying that K is identity
    else:
        q_mu = tf.transpose(q_mu)[:, :, None] if is_batched_prior else q_mu  # [L, M, 1] or [M, L]
        alpha = K.solve(q_mu)  # [L, M, 1] or [M, L]

    if is_diag:
        # if q_sqrt is diagonal
        q_diag = tf.linalg.LinearOperatorDiag(tf.square(q_sqrt))
        # Log-determinant of the covariance of q(x); factor of 2 from fact that q_sqrt is sqrt of whole
        logdet_qcov = tf.reduce_sum(q_diag.log_abs_determinant())
    else:
        Lq = tf.linalg.band_part(q_sqrt, -1, 0)  # force lower triangle  # [L, M, M]
        Lq_diag = tf.linalg.diag_part(Lq)  # [L, M]
        # Log-determinant of the covariance of q(x):
        logdet_qcov = tf.reduce_sum(tf.math.log(tf.square(Lq_diag)))

    # Mahalanobis term: μqᵀ Σp⁻¹ μq
    mahalanobis = tf.reduce_sum(q_mu * alpha)

    # Constant term: - L * M
    constant = -to_default_float(tf.size(q_mu, out_type=tf.int64))

    # Trace term: tr(Σp⁻¹ Σq)
    if is_white:
        if is_diag:
            trace = tf.reduce_sum(q_diag.trace())
        else:
            trace = tf.reduce_sum(tf.square(Lq))
    else:
        if is_diag and not is_batched_prior:
            # K is [M, M] and q_sqrt is [M, L]: fast specialisation, we skip needing to take diag_part
            trace = tf.reduce_sum(K.solve(tf.square(q_sqrt)))
        else:
            # K is [L, M, M] or [M, M] and Lq_diag is [L, M] -> [M, L]
            trace = tf.reduce_sum(K.solve(tf.square(tf.linalg.matrix_transpose(Lq_diag))))

    twoKL = mahalanobis + constant - logdet_qcov + trace

    # Log-determinant of the covariance of p(x):
    if not is_white:
        log_det_p = tf.reduce_sum(K.log_abs_determinant())
        # If K is [L, M, M], num_latent_gps is no longer implicit, no need to multiply the single kernel logdet
        scale = 1.0 if is_batched_prior else to_default_float(L)
        log_det_p *= scale
        twoKL += log_det_p

    return 0.5 * twoKL
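# --- Illustrative sketch (not from the original source) of the prior operator gauss_kl_ldf
# expects: `kuu_diag` is a hypothetical [M] vector of prior marginal variances.
# K = tf.linalg.LinearOperatorDiag(kuu_diag, is_positive_definite=True)
# kl = gauss_kl_ldf(q_mu, q_sqrt, K)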
try:
    m.kernel.trainable
except AttributeError:
    print(f'{m.kernel.__class__.__name__} does not have a trainable attribute')

# %%
set_trainable(m.kernel, False)
print_summary(m)

# %% [markdown]
# ## Priors
#
# You can set priors in the same way as transforms and trainability, by using `tensorflow_probability` distribution objects. Let's set a Gamma prior on the variance of the Matern32 kernel.

# %%
k = gpflow.kernels.Matern32()
k.variance.prior = tfp.distributions.Gamma(to_default_float(2), to_default_float(3))

print_summary(k)

# %%
m.kernel.kernels[0].variance.prior = tfp.distributions.Gamma(
    to_default_float(2), to_default_float(3))
print_summary(m)

# %% [markdown]
# ## Optimization
#
# To optimize your model, first create an instance of an optimizer (in this case, `gpflow.optimizers.Scipy`), which has optional arguments that are passed to `scipy.optimize.minimize` (we minimize the negative log likelihood). Then, call the `minimize` method of that optimizer, with your model as the optimization target. Variables that have priors are maximum a posteriori (MAP) estimated, that is, we add the log prior to the log likelihood, and otherwise use maximum likelihood.
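# %% [markdown]
# (Illustrative sketch, not from the original notebook: one way to run the optimizer just described.)

# %%
opt = gpflow.optimizers.Scipy()
opt_logs = opt.minimize(m.training_loss, variables=m.trainable_variables, options=dict(maxiter=100))
print_summary(m)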
def FitModel(
    bConsider,
    GPt,
    GPy,
    globalBranching,
    priorConfidence=0.80,
    M=10,
    likvar=1.0,
    kerlen=2.0,
    kervar=5.0,
    fDebug=False,
    maxiter=100,
    fPredict=True,
    fixHyperparameters=False,
):
    """
    Fit BGP model
    :param bConsider: list of candidate branching points
    :param GPt: pseudotime
    :param GPy: gene expression. Should be 0 mean for best performance.
    :param globalBranching: cell labels
    :param priorConfidence: prior confidence on cell labels
    :param M: number of inducing points
    :param likvar: initial value for Gaussian noise variance
    :param kerlen: initial value for kernel length scale
    :param kervar: initial value for kernel variance
    :param fDebug: print debugging information
    :param maxiter: maximum number of iterations for optimisation
    :param fPredict: compute predictive mean and variance
    :param fixHyperparameters: should kernel hyperparameters be kept fixed or optimised?
    :return: dictionary of log likelihood, GPflow model, Phi matrix, predictive set of points,
        mean and variance, hyperparameter values, posterior on branching time
    """
    assert isinstance(bConsider, list), "Candidate B must be list"
    assert GPt.ndim == 1
    assert GPy.ndim == 2
    assert GPt.size == GPy.size, "pseudotime and gene expression data must be the same size"
    assert globalBranching.size == GPy.size, "state space must be same size as number of cells"
    assert M >= 0, "at least 0 or more inducing points should be given"

    phiInitial, phiPrior = GetInitialConditionsAndPrior(
        globalBranching, priorConfidence, infPriorPhi=True
    )
    XExpanded, indices, _ = VBHelperFunctions.GetFunctionIndexListGeneral(GPt)
    ptb = np.min([np.min(GPt[globalBranching == 2]), np.min(GPt[globalBranching == 3])])

    tree = bt.BinaryBranchingTree(0, 1, fDebug=False)
    tree.add(None, 1, np.ones((1, 1)) * ptb)  # B can be anything here
    (fm, _) = tree.GetFunctionBranchTensor()

    kb = bk.BranchKernelParam(
        gpflow.kernels.Matern32(1), fm, b=np.zeros((1, 1))
    ) + gpflow.kernels.White(1)
    kb.kernels[1].variance.assign(
        1e-6
    )  # controls the discontinuity magnitude, the gap at the branching point
    set_trainable(kb.kernels[1].variance, False)  # jitter for numerics

    if M == 0:
        m = assigngp_dense.AssignGP(
            GPt,
            XExpanded,
            GPy,
            kb,
            indices,
            np.ones((1, 1)) * ptb,
            phiInitial=phiInitial,
            phiPrior=phiPrior,
        )
    else:
        ZExpanded = np.ones((M, 2))
        ZExpanded[:, 0] = np.linspace(0, 1, M, endpoint=False)
        ZExpanded[:, 1] = np.array([i for j in range(M) for i in range(1, 4)])[:M]
        m = assigngp_denseSparse.AssignGPSparse(
            GPt,
            XExpanded,
            GPy,
            kb,
            indices,
            np.ones((1, 1)) * ptb,
            ZExpanded,
            phiInitial=phiInitial,
            phiPrior=phiPrior,
        )

    # Initialise hyperparameters
    m.likelihood.variance.assign(likvar)
    m.kernel.kernels[0].kern.lengthscales.assign(kerlen)
    m.kernel.kernels[0].kern.variance.assign(kervar)

    if fixHyperparameters:
        print("Fixing hyperparameters")
        set_trainable(m.kernel.kernels[0].kern.lengthscales, False)
        set_trainable(m.likelihood.variance, False)
        set_trainable(m.kernel.kernels[0].kern.variance, False)
    else:
        if fDebug:
            print("Adding prior logistic on length scale to avoid numerical problems")
        m.kernel.kernels[0].kern.lengthscales.prior = tfp.distributions.Normal(
            to_default_float(2.0), to_default_float(1.0)
        )
        m.kernel.kernels[0].kern.variance.prior = tfp.distributions.Normal(
            to_default_float(3.0), to_default_float(1.0)
        )
        m.likelihood.variance.prior = tfp.distributions.Normal(
            to_default_float(0.1), to_default_float(0.1)
        )

    # optimization
    ll = np.zeros(len(bConsider))
    Phi_l = list()
    ttestl_l, mul_l, varl_l = list(), list(), list()
    hyps = list()
    for ib, b in enumerate(bConsider):
        m.UpdateBranchingPoint(np.ones((1, 1)) * b, phiInitial)
        try:
            opt = gpflow.optimizers.Scipy()
            opt.minimize(
                m.training_loss,
                variables=m.trainable_variables,
                options=dict(disp=True, maxiter=maxiter),
            )
            # remember winning hyperparameters
            hyps.append(
                {
                    "likvar": m.likelihood.variance.numpy(),
                    "kerlen": m.kernel.kernels[0].kern.lengthscales.numpy(),
                    "kervar": m.kernel.kernels[0].kern.variance.numpy(),
                }
            )
            ll[ib] = m.log_posterior_density()
        except Exception as ex:
            print(f"Unexpected error: {ex} {'-' * 60}\nCaused by model: {m} {'-' * 60}")
            ll[0] = np.nan
            # return model so we can inspect it
            return {
                "loglik": ll,
                "model": m,
                "Phi": np.nan,
                "prediction": {"xtest": np.nan, "mu": np.nan, "var": np.nan},
                "hyperparameters": np.nan,
                "posteriorB": np.nan,
            }
        # prediction
        Phi = m.GetPhi()
        Phi_l.append(Phi)
        if fPredict:
            ttestl, mul, varl = VBHelperFunctions.predictBranchingModel(m)
            ttestl_l.append(ttestl), mul_l.append(mul), varl_l.append(varl)
        else:
            ttestl_l.append([]), mul_l.append([]), varl_l.append([])

    iw = np.argmax(ll)
    postB = GetPosteriorB(ll, bConsider)
    if fDebug:
        print(
            "BGP Maximum at b=%.2f" % bConsider[iw],
            "CI= [%.2f, %.2f]" % (postB["B_CI"][0], postB["B_CI"][1]),
        )
    assert np.allclose(bConsider[iw], postB["Bmode"]), "%s-%s" % (
        postB["B_CI"],
        bConsider[iw],
    )
    return {
        "loglik": ll,
        "Phi": Phi_l[iw],
        # "model": m,
        "prediction": {"xtest": ttestl_l[iw], "mu": mul_l[iw], "var": varl_l[iw]},
        "hyperparameters": hyps[iw],
        "posteriorB": postB,
    }
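# --- Hypothetical call sketch (not from the original source). `t`, `y` and `labels` are
# placeholders for a 1-D pseudotime vector, an (N, 1) zero-mean expression matrix, and
# per-cell branch labels, all assumed to be loaded elsewhere.
# res = FitModel(bConsider=[0.2, 0.5, 0.8], GPt=t, GPy=y, globalBranching=labels, M=20)
# print(res["posteriorB"]["Bmode"], res["loglik"])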