def w_init(shape, dtype=dtype):
    """Draw initial weights for the pathwise update of one latent GP.

    Uses ``prior_fn``, ``Z``, ``latent_kernel``, ``model`` and
    ``latent_dim`` from the enclosing scope.

    :param shape: shape of the standard-normal draws; the returned tensor
        is asserted to have this same shape.
    :param dtype: dtype of the draws and of the returned tensor.
    :return: adjoint of the Cholesky solve against ``u - prior_u``, cast
        to ``dtype``.
    """
    jitter = default_jitter()

    # Prior values at the inducing locations, perturbed by jitter-scaled noise.
    f_prior = prior_fn(Z.Z)
    noise = tf.random.normal(f_prior.shape, dtype=f_prior.dtype)
    u_prior = f_prior + (jitter ** 0.5) * noise

    # Cholesky factor of the jittered inducing covariance.
    chol_uu = tf.linalg.cholesky(
        gpflow.covariances.Kuu(Z, latent_kernel, jitter=jitter))

    # Reparameterised draw from q(u) for the selected latent dimension.
    mean_u = model.q_mu[:, latent_dim:latent_dim + 1]  # Mx1
    sqrt_u = model.q_sqrt[latent_dim]  # MxM
    eps = tf.random.normal(shape=shape, dtype=dtype)
    u_sample = mean_u + tf.matmul(sqrt_u, eps, transpose_b=True)
    if model.whiten:
        # Map the whitened draw back through the prior Cholesky factor.
        u_sample = chol_uu @ u_sample

    # Solve the linear system against the residual between the two draws.
    solved = parallel_solve(solver=tf.linalg.cholesky_solve,
                            lhs=chol_uu,
                            rhs=u_sample - u_prior)
    init = tf.linalg.adjoint(solved)
    assert tuple(init.shape) == tuple(shape)
    return tf.cast(init, dtype)
def _init_backwards_layers(self, X, Y, Z, mean_function=Zero(),
                           optimize_inducing_location=True,
                           Layer=SVGPLayer, white=False):
    """Build one ``Layer`` per column of ``Y``, walking the columns of ``Z``
    from the last one backwards.

    ``Z`` is indexed as if its columns were the ``X.shape[1]`` input columns
    followed by ``Y.shape[1]`` output columns. Layer 0 uses the input columns
    as inducing inputs; every later layer uses the single column that the
    previous layer used as its target.

    :return: list of the constructed layers, in construction order.
    """
    num_inputs = X.shape[1]
    num_outputs = Y.shape[1]
    num_inducing = Z.shape[0]

    def _make_layer(i):
        # Column of Z holding this layer's inducing targets.
        target_col = num_inputs + num_outputs - i - 1
        if i == 0:
            inducing_inputs = Z[:, :num_inputs]
        else:
            # Previous layer's target column, kept two-dimensional.
            inducing_inputs = Z[:, target_col + 1][:, None]
        return Layer(
            SquaredExponential(),
            inducing_inputs,
            Z[:, target_col],
            [default_jitter()] * num_inducing,
            mean_function,
            optimize_inducing_location=optimize_inducing_location,
            white=white)

    return [_make_layer(i) for i in range(num_outputs)]
def sample(self, batch_size, train_size, num_context, x_min, x_max):
    """Draw a batch of functions from the GP prior plus a random context subset.

    Inputs are sampled uniformly in ``[x_min, x_max)``, function values are
    drawn through the Cholesky factor of the jittered kernel matrix, and
    ``num_context`` points per batch element are picked without replacement.

    NOTE: an earlier revision first drew a throw-away ``x_context`` with
    ``np.random.choice`` and then overwrote it. That dead draw is removed,
    so the NumPy random stream under a fixed seed differs from before.

    :param batch_size: number of independent functions to draw.
    :param train_size: number of input locations per function.
    :param num_context: number of context points selected per function.
    :param x_min: lower bound of the uniform input distribution.
    :param x_max: upper bound of the uniform input distribution.
    :return: ``(x_context, y_context, x, y)``.
    :raises ValueError: if ``num_context`` is negative or exceeds
        ``train_size`` (previously raised implicitly by ``np.random.choice``).
    """
    if not 0 <= num_context <= train_size:
        raise ValueError(
            "num_context must satisfy 0 <= num_context <= train_size")

    x = np.random.uniform(x_min, x_max, size=(batch_size, train_size))
    x = np.expand_dims(x, 2)  # [batch_size, train_size, 1]

    knn = self.kernel(x)  # expected [batch_size, train_size, train_size]
    knn = ops.add_to_diagonal(knn, default_jitter())
    Lnn = np.linalg.cholesky(knn)
    Vnn = np.random.normal(size=(batch_size, train_size, 1))
    y = Lnn @ Vnn  # [batch_size, train_size, 1]

    # One independent random subset of indices per batch element.
    idx = [
        np.random.permutation(train_size)[:num_context]
        for _ in range(batch_size)
    ]
    x_context = np.array([x[i, idx[i], :] for i in range(batch_size)])
    y_context = np.array([y[i, idx[i], :] for i in range(batch_size)])
    return x_context, y_context, x, y
def test_independent_interdomain_conditional_whiten(whiten):
    """
    Check the effect of the `white` flag on the predicted mean.

    The flag changes the projection matrix `A`, which is easiest to observe
    through the predicted mean. The predicted covariance also depends on
    `A`, but it is not inspected here.
    """
    N, P = Data.N, Data.P

    Lm = np.random.randn(1, 1, 1).astype(np.float32) ** 2
    Kmm = Lm * Lm + default_jitter()
    Kmn = tf.ones((1, 1, N, P))
    Knn = tf.ones((N, P))
    f = np.random.randn(1, 1).astype(np.float32)

    mean, _ = independent_interdomain_conditional(
        Kmn, Kmm, Knn, f, white=whiten)

    # Whitened: divide by the factor Lm; otherwise by the full Kmm.
    divisor = Lm if whiten else Kmm
    expected_mean = (f * Kmn) / divisor

    np.testing.assert_allclose(mean, expected_mean[0][0], rtol=1e-2)
def _create_update_fn(batch_shape, prior_fn):
    """Build a ``BayesianLinearSampler`` whose weights come from a solve
    against the noisy kernel matrix at the model's data.

    Uses ``model`` and ``dtype`` from the enclosing scope.

    :param batch_shape: list prepended to ``[m]`` (``m = Z.shape[-2]``) to
        form the shape of the initial weights.
    :param prior_fn: callable evaluated at ``Z`` to obtain prior values.
    :return: sampler holding a kernel basis centred at ``Z``, the initial
        weights, and ``w_init`` as its weight initializer.
    """
    Z, u = model.data
    noise_var = model.likelihood.variance + default_jitter()
    if model.mean_function is not None:
        u = u - model.mean_function(Z)

    num_centers = Z.shape[-2]
    Kuu = model.kernel(Z, full_cov=True)
    # Add the (jittered) noise variance to the diagonal before factorising.
    chol = tf.linalg.cholesky(
        tf.linalg.set_diag(Kuu, tf.linalg.diag_part(Kuu) + noise_var))
    basis = KernelBasis(kernel=model.kernel, centers=Z)

    def w_init(shape, dtype=dtype):
        # Prior values at Z, perturbed by observation-scale noise.
        f_prior = prior_fn(Z)
        eps = tf.random.normal(f_prior.shape, dtype=f_prior.dtype)
        u_prior = f_prior + (noise_var ** 0.5) * eps
        solved = parallel_solve(solver=tf.linalg.cholesky_solve,
                                lhs=chol,
                                rhs=u - u_prior)
        init = tf.linalg.adjoint(solved)
        assert tuple(init.shape) == tuple(shape)
        return tf.cast(init, dtype)

    initial_weights = w_init(shape=batch_shape + [num_centers])
    return BayesianLinearSampler(basis=basis,
                                 weights=initial_weights,
                                 weight_initializer=w_init)
def _sample_joint_conv2d(kern, Z, Xnew, num_samples: int,
                         L: TensorLike = None,
                         diag: Union[float, tf.Tensor] = None):
    """
    Sample from the joint distribution of $f(X), g(Z)$ via a location-scale
    transform.

    :param kern: convolutional kernel providing ``get_patches``, ``kernel``,
        ``channels_out`` and ``get_spatial_out``.
    :param Z: inducing variable exposing ``as_patches`` and ``len``.
    :param Xnew: input images; the last-but-one two axes before the final
        axis are treated as spatial dimensions.
    :param num_samples: number of joint draws.
    :param L: optional precomputed square root of the joint covariance;
        when given, it is used as-is.
    :param diag: value added to the joint covariance diagonal; defaults to
        ``default_jitter()``.
    :return: ``((fz, fx), L)`` with ``fx`` reshaped into a feature map.
    """
    if diag is None:
        diag = default_jitter()

    if L is None:
        # Stack inducing patches on top of the flattened input patches.
        inducing_patches = Z.as_patches  # [M, patch_len]
        input_patches = kern.get_patches(Xnew, full_spatial=False)
        flat_inputs = tf.reshape(input_patches, [-1, input_patches.shape[-1]])
        all_patches = tf.concat([inducing_patches, flat_inputs], axis=0)

        cov = kern.kernel(all_patches, full_cov=True)
        cov = tf.linalg.set_diag(cov, tf.linalg.diag_part(cov) + diag)
        chol = tf.linalg.cholesky(cov)
        # TODO: Improve me (same factor replicated once per output channel)
        L = tf.tile(chol[None], [kern.channels_out, 1, 1])

    # Location-scale transform of standard-normal draws.
    spatial_out = kern.get_spatial_out(Xnew.shape[-3:-1])
    rvs = tf.random.normal(list(L.shape[:-1]) + [num_samples], dtype=floatx())
    draws = tf.transpose(L @ rvs)  # [S, M + P, L]
    fz, fx = tf.split(draws, [len(Z), -1], axis=1)

    # Reorganise $f(X)$ as a 3d feature map.
    feature_map_shape = ([num_samples, Xnew.shape[0]]
                         + spatial_out
                         + [kern.channels_out])
    fx = tf.reshape(fx, feature_map_shape)
    return (fz, fx), L
def __init__(self, kernel, inducing_variables, mean_function, white=False,
             **kwargs):
    """Set up a single-output sparse variational layer.

    :param kernel: kernel, called on the inducing points when ``white`` is
        false to initialise ``q_sqrt``.
    :param inducing_variables: array-like whose first axis is the number of
        inducing points.
    :param mean_function: stored on the layer as ``self.mean_function``.
    :param white: whether the whitened representation is used.
    :param kwargs: forwarded to the parent constructor.
    """
    super().__init__(**kwargs)
    self.inducing_points = inducing_variables
    self.num_inducing = inducing_variables.shape[0]

    # Variational mean starts at zero.
    self.q_mu = Parameter(np.zeros((self.num_inducing, 1)),
                          dtype=default_float())

    self.kernel = kernel
    self.mean_function = mean_function
    self.white = white

    # The Cholesky factor of q(u) is stored as a lower-triangular parameter
    # and built exactly once:
    #  * whitened: near-deterministic (1e-4 * I);
    #  * otherwise: Cholesky factor of the prior covariance plus jitter.
    if self.white:
        q_sqrt = 1e-4 * np.eye(self.num_inducing, dtype=default_float())
    else:
        Ku = self.kernel(self.inducing_points)
        Ku += default_jitter() * tf.eye(self.num_inducing, dtype=Ku.dtype)
        q_sqrt = tf.linalg.cholesky(Ku)
    self.q_sqrt = Parameter(q_sqrt, transform=triangular())
def _linear_fallback(Z: TensorLike,
                     u: TensorLike,
                     f: TensorLike,
                     *,
                     L: TensorLike = None,
                     diag: TensorLike = None,
                     basis: AbstractBasis = None,
                     **kwargs):
    """Build a ``DenseSampler`` whose weights solve a linear system in
    feature space.

    Works in whichever space is smaller: the ``D x D`` feature space when
    ``D < M``, otherwise the ``M x M`` Gram space.

    :param Z: inducing variables or a tensor of locations.
    :param u: target values; last axis must have size 1.
    :param f: prior values whose trailing shape matches ``u``.
    :param L: optional precomputed Cholesky factor of size ``min(M, D)``.
    :param diag: diagonal noise term; defaults to ``default_jitter()``.
    :param basis: optional feature map applied to ``Z``.
    :param kwargs: forwarded to ``DenseSampler``.
    """
    u_shape = tuple(u.shape)
    f_shape = tuple(f.shape)
    assert u_shape[-1] == 1, "Recieved multiple output features"
    assert u_shape == f_shape[-len(u_shape):], "Incompatible shapes detected"

    # Prepare diagonal term
    if diag is None:  # used by <GPflow.conditionals>
        diag = default_jitter()
    if isinstance(diag, float):
        diag = tf.convert_to_tensor(diag, dtype=f.dtype)
    diag = tf.expand_dims(diag, axis=-1)  # [M, 1] or [1, 1] or [1]

    # Extract "features" of Z
    if basis is not None:
        feat = basis(Z)  # [M, D]
    elif isinstance(Z, inducing_variables.InducingVariables):
        feat = inducing_to_tensor(Z)  # [M, D]
    else:
        feat = Z

    # Residual, perturbed by noise with variance `diag`
    err = swap_axes(u - f, -3, -1)  # [1, M, S]
    err -= tf.sqrt(diag) * tf.random.normal(err.shape, dtype=err.dtype)

    M, D = feat.shape[-2:]
    feature_space = D < M
    # Features scaled by the inverse diagonal are only needed when D < M.
    scaled_feat = feat * tf.math.reciprocal(diag) if feature_space else None

    if L is not None:
        assert L.shape[-1] == min(M, D)  # TODO: improve me
    elif feature_space:
        S = tf.matmul(scaled_feat, feat, transpose_a=True)  # [D, D]
        L = tf.linalg.cholesky(S + tf.eye(S.shape[-1], dtype=S.dtype))
    else:
        K = tf.matmul(feat, feat, transpose_b=True)  # [M, M]
        K = tf.linalg.set_diag(K, tf.linalg.diag_part(K) + diag[..., 0])
        L = tf.linalg.cholesky(K)

    # Solve for the weights in the chosen space
    if feature_space:
        projected_err = tf.matmul(scaled_feat, err, transpose_a=True)
        weights = tf.linalg.adjoint(tf.linalg.cholesky_solve(L, projected_err))
    else:
        iK_err = tf.linalg.cholesky_solve(L, err)  # [S, M, 1]
        weights = tf.matmul(iK_err, feat, transpose_a=True)  # [S, 1, D]

    return DenseSampler(basis=basis,
                        weights=move_axis(weights, -2, -3),
                        **kwargs)
def predict_f(self, Xnew: InputData, full_cov: bool = False,
              full_output_cov: bool = False) -> MeanAndVariance:
    """
    Compute the mean and variance of the latent function at new points.

    The computation closely follows the SGPR prediction, with kernel
    expectations under the Gaussian over the latent inputs in place of the
    kernel matrices.

    :param Xnew: points at which to predict.
    :param full_cov: return a full covariance over ``Xnew`` (tiled per
        output column) instead of marginal variances.
    :param full_output_cov: not supported.
    :raises NotImplementedError: if ``full_output_cov`` is true.
    :return: ``(mean, var)`` with the mean function added to the mean.
    """
    if full_output_cov:
        raise NotImplementedError

    Y_data = self.data
    Z = self.inducing_variable
    num_inducing = Z.num_inducing

    # Kernel expectations under the distribution over latent inputs.
    pX = DiagonalGaussian(self.X_data_mean, self.X_data_var)
    kernel_and_Z = (self.kernel, Z)
    psi1 = expectation(pX, kernel_and_Z)
    psi2 = tf.reduce_sum(expectation(pX, kernel_and_Z, kernel_and_Z), axis=0)

    Kus = covariances.Kuf(Z, self.kernel, Xnew)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)
    L = tf.linalg.cholesky(
        covariances.Kuu(Z, self.kernel, jitter=default_jitter()))

    A = tf.linalg.triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    L_inv_psi2 = tf.linalg.triangular_solve(L, psi2, lower=True)
    AAT = tf.linalg.triangular_solve(
        L, tf.transpose(L_inv_psi2), lower=True) / sigma2
    B = AAT + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    c = tf.linalg.triangular_solve(
        LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma

    tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)
    tmp2 = tf.linalg.triangular_solve(LB, tmp1, lower=True)
    mean = tf.linalg.matmul(tmp2, c, transpose_a=True)

    num_outputs = tf.shape(Y_data)[1]
    if full_cov:
        var = (self.kernel(Xnew)
               + tf.linalg.matmul(tmp2, tmp2, transpose_a=True)
               - tf.linalg.matmul(tmp1, tmp1, transpose_a=True))
        # Replicate the covariance along a new trailing output axis.
        tile_shape = tf.stack([1, 1, num_outputs])
        var = tf.tile(tf.expand_dims(var, 2), tile_shape)
    else:
        var = (self.kernel(Xnew, full_cov=False)
               + tf.reduce_sum(tf.square(tmp2), axis=0)
               - tf.reduce_sum(tf.square(tmp1), axis=0))
        # Replicate the marginal variances once per output column.
        tile_shape = tf.stack([1, num_outputs])
        var = tf.tile(tf.expand_dims(var, 1), tile_shape)

    return mean + self.mean_function(Xnew), var
def test_inducing_variables_psd_schur(input_dim, inducing_variable, kernel):
    """Check that the Schur complement ``Kff - Kfu Kuu^{-1} Kuf`` has only
    positive eigenvalues at five random inputs."""
    # Conditional variance must be PSD.
    X = np.random.randn(5, input_dim)
    Kuf_values = Kuf(inducing_variable, kernel, X)
    Kuu_values = Kuu(inducing_variable, kernel, jitter=default_jitter())
    Kff_values = kernel(X)

    Kuu_inv_Kuf = np.linalg.solve(Kuu_values, Kuf_values)
    Qff_values = Kuf_values.numpy().T @ Kuu_inv_Kuf

    eigenvalues = np.linalg.eig(Kff_values - Qff_values)[0]
    assert np.all(eigenvalues > 0.0)
def main(config):
    """Compare ``subroutine`` against ``SVGP.predict_f`` for a
    ``SeparateIndependent`` kernel with one latent GP per supported base
    kernel.

    Seeds TensorFlow, sets GPflow's default float and jitter from
    ``config``, builds a randomly parameterised unwhitened model, and
    asserts that both mean and full covariance agree within
    ``config.error_tol``.
    """
    assert config is not None, ValueError
    tf.random.set_seed(config.seed)
    gpflow_config.set_default_float(config.floatx)
    gpflow_config.set_default_jitter(config.jitter)

    X = tf.random.uniform([config.num_test, config.input_dims],
                          dtype=floatx())

    base_kernels = []
    inducing = []
    Z_shape = config.num_cond, config.input_dims
    for cls in SupportedBaseKernels:
        # Lengthscale bounds scale with the square root of the input dimension.
        root_dims = config.input_dims ** 0.5
        minval = config.rel_lengthscales_min * root_dims
        maxval = config.rel_lengthscales_max * root_dims
        lenscales = tf.random.uniform(shape=[config.input_dims],
                                      minval=minval,
                                      maxval=maxval,
                                      dtype=floatx())
        rel_variance = tf.random.uniform(shape=[],
                                         minval=0.9,
                                         maxval=1.1,
                                         dtype=floatx())
        base_kernels.append(
            cls(lengthscales=lenscales,
                variance=config.kernel_variance * rel_variance))
        inducing.append(
            InducingPoints(tf.random.uniform(Z_shape, dtype=floatx())))

    kern = kernels.SeparateIndependent(base_kernels)
    Z = SeparateIndependentInducingVariables(inducing)

    # q_sqrt is the prior Cholesky factor shrunk by a random per-latent scale.
    Kuu = covariances.Kuu(Z, kern, jitter=gpflow_config.default_jitter())
    shrink = tf.random.uniform(shape=[kern.num_latent_gps, 1, 1],
                               minval=0.0,
                               maxval=0.5,
                               dtype=floatx())
    q_sqrt = tf.linalg.cholesky(Kuu) * shrink

    const = tf.random.normal([len(kern.kernels)], dtype=floatx())
    model = SVGP(kernel=kern,
                 likelihood=None,
                 inducing_variable=Z,
                 mean_function=mean_functions.Constant(c=const),
                 q_sqrt=q_sqrt,
                 whiten=False,
                 num_latent_gps=len(base_kernels))

    mf, Sff = subroutine(config, model, X)
    mg, Sgg = model.predict_f(X, full_cov=True)

    tol = config.error_tol
    assert allclose(mf, mg, tol, tol)
    assert allclose(Sff, Sgg, tol, tol)
def KL(self):
    """Return the KL divergence from the variational distribution to the
    prior.

    In the whitened case no prior covariance is passed to ``gauss_kl``;
    otherwise the jittered kernel matrix at the inducing points is used.
    """
    q_sqrt = self.q_sqrt[None, :, :]  # add a leading axis of size one
    if self.white:
        prior_cov = None
    else:
        prior_cov = self.kernel(self.inducing_points)
        prior_cov += default_jitter() * tf.eye(self.num_inducing,
                                               dtype=prior_cov.dtype)
    return kullback_leiblers.gauss_kl(self.q_mu, q_sqrt, prior_cov)
def compute_qu(self, full_cov: bool = True) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Compute the mean and covariance of q(u) = N(mu, cov), the variational
    distribution on the inducing outputs.

    With ``sig = Kuu + psi2 / sigma^2`` and ``R = chol(sig)^{-1} Kuu`` the
    code evaluates

        cov = R^T R
        mu  = R^T chol(sig)^{-1} psi1^T (Y - m(X_mean)) / sigma^2

    where ``psi1``/``psi2`` are kernel expectations under the encoder's
    Gaussian over the latent inputs and ``sigma^2`` is the likelihood
    variance.

    :param full_cov: when true (default) the covariance is tiled once per
        column of ``mu``; otherwise the single [M, M] matrix is returned.
    :return: mu, cov
    """
    Y_data = self.data
    X_data_mean, X_data_var = self.encoder(Y_data)
    pX = DiagonalGaussian(X_data_mean, X_data_var)

    kernel_and_Z = (self.kernel, self.inducing_variable)
    # E_qx[Kfu]
    psi1 = expectation(pX, kernel_and_Z)
    # E_qx[Kuf @ Kfu], summed over data points
    psi2 = tf.reduce_sum(expectation(pX, kernel_and_Z, kernel_and_Z), axis=0)

    kuu = covariances.Kuu(self.inducing_variable, self.kernel,
                          jitter=default_jitter())
    kuf = tf.transpose(psi1)
    noise_variance = self.likelihood.variance

    sig = kuu + psi2 * (noise_variance ** -1)
    sig_sqrt = tf.linalg.cholesky(sig)
    sig_sqrt_kuu = tf.linalg.triangular_solve(sig_sqrt, kuu)

    cov = tf.linalg.matmul(sig_sqrt_kuu, sig_sqrt_kuu, transpose_a=True)

    err = Y_data - self.mean_function(X_data_mean)
    whitened_err = tf.linalg.triangular_solve(
        sig_sqrt, tf.linalg.matmul(kuf, err))
    mu = tf.linalg.matmul(sig_sqrt_kuu, whitened_err,
                          transpose_a=True) / noise_variance

    if full_cov:
        return mu, tf.tile(cov[None, :, :], [mu.shape[-1], 1, 1])
    return mu, cov
def build_cache(cls, model: gpflow.models.SVGP):
    """Precompute the terms reused by the location-scale sampler for an SVGP.

    Only models whose ``q_sqrt`` is rank 3 with a leading axis of size one
    are accepted.

    :return: cache with the inducing variable, the Cholesky factor of the
        jittered ``Kuu``, ``q_mu`` and the single ``q_sqrt`` matrix.
    """
    q_sqrt_shape = model.q_sqrt.shape
    assert q_sqrt_shape.ndims == 3 and q_sqrt_shape[0] == 1

    Z = model.inducing_variable
    Suu = gpflow.covariances.Kuu(Z, model.kernel, jitter=default_jitter())
    return CacheLocationScaleSamplerSVGP(Z=Z,
                                         Luu=tf.linalg.cholesky(Suu),
                                         q_mu=model.q_mu,
                                         q_sqrt=model.q_sqrt[0])
def conditional_ND(self, X, full_cov=False):
    """Compute the mean and (co)variance of the layer output at ``X``.

    With ``alpha(X) = k(Z,Z)^{-1} k(Z,X)`` (or ``L^{-1} k(Z,X)`` in the
    whitened case):

        mean = alpha(X)^T q_mu + m(X)
        var  = k(X,X) - alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)

    where ``k(Z,Z)`` is replaced by the identity when whitened.

    :param X: inputs passed to ``Kuf`` and the kernel.
    :param full_cov: return the full covariance instead of the marginals.
    :return: ``(mean, var)``; ``var`` is transposed so that the output
        axis comes last.
    """
    Kmm = Kuu(self.inducing_points, self.kernel, jitter=default_jitter())
    Lmm = tf.linalg.cholesky(Kmm)
    Kmn = Kuf(self.inducing_points, self.kernel, X)  # K(Z,X)

    # alpha(X) = k(Z,Z)^{-1}k(Z,X) = L^{-T}L^{-1}k(Z,X)
    A = tf.linalg.triangular_solve(Lmm, Kmn, lower=True)  # L^{-1}k(Z,X)
    if not self.white:
        # L^{-T}L^{-1}K(Z,X) is [M,N]
        A = tf.linalg.triangular_solve(tf.transpose(Lmm), A, lower=False)

    # m = alpha(X)^T q_mu (the mean function is added on return)
    mean = tf.matmul(A, self.q_mu, transpose_a=True)

    # [D_out,M,N]
    A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])

    # Only the prior term actually used is built; the previously unused
    # tiled Cholesky factor is no longer allocated.
    if self.white:
        SK = -tf.eye(self.num_inducing, dtype=default_float())[None, :, :]
    else:
        # -k(Z,Z), one copy per output: [D_out,M,M]
        SK = -tf.tile(tf.expand_dims(Kmm, 0), (self.num_outputs, 1, 1))
    if self.q_sqrt is not None:
        # SK = -k(Z,Z) + q_sqrt q_sqrt^T  [D_out,M,M]
        SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

    # B = -(k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
    B = tf.matmul(SK, A_tiled)  # [D_out,M,N]

    if full_cov:
        # delta_cov = -alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
        delta_cov = tf.matmul(A_tiled, B, transpose_a=True)  # [D_out,N,N]
        Knn = self.kernel.K(X)
    else:
        # Summing A * B over the inducing axis yields exactly the diagonal
        # of A^T B, without forming the full [N,N] product.
        delta_cov = tf.reduce_sum(A_tiled * B, 1)  # [D_out,N]
        Knn = self.kernel.K_diag(X)  # [N]

    var = tf.expand_dims(Knn, 0) + delta_cov
    var = tf.transpose(var)
    return mean + self.mean_function(X), var
def __init__(self, kernel, inducing_variables, num_outputs, mean_function,
             input_prop_dim=None, white=False, **kwargs):
    """Set up a multi-output sparse variational layer with one kernel copy
    and one wrapped inducing variable per output.

    :param kernel: kernel deep-copied once per output.
    :param inducing_variables: array-like whose first axis is the number of
        inducing points; shared as the initial value for every output.
    :param num_outputs: number of output dimensions.
    :param mean_function: stored on the layer as ``self.mean_function``.
    :param input_prop_dim: forwarded positionally to the parent constructor.
    :param white: whether the whitened representation is used.
    :param kwargs: forwarded to the parent constructor.
    """
    super().__init__(input_prop_dim, **kwargs)
    self.num_inducing = inducing_variables.shape[0]
    self.mean_function = mean_function
    self.num_outputs = num_outputs
    self.white = white
    self.kernels = [copy.deepcopy(kernel) for _ in range(self.num_outputs)]

    # Variational means start at zero.
    self.q_mu = Parameter(np.zeros((self.num_inducing, num_outputs)),
                          dtype=default_float())

    # One lower-triangular factor per output, built exactly once:
    #  * whitened: identity;
    #  * otherwise: Cholesky factor of the prior covariance plus jitter.
    if self.white:
        q_sqrt = np.array([
            np.eye(self.num_inducing, dtype=default_float())
            for _ in range(num_outputs)
        ])
    else:
        jitter_eye = np.eye(self.num_inducing) * default_jitter()
        q_sqrt = np.array([
            np.linalg.cholesky(k.K(inducing_variables) + jitter_eye)
            for k in self.kernels
        ])
    self.q_sqrt = Parameter(q_sqrt, transform=triangular())

    self.inducing_points = [
        inducingpoint_wrapper(inducing_variables)
        for _ in range(self.num_outputs)
    ]
def build_cache(cls, model: gpflow.models.GPR):
    """Precompute the terms reused by the location-scale sampler for a GPR.

    The noisy kernel matrix (likelihood variance plus jitter on the
    diagonal) is factorised once, and the same factor is both stored in
    the cache and used to whiten the residual.

    :return: cache with the inputs ``Z``, the Cholesky factor ``Luu`` and
        ``Luu^{-1} (y - m(Z))``.
    """
    Z, err = model.data
    sigma2 = model.likelihood.variance + default_jitter()
    if model.mean_function is not None:
        # Out-of-place subtraction: never mutate the model's stored data.
        err = err - model.mean_function(Z)

    Kuu = model.kernel(Z, full_cov=True)
    Suu = tf.linalg.set_diag(Kuu, tf.linalg.diag_part(Kuu) + sigma2)
    Luu = tf.linalg.cholesky(Suu)
    iLuu_err = parallel_solve(tf.linalg.triangular_solve, Luu, err)
    # Reuse the factor computed above instead of factorising Suu again.
    return CacheLocationScaleSamplerGPR(Z=Z, Luu=Luu, iLuu_err=iLuu_err)
def residual_variances(model):
    """Return, as a NumPy array, the kernel diagonal at the model's inputs
    minus the part explained by the inducing variables,
    ``diag(Kff) - diag(Kfu Kuu^{-1} Kuf)``.
    """
    inputs, _ = model.data
    Z = model.inducing_variable

    chol_uu = tf.linalg.cholesky(
        Kuu(Z, model.kernel, jitter=default_jitter()))
    whitened_kuf = tf.linalg.triangular_solve(
        chol_uu, Kuf(Z, model.kernel, inputs), lower=True)

    prior_diag = model.kernel(inputs, full_cov=False)
    # Column-wise squared norms give the diagonal of the Nystrom term.
    explained = tf.reduce_sum(tf.square(whitened_kuf), 0)
    return (prior_diag - explained).numpy()
def reparameterise(mean, var, z, full_cov=False):
    """Apply the Gaussian reparameterisation trick, full rank or diagonal.

    If z is a sample from N(0,I), the output is a sample from N(mean,var).
    Jitter is added to the variance before taking the square root or the
    Cholesky factor.

    :mean: A tensor, the mean of shape [S,N,1].
    :var: A tensor, the covariance of shape [S,N,1] or [S,N,N]; when None
        the mean is returned unchanged.
    :z: A tensor, samples from a unit Gaussian of shape [S,N,1].
    :full_cov: A boolean, indicates the shape of var."""
    if var is None:
        return mean

    if full_cov is not False:
        num_points = tf.shape(mean)[1]
        jitter_eye = default_jitter() * tf.eye(
            num_points, dtype=default_float())[None, :, :]  # [1,N,N]
        chol = tf.linalg.cholesky(var + jitter_eye)  # [S,N,N]
        return mean + tf.matmul(chol, z)  # [S,N,1]

    return mean + z * (var + default_jitter()) ** 0.5
def _conditional_train(
    Xnew: tf.Tensor,
    inducing_variable: InducingVariables,
    kernel: Kernel,
    f: tf.Tensor,
    *,
    full_cov=False,
    full_output_cov=False,
    q_sqrt=None,
    white=False,
):
    """
    Single-output GP conditional that takes the prior variance term from
    ``kernel.diag_tr()``.

    The covariance matrices used to calculate the conditional have the
    following shape:
    - Kuu: [M, M]
    - Kuf: [M, N]
    - Kff: [N, N]

    Further reference
    -----------------
    - See `gpflow.conditionals._conditional` for a detailed explanation of
      the conditional in the single-output case.
    - See the multioutput notebook for more information about the
      multioutput framework.

    Parameters
    ----------
    :param Xnew: data matrix, size [N, D].
    :param f: data matrix, [M, R]
    :param full_cov: return the covariance between the datapoints
    :param full_output_cov: return the covariance between the outputs.
           NOTE: as we are using a single-output kernel with repetitions
                 these covariances will be zero.
    :param q_sqrt: matrix of standard-deviations or Cholesky matrices,
        size [M, R] or [R, M, M].
    :param white: boolean of whether to use the whitened representation
    :return:
        - mean:     [N, R]
        - variance: [N, R], [R, N, N], [N, R, R] or [N, R, N, R]
        Please see `gpflow.conditional._expand_independent_outputs` for more
        information about the shape of the variance, depending on
        `full_cov` and `full_output_cov`.
    """
    Kmm = Kuu(inducing_variable, kernel, jitter=default_jitter())  # [M, M]
    Kmn = Kuf(inducing_variable, kernel, Xnew)  # [M, N]
    # NOTE(review): diag_tr() takes no inputs here; presumably the kernel
    # caches its training-point diagonal -- confirm against the kernel class.
    Knn = kernel.diag_tr()

    fmean, fvar = base_conditional(
        Kmn,
        Kmm,
        Knn,
        f,
        full_cov=full_cov,
        q_sqrt=q_sqrt,
        white=white,
    )  # [N, R], [R, N, N] or [N, R]
    expanded_var = expand_independent_outputs(fvar, full_cov, full_output_cov)
    return fmean, expanded_var
def _generate_u(self, num_samples: int, L: tf.Tensor = None):
    """
    Return samples $u ~ q(u)$.

    :param num_samples: number of draws.
    :param L: optional Cholesky factor of ``Kuu``; only consulted when the
        model is whitened, and computed on demand if omitted.
    :return: samples transposed to [S, M, L].
    """
    # Keep only the lower triangle of the stored factor.
    chol_q = tf.linalg.band_part(self.q_sqrt, -1, 0)
    draw_shape = (self.num_latent_gps, chol_q.shape[-1], num_samples)
    eps = tf.random.normal(draw_shape, dtype=default_float())  # [L, M, S]
    samples_T = chol_q @ eps + tf.transpose(self.q_mu)[..., None]

    if self.whiten:
        if L is None:
            prior_cov = covariances.Kuu(self.inducing_variable,
                                        self.kernel,
                                        jitter=default_jitter())
            L = tf.linalg.cholesky(prior_cov)
        # Un-whiten by mapping through the prior Cholesky factor.
        samples_T = L @ samples_T

    return tf.transpose(samples_T)  # [S, M, L]
def test_fully_correlated_conditional_repeat_shapes(func, R):
    """Check that ``func`` returns mean and variance of identical shape for
    a whitened conditional without ``q_sqrt`` and without full covariances."""
    L, M, N, P = Data.L, Data.M, Data.N, Data.P
    num_latent = L * M

    Kmm = (tf.ones((num_latent, num_latent))
           + default_jitter() * tf.eye(num_latent))
    Kmn = tf.ones((num_latent, N, P))
    Knn = tf.ones((N, P))
    f = tf.ones((num_latent, R))

    m, v = func(
        Kmn,
        Kmm,
        Knn,
        f,
        full_cov=False,
        full_output_cov=False,
        q_sqrt=None,
        white=True,
    )

    assert v.shape.as_list() == m.shape.as_list()
def __call__(self, X: TensorType, sample_shape: List[int] = None,
             full_cov: bool = None) -> tf.Tensor:
    """Draw location-scale samples from the SVGP posterior at ``X``.

    :param X: inputs; all but the last axis index data points.
    :param sample_shape: leading shape of the draws; defaults to
        ``self.sample_shape``.
    :param full_cov: draw jointly correlated samples through a Cholesky
        factor of the full covariance (true) or independent marginal
        samples (false); defaults to ``self.full_cov``.
    :return: samples with a trailing axis of size one.
    """
    if full_cov is None:
        full_cov = self.full_cov
    if sample_shape is None:
        sample_shape = self.sample_shape

    # Get or compute required terms
    Z, Luu, q_mu, q_sqrt = self.cache
    rvs = tf.random.normal(shape=list(sample_shape) + list(X.shape[:-1]),
                           dtype=X.dtype)

    # Solve for $Cov(u, u)^{-1/2} Cov(u, f)$
    # [!] Fix me: gpflow.covariances.Kuf doesn't broadcast in the desired way
    Kuf = tf.linalg.adjoint(self.model.kernel(X, Z.Z))
    iLuu_Kuf = parallel_solve(tf.linalg.triangular_solve, Luu, Kuf)  # [M, N]

    # Projection onto the inducing outputs, depending on whitening
    if self.model.whiten:
        proj = iLuu_Kuf
    else:
        proj = parallel_solve(tf.linalg.triangular_solve,
                              tf.linalg.adjoint(Luu),
                              iLuu_Kuf,
                              lower=False)
    m = tf.matmul(proj, q_mu, transpose_a=True)
    A = tf.matmul(proj, q_sqrt, transpose_a=True)  # [N, M]

    if self.model.mean_function is not None:
        m += self.model.mean_function(X)

    if full_cov:
        S = self.model.kernel(X, full_cov=True) \
            + tf.matmul(A, A, transpose_b=True) \
            - tf.matmul(iLuu_Kuf, iLuu_Kuf, transpose_a=True)
        L = tf.linalg.cholesky(
            tf.linalg.set_diag(S, tf.linalg.diag_part(S) + default_jitter()))
        return m + tf.expand_dims(tf.linalg.matvec(L, rvs), -1)

    # Marginal variances are the diagonal of the full-covariance expression:
    #   diag(A A^T)                  -> sum of A^2 over its LAST axis (M)
    #   diag(iLuu_Kuf^T iLuu_Kuf)    -> sum of iLuu_Kuf^2 over axis -2 (M)
    # A is [N, M] while iLuu_Kuf is [M, N], so the two terms must be
    # reduced separately, each over its own inducing axis.
    v = self.model.kernel(X, full_cov=False) \
        + tf.reduce_sum(tf.square(A), axis=-1) \
        - tf.reduce_sum(tf.square(iLuu_Kuf), axis=-2)
    return m + tf.expand_dims(tf.sqrt(v) * rvs, axis=-1)
def test_equivalence_vgp_and_opper_archambeau():
    """Check that VGP, the Opper-Archambeau VGP and an unwhitened SVGP give
    the same likelihood and predictions when their variational parameters
    are converted into one another."""
    kernel = gpflow.kernels.Matern52()
    likelihood = gpflow.likelihoods.StudentT()

    vgp_oa_model = _create_vgpao_model(kernel, likelihood, DatumVGP.q_alpha,
                                       DatumVGP.q_lambda)

    K = kernel(DatumVGP.X) + np.eye(DatumVGP.N) * default_jitter()
    L = np.linalg.cholesky(K)
    L_inv = np.linalg.inv(L)
    K_inv = np.linalg.inv(K)

    # Posterior moments implied by the (alpha, lambda) parameterisation.
    mean = K @ DatumVGP.q_alpha
    lambda_sq_diags = np.array(
        [np.diag(lam ** 2) for lam in DatumVGP.q_lambda.T])
    prec_dnn = K_inv[None, :, :] + lambda_sq_diags
    var_dnn = np.linalg.inv(prec_dnn)

    svgp_model_unwhitened = _create_svgp_model(kernel,
                                               likelihood,
                                               mean,
                                               np.linalg.cholesky(var_dnn),
                                               whiten=False)

    # Whitened moments for the plain VGP model.
    mean_white_nd = L_inv.dot(mean)
    var_white_dnn = np.einsum('nN,dNM,mM->dnm', L_inv, var_dnn, L_inv)
    q_sqrt_nnd = np.linalg.cholesky(var_white_dnn)

    vgp_model = _create_vgp_model(kernel, likelihood, mean_white_nd,
                                  q_sqrt_nnd)

    likelihood_vgp = vgp_model.log_likelihood()
    likelihood_vgp_oa = vgp_oa_model.log_likelihood()
    likelihood_svgp_unwhitened = svgp_model_unwhitened.log_likelihood(
        DatumVGP.data)

    assert_allclose(likelihood_vgp, likelihood_vgp_oa, rtol=1e-2)
    assert_allclose(likelihood_vgp, likelihood_svgp_unwhitened, rtol=1e-2)

    vgp_oa_mu, vgp_oa_var = vgp_oa_model.predict_f(DatumVGP.Xs)
    svgp_unwhitened_mu, svgp_unwhitened_var = \
        svgp_model_unwhitened.predict_f(DatumVGP.Xs)
    vgp_mu, vgp_var = vgp_model.predict_f(DatumVGP.Xs)

    assert_allclose(vgp_oa_mu, vgp_mu)
    assert_allclose(vgp_oa_var, vgp_var, rtol=1e-4)  # jitter?
    assert_allclose(svgp_unwhitened_mu, vgp_mu)
    assert_allclose(svgp_unwhitened_var, vgp_var, rtol=1e-4)
def _test_cg_svgp(config: ConfigDense, model: SVGP,
                  Xnew: tf.Tensor) -> tf.Tensor:
    """
    Sample generation subroutine common to each unit test.

    Draws are produced in shards of at most ``config.shard_size`` until
    ``config.num_samples`` have been generated; each shard combines a joint
    prior draw with a conjugate-gradient pathwise update.

    NOTE(review): the inducing draws are ``q_sqrt @ rvs`` with no ``q_mu``
    added -- presumably the test models keep ``q_mu`` at zero; confirm.
    """
    # Prepare preconditioner for CG
    Z = model.inducing_variable
    Kff = covariances.Kuu(Z, model.kernel, jitter=0)
    rank_divisor = 2 if config.num_cond > 1 else 1
    preconditioner = get_default_preconditioner(
        Kff,
        diag=default_jitter(),
        max_rank=config.num_cond // rank_divisor)

    shards = []
    L_joint = None  # joint Cholesky factor, reused across shards
    remaining = config.num_samples
    while remaining > 0:
        size = min(config.shard_size, remaining)

        # Draw inducing values from the variational scale factor
        draw_shape = (model.num_latent_gps, config.num_cond, size)
        rvs = tf.random.normal(shape=draw_shape, dtype=floatx())
        u = tf.transpose(model.q_sqrt @ rvs)

        # Generate draws from the joint distribution $p(f(X), g(Z))$
        (f, fnew), L_joint = common.sample_joint(model.kernel,
                                                 Z,
                                                 Xnew,
                                                 num_samples=size,
                                                 L=L_joint)

        # Solve for update functions
        update_fns = cg_update(model.kernel,
                               Z,
                               u,
                               f,
                               tol=1e-6,
                               max_iter=config.num_cond,
                               preconditioner=preconditioner)

        shards.append(fnew + update_fns(Xnew))
        remaining -= size

    samples = tf.concat(shards, axis=0)
    if model.mean_function is not None:
        samples += model.mean_function(Xnew)
    return samples
def _exact_independent(kern: kernels.MultioutputKernel,
                       Z: TensorLike,
                       u: TensorLike,
                       f: TensorLike,
                       *,
                       L: TensorLike = None,
                       diag: TensorLike = None,
                       basis: AbstractBasis = None,
                       multioutput_axis: int = 0,
                       **kwargs):
    """
    Return (independent) pathwise updates for each of the latent prior
    processes $f$ subject to the condition $p(f | u) = N(f | u, diag)$ on
    $f = f(Z)$.

    :param L: optional precomputed Cholesky factor of the covariance at
        ``Z`` (including ``diag``); computed when omitted.
    :param diag: diagonal noise term; defaults to ``default_jitter()``.
    :param basis: basis used to express the update; defaults to a kernel
        basis centred at ``Z``.
    :param multioutput_axis: forwarded to ``MultioutputDenseSampler``.
    :param kwargs: forwarded to ``MultioutputDenseSampler``.
    """
    u_shape = tuple(u.shape)
    f_shape = tuple(f.shape)
    assert u_shape[-1] == kern.num_latent_gps, \
        "Num. outputs != num. latent GPs"
    assert u_shape == f_shape[-len(u_shape):], "Incompatible shapes detected"

    if basis is None:
        # finite-dimensional basis used to express the update
        basis = kernel_basis(kern, centers=Z)

    # Prepare diagonal term
    if diag is None:  # used by <GPflow.conditionals>
        diag = default_jitter()
    if isinstance(diag, float):
        diag = tf.convert_to_tensor(diag, dtype=f.dtype)
    diag = tf.expand_dims(diag, axis=-1)  # ([L] or []) + ([M] or []) + [1]

    # Residual, perturbed by noise with variance `diag`
    err = swap_axes(u - f, -3, -1)  # [L, M, S]
    err -= tf.sqrt(diag) * tf.random.normal(err.shape, dtype=err.dtype)

    if L is None:
        # Covariance at Z without jitter; `diag` is added explicitly below.
        if isinstance(Z, inducing_variables.InducingVariables):
            cov = covariances.Kuu(Z, kern, jitter=0.0)
        else:
            cov = kern(Z, full_cov=True, full_output_cov=False)
        cov = tf.linalg.set_diag(cov, tf.linalg.diag_part(cov) + diag[..., 0])
        L = tf.linalg.cholesky(cov)

    # Solve for $Cov(u, u)^{-1}(u - f(Z))$
    solved = tf.linalg.cholesky_solve(L, err)
    weights = move_axis(solved, -1, -3)  # [S, L, M]
    return MultioutputDenseSampler(basis=basis,
                                   weights=weights,
                                   multioutput_axis=multioutput_axis,
                                   **kwargs)
def main(config):
    """Compare ``subroutine`` against ``SVGP.predict_f`` for a
    ``SharedIndependent`` kernel with two outputs, once per supported base
    kernel.

    Seeds TensorFlow, sets GPflow's default float and jitter from
    ``config``, and for every base kernel builds an unwhitened model whose
    two ``q_sqrt`` factors are all-zero and the prior Cholesky factor,
    then asserts that mean and full covariance agree within
    ``config.error_tol``.
    """
    assert config is not None, ValueError
    tf.random.set_seed(config.seed)
    gpflow_config.set_default_float(config.floatx)
    gpflow_config.set_default_jitter(config.jitter)

    X = tf.random.uniform([config.num_test, config.input_dims],
                          dtype=floatx())
    Z_shape = config.num_cond, config.input_dims

    for cls in SupportedBaseKernels:
        # Lengthscale bounds scale with the square root of the input dimension.
        root_dims = config.input_dims ** 0.5
        minval = config.rel_lengthscales_min * root_dims
        maxval = config.rel_lengthscales_max * root_dims
        lenscales = tf.random.uniform(shape=[config.input_dims],
                                      minval=minval,
                                      maxval=maxval,
                                      dtype=floatx())

        base = cls(lengthscales=lenscales, variance=config.kernel_variance)
        kern = kernels.SharedIndependent(base, output_dim=2)
        Z = SharedIndependentInducingVariables(
            InducingPoints(tf.random.uniform(Z_shape, dtype=floatx())))

        # First latent GP: zero scale; second: the prior Cholesky factor.
        Kuu = covariances.Kuu(Z, kern, jitter=gpflow_config.default_jitter())
        zero_factor = tf.zeros(2 * [config.num_cond], dtype=floatx())
        q_sqrt = tf.stack([zero_factor, tf.linalg.cholesky(Kuu)])

        const = tf.random.normal([2], dtype=floatx())
        model = SVGP(kernel=kern,
                     likelihood=None,
                     inducing_variable=Z,
                     mean_function=mean_functions.Constant(c=const),
                     q_sqrt=q_sqrt,
                     whiten=False,
                     num_latent_gps=2)

        mf, Sff = subroutine(config, model, X)
        mg, Sgg = model.predict_f(X, full_cov=True)

        tol = config.error_tol
        assert allclose(mf, mg, tol, tol)
        assert allclose(Sff, Sgg, tol, tol)
def conditional(self, X, full_cov=False):
    """Compute the mean and (co)variance of the layer output at ``X``.

    With ``alpha(X) = k(Z,Z)^{-1} k(Z,X)`` (or ``L^{-1} k(Z,X)`` in the
    whitened case):

        mean = alpha(X)^T (q_mu - m(Z)) + m(X)
        var  = k(X,X) - alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)

    where ``k(Z,Z)`` is replaced by the identity when whitened.

    :param X: inputs, [N,D] or [S*N,D].
    :param full_cov: return the full [N,N] covariance instead of the
        marginal variances.
    :return: ``(mean, var)``.
    """
    Z = self.inducing_points

    # Jittered prior covariance at the inducing points and its factor.
    Kmm = self.kernel(Z)
    Kmm += default_jitter() * tf.eye(self.num_inducing, dtype=Kmm.dtype)
    Lmm = tf.linalg.cholesky(Kmm)
    Kmn = self.kernel(Z, X)  # [M,N]

    # alpha(X) = k(Z,Z)^{-1}k(Z,X) = L^{-T}L^{-1}k(Z,X)
    alpha = tf.linalg.triangular_solve(Lmm, Kmn, lower=True)
    if not self.white:
        alpha = tf.linalg.triangular_solve(tf.transpose(Lmm), alpha,
                                           lower=False)

    # m = alpha(X)^T (q_mu - m(Z))
    centred_q_mu = self.q_mu - self.mean_function(Z)
    mean = tf.matmul(alpha, centred_q_mu, transpose_a=True)  # [N,1]

    # Negated prior term, plus q_sqrt q_sqrt^T when available.
    if self.white:
        SK = -tf.eye(self.num_inducing, dtype=default_float())
    else:
        SK = -Kmm
    if self.q_sqrt is not None:
        SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

    # B = -(k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
    B = tf.matmul(SK, alpha)  # [M,N]

    if full_cov:
        delta_cov = tf.matmul(alpha, B, transpose_a=True)  # [N,N]
    else:
        # Diagonal of alpha^T B without forming the full product.
        delta_cov = tf.reduce_sum(alpha * B, 0)
    Knn = self.kernel(X, full_cov=full_cov, presliced=False)

    var = tf.transpose(Knn + delta_cov)
    return mean + self.mean_function(X), var
def log_likelihood(self):
    """
    Compute the log likelihood of the data under the GP marginal.

    The kernel matrix diagonal is augmented with the likelihood variance
    and with jitter before the Cholesky factorisation.

    :return: sum of the per-output log densities returned by
        ``multivariate_normal``.
    """
    x, y = self.data
    K = self.kernel(x)
    k_diag = tf.linalg.diag_part(K)
    s_diag = tf.convert_to_tensor(self.likelihood.variance)

    # Jitter follows the kernel matrix dtype and its runtime diagonal
    # length, rather than a hard-coded float64 and the static input shape.
    jitter = tf.fill(tf.shape(k_diag), tf.cast(default_jitter(), K.dtype))

    # stabilise K matrix w/ jitter
    ks = tf.linalg.set_diag(K, k_diag + s_diag + jitter)
    L = tf.linalg.cholesky(ks)
    m = self.mean_function(x)

    # [R,] log-likelihoods for each independent dimension of Y
    log_prob = multivariate_normal(y, m, L)
    return tf.reduce_sum(log_prob)
def _sample_joint_inducing(kern, Z, Xnew, num_samples: int,
                           L: TensorLike = None,
                           diag: Union[float, tf.Tensor] = None):
    """
    Sample from the joint distribution of $f(X), g(Z)$ via a location-scale
    transform.

    :param L: optional precomputed Cholesky factor of the joint covariance;
        when given, it is used as-is.
    :param diag: value added to the joint covariance diagonal; defaults to
        ``default_jitter()``.
    :return: ``((fz, fx), L)`` where the draws are split along axis -2
        into the inducing part and the last ``Xnew.shape[0]`` entries.
    """
    if diag is None:
        diag = default_jitter()

    has_multiple_outputs = isinstance(kern, MultioutputKernel)
    if L is None:
        # Multi-output kernels are asked for independent per-output blocks.
        extra = {"full_output_cov": False} if has_multiple_outputs else {}
        Kff = kern(Xnew, full_cov=True, **extra)
        Kuu = covariances.Kuu(Z, kern, jitter=0.0)
        Kuf = covariances.Kuf(Z, kern, Xnew)

        shared = (isinstance(kern, SharedIndependent)
                  and isinstance(Z, SharedIndependentInducingVariables))
        if shared:
            # Replicate the shared blocks once per output.
            reps = [Kff.shape[0], 1, 1]
            Kuu = tf.tile(Kuu[None], reps)
            Kuf = tf.tile(Kuf[None], reps)

        # Assemble [[Kuu, Kuf], [Kfu, Kff]] and add the diagonal term.
        top = tf.concat([Kuu, Kuf], axis=-1)
        bottom = tf.concat([tf.linalg.adjoint(Kuf), Kff], axis=-1)
        joint = tf.concat([top, bottom], axis=-2)
        joint = tf.linalg.set_diag(joint, tf.linalg.diag_part(joint) + diag)
        L = tf.linalg.cholesky(joint)

    # Draw samples using a location-scale transform
    rvs = tf.random.normal(list(L.shape[:-1]) + [num_samples], dtype=floatx())
    draws = L @ rvs  # [L, M + N, S] or [M + N, S]
    if not has_multiple_outputs:
        draws = tf.expand_dims(draws, 0)

    parts = tf.split(tf.transpose(draws), [-1, Xnew.shape[0]], axis=-2)
    return parts, L