class SVGP_Layer(Layer):
    def __init__(self, layer_id, kern, U, Z, num_outputs, mean_function,
                 white=False, **kwargs):
        """
        A sparse variational GP layer in whitened representation. This layer
        holds the kernel, the variational parameters, the inducing points and
        the mean function.

        The underlying model at inputs X is

            f = L v + mean_function(X),  where v ~ N(0, I) and L L^T = kern.K(X).

        The variational distribution over the inducing points is

            q(v) = N(q_mu, q_sqrt q_sqrt^T).

        The layer holds D_out independent GPs with the same kernel and
        inducing points.

        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: The number of GP outputs (q_mu has shape (M, num_outputs))
        :param mean_function: The mean function
        :return:
        """
        Layer.__init__(self, layer_id, U, num_outputs, **kwargs)

        # Initialise the inducing inputs; fall back to a small random
        # initialisation if no Z is given.
        self.dim_in = U[0].shape[1] if layer_id == 0 else num_outputs
        self.Z = Z if Z is not None else np.random.normal(
            0, 0.01, (100, self.dim_in))
        self.num_inducing = self.Z.shape[0]

        q_mu = np.zeros((self.num_inducing, num_outputs))
        self.q_mu = Parameter(q_mu)

        q_sqrt = np.tile(
            np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
        transform = transforms.LowerTriangular(self.num_inducing,
                                               num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.feature = InducingPoints(self.Z)
        self.kern = kern
        self.mean_function = mean_function

        self.num_outputs = num_outputs
        self.white = white

        if not self.white:  # initialise q_sqrt to the prior
            Ku = self.kern.compute_K_symm(self.Z)
            Lu = np.linalg.cholesky(
                Ku + np.eye(self.Z.shape[0]) * settings.jitter)
            self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1])

        self.needs_build_cholesky = True

    @params_as_tensors
    def build_cholesky_if_needed(self):
        # make sure we only compute this once
        if self.needs_build_cholesky:
            self.Ku = self.feature.Kuu(self.kern, jitter=settings.jitter)
            self.Lu = tf.cholesky(self.Ku)
            self.Ku_tiled = tf.tile(self.Ku[None, :, :],
                                    [self.num_outputs, 1, 1])
            self.Lu_tiled = tf.tile(self.Lu[None, :, :],
                                    [self.num_outputs, 1, 1])
            self.needs_build_cholesky = False

    def conditional_ND(self, X, full_cov=False):
        # Equivalent to gpflow's
        #   conditional(X, self.feature.Z, self.kern, self.q_mu,
        #               q_sqrt=self.q_sqrt, full_cov=full_cov, white=self.white)
        # but reuses the cached Cholesky factors.
        self.build_cholesky_if_needed()

        Kuf = self.feature.Kuf(self.kern, X)

        A = tf.matrix_triangular_solve(self.Lu, Kuf, lower=True)
        if not self.white:
            A = tf.matrix_triangular_solve(tf.transpose(self.Lu), A,
                                           lower=False)

        mean = tf.matmul(A, self.q_mu, transpose_a=True)

        A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
        I = tf.eye(self.num_inducing, dtype=settings.float_type)[None, :, :]

        if self.white:
            SK = -I
        else:
            SK = -self.Ku_tiled

        if self.q_sqrt is not None:
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

        B = tf.matmul(SK, A_tiled)

        if full_cov:
            # (num_latent, num_X, num_X)
            delta_cov = tf.matmul(A_tiled, B, transpose_a=True)
            Kff = self.kern.K(X)
        else:
            # (num_latent, num_X)
            delta_cov = tf.reduce_sum(A_tiled * B, 1)
            Kff = self.kern.Kdiag(X)

        # either (1, num_X) + (num_latent, num_X)
        # or (1, num_X, num_X) + (num_latent, num_X, num_X)
        var = tf.expand_dims(Kff, 0) + delta_cov
        var = tf.transpose(var)

        return mean + self.mean_function(X), var

    def KL(self):
        """
        The KL divergence from the variational distribution to the prior.

        :return: KL divergence from N(q_mu, q_sqrt q_sqrt^T) to N(0, I) if
            whitened, and to N(0, K(Z, Z)) otherwise, independently for each GP
        """
        # Equivalent to gauss_kl(self.q_mu, self.q_sqrt) in the whitened case
        # and gauss_kl(self.q_mu, self.q_sqrt, self.Ku) otherwise.
        self.build_cholesky_if_needed()

        KL = -0.5 * self.num_outputs * self.num_inducing
        KL -= 0.5 * tf.reduce_sum(
            tf.log(tf.matrix_diag_part(self.q_sqrt) ** 2))

        if not self.white:
            KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(self.Lu))) \
                  * self.num_outputs
            KL += 0.5 * tf.reduce_sum(
                tf.square(
                    tf.matrix_triangular_solve(self.Lu_tiled, self.q_sqrt,
                                               lower=True)))
            Kinv_m = tf.cholesky_solve(self.Lu, self.q_mu)
            KL += 0.5 * tf.reduce_sum(self.q_mu * Kinv_m)
        else:
            KL += 0.5 * tf.reduce_sum(tf.square(self.q_sqrt))
            KL += 0.5 * tf.reduce_sum(self.q_mu ** 2)
        return KL
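# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): one way
# the first SVGP_Layer variant above (the one taking layer_id and U) might be
# constructed with an RBF kernel and k-means-initialised inducing inputs,
# assuming that variant is imported on its own. The helper name
# `_example_build_layer` is hypothetical; `RBF`, `Zero` and `kmeans2` are the
# standard gpflow 1.x / scipy utilities, and any further arguments required by
# the `Layer` base class would be passed through **kwargs.
def _example_build_layer(X_train, num_inducing=100):
    from scipy.cluster.vq import kmeans2
    from gpflow.kernels import RBF
    from gpflow.mean_functions import Zero

    # k-means centres of the training inputs serve as inducing inputs (M, D_in)
    Z_init = kmeans2(X_train, num_inducing, minit='points')[0]

    layer = SVGP_Layer(layer_id=0,
                       kern=RBF(X_train.shape[1]),
                       U=[X_train],                  # used to infer dim_in at layer 0
                       Z=Z_init,
                       num_outputs=X_train.shape[1],
                       mean_function=Zero(),
                       white=True)
    # layer.conditional_ND(X) gives the layer's predictive mean and variance,
    # and layer.KL() the KL term that enters the ELBO.
    return layer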
class SVGP_Layer(Layer):
    def __init__(self, kern, Z, num_outputs, mean_function, white=False,
                 input_prop_dim=None, **kwargs):
        """
        A sparse variational GP layer in whitened representation. This layer
        holds the kernel, the variational parameters, the inducing points and
        the mean function.

        The underlying model at inputs X is

            f = L v + mean_function(X),  where v ~ N(0, I) and L L^T = kern.K(X).

        The variational distribution over the inducing points is

            q(v) = N(q_mu, q_sqrt q_sqrt^T).

        The layer holds D_out independent GPs with the same kernel and
        inducing points.

        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: The number of GP outputs (q_mu has shape (M, num_outputs))
        :param mean_function: The mean function
        :return:
        """
        Layer.__init__(self, input_prop_dim, **kwargs)

        self.num_inducing = Z.shape[0]

        q_mu = np.zeros((self.num_inducing, num_outputs))
        self.q_mu = Parameter(q_mu)

        q_sqrt = np.tile(
            np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
        transform = transforms.LowerTriangular(self.num_inducing,
                                               num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.feature = InducingPoints(Z)
        self.kern = kern
        self.mean_function = mean_function

        self.num_outputs = num_outputs
        self.white = white

        # Initialise q_sqrt to the prior (done unconditionally here; an
        # earlier version guarded this with `if not self.white`).
        Ku = self.kern.compute_K_symm(Z)
        Lu = np.linalg.cholesky(Ku + np.eye(Z.shape[0]) * settings.jitter)
        self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1])

        self.needs_build_cholesky = True

    @params_as_tensors
    def build_cholesky_if_needed(self):
        # make sure we only compute this once
        if self.needs_build_cholesky:
            self.Ku = self.feature.Kuu(self.kern, jitter=settings.jitter)
            self.Lu = tf.cholesky(self.Ku)
            self.Ku_tiled = tf.tile(self.Ku[None, :, :],
                                    [self.num_outputs, 1, 1])
            self.Lu_tiled = tf.tile(self.Lu[None, :, :],
                                    [self.num_outputs, 1, 1])

            # also compute K^{-1}; its log-determinant comes from self.Lu
            if not self.white:
                inp_ = (self.Ku
                        + tf.eye(self.num_inducing, dtype=tf.float64)
                        * settings.jitter * 10)
                self.K_inv = tf.linalg.inv(tf.cast(inp_, dtype=tf.float64))

            self.needs_build_cholesky = False

    def conditional_ND(self, X, full_cov=False):
        # Equivalent to gpflow's
        #   conditional(X, self.feature.Z, self.kern, self.q_mu,
        #               q_sqrt=self.q_sqrt, full_cov=full_cov, white=self.white)
        # but reuses the cached Cholesky factors.
        self.build_cholesky_if_needed()

        Kuf = self.feature.Kuf(self.kern, X)

        A = tf.matrix_triangular_solve(self.Lu, Kuf, lower=True)
        if not self.white:
            A = tf.matrix_triangular_solve(tf.transpose(self.Lu), A,
                                           lower=False)

        mean = tf.matmul(A, self.q_mu, transpose_a=True)

        A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
        I = tf.eye(self.num_inducing, dtype=settings.float_type)[None, :, :]

        if self.white:
            SK = -I
        else:
            SK = -self.Ku_tiled

        if self.q_sqrt is not None:
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

        B = tf.matmul(SK, A_tiled)

        if full_cov:
            # (num_latent, num_X, num_X)
            delta_cov = tf.matmul(A_tiled, B, transpose_a=True)
            Kff = self.kern.K(X)
        else:
            # (num_latent, num_X)
            delta_cov = tf.reduce_sum(A_tiled * B, 1)
            Kff = self.kern.Kdiag(X)

        # either (1, num_X) + (num_latent, num_X)
        # or (1, num_X, num_X) + (num_latent, num_X, num_X)
        var = tf.expand_dims(Kff, 0) + delta_cov
        var = tf.transpose(var)

        return mean + self.mean_function(X), var

    def KL(self):
        """
        The KL divergence from the variational distribution to the prior,
        OR the alpha-Renyi divergence from the variational distribution to
        the prior. The notation in the paper is KL[q(u) || p(u)].

        :return: KL (or alpha-Renyi) divergence from N(q_mu, q_sqrt q_sqrt^T)
            to N(0, I) if whitened, and to N(0, K(Z, Z)) otherwise,
            independently for each GP
        """
        self.build_cholesky_if_needed()

        if self.alpha is None:
            # KL regularizer
            KL = -0.5 * self.num_outputs * self.num_inducing
            KL -= 0.5 * tf.reduce_sum(
                tf.log(tf.matrix_diag_part(self.q_sqrt) ** 2))

            if not self.white:
                # Whitening is relative to the prior. Here the prior is NOT
                # whitened, i.e. it is N(0, K(Z, Z)).
                KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(self.Lu))) \
                      * self.num_outputs
                KL += 0.5 * tf.reduce_sum(
                    tf.square(
                        tf.matrix_triangular_solve(self.Lu_tiled, self.q_sqrt,
                                                   lower=True)))
                Kinv_m = tf.cholesky_solve(self.Lu, self.q_mu)
                KL += 0.5 * tf.reduce_sum(self.q_mu * Kinv_m)
            else:
                KL += 0.5 * tf.reduce_sum(tf.square(self.q_sqrt))
                KL += 0.5 * tf.reduce_sum(self.q_mu ** 2)
            return self.weight * KL
        else:
            # Alpha-Renyi (AR) regularizer. For Gaussians this is built from
            # the log-normalizers of the exponential family:
            #
            #     log Z[alpha * eta_q + (1 - alpha) * eta_0]
            #         - alpha * log Z[eta_q] - (1 - alpha) * log Z[eta_0],
            #
            # where eta_q are the natural parameters of q(v) and eta_0 those
            # of the prior.
            #
            # NOTES:
            #   * The 2*pi factors cancel in this combination.
            #   * q_sqrt is such that q_sqrt q_sqrt^T is the variational
            #     covariance, i.e. q(v) = N(q_mu, q_sqrt q_sqrt^T).
            #   * self.Lu is the Cholesky factor of self.Ku, the kernel matrix
            #     evaluated at the inducing points Z (self.feature).
            #   * We need the alpha-Renyi divergence between the prior and the
            #     variational posterior for EACH of the GPs in this layer.
            #
            # Shapes (e.g. 100 inducing points, 13 outputs):
            #   q_sqrt:                           13 x 100 x 100
            #   q_mu:                             100 x 13
            #   tf.matrix_diag_part(self.q_sqrt): 13 x 100
            #   Ku, Lu:                           100 x 100
            #   num_inducing:                     100
            #   num_outputs:                      13
            alpha = self.alpha

            # INEFFICIENT: this could be done more cheaply with Cholesky solves.
            inp_ = (tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)
                    + tf.eye(self.num_inducing, dtype=tf.float64)
                    * settings.jitter * 100)
            q_inv = tf.linalg.inv(tf.cast(inp_, dtype=tf.float64))

            # q_mu as a (num_outputs, num_inducing, 1) stack of column vectors
            q_mu_col = tf.expand_dims(tf.transpose(self.q_mu), -1)

            # gives Sigma_q^{-1} * mu_q for each output
            q_var_x_q_mu = tf.matmul(q_inv, q_mu_col)

            # the two log-normalizer terms of the variational posteriors
            q_component_1 = 0.5 * tf.reduce_sum(
                tf.log(tf.matrix_diag_part(self.q_sqrt) ** 2))
            q_component_2 = 0.5 * tf.reduce_sum(q_var_x_q_mu * q_mu_col)
            logZq = q_component_1 + q_component_2

            if not self.white:
                # prior built from self.Lu, still with a zero mean function
                logZpi = 0.5 * tf.reduce_sum(
                    tf.log(tf.matrix_diag_part(self.Lu) ** 2)) * self.num_outputs
                new_Sigma_inv = (alpha * q_inv + (1.0 - alpha) * self.K_inv
                                 + tf.eye(self.num_inducing, dtype=tf.float64)
                                 * settings.jitter)
            else:
                # 0.5 * num_outputs * num_inducing * log(1) = 0
                logZpi = 0.0
                new_Sigma_inv = (alpha * q_inv
                                 + (1.0 - alpha + settings.jitter)
                                 * tf.eye(self.num_inducing, dtype=tf.float64))

            # Inverse covariance of the exponential-family member with natural
            # parameters alpha * eta_q + (1 - alpha) * eta_0, and its
            # log-determinant via the Cholesky factor.
            new_Sigma_inv_chol = tf.cholesky(tf.cast(new_Sigma_inv, tf.float64))
            log_det = -tf.reduce_sum(
                tf.log(tf.matrix_diag_part(new_Sigma_inv_chol) ** 2))

            # Compute mu_new by solving
            #   Sigma_new^{-1} * mu_new = alpha * Sigma_q^{-1} * mu_q
            # (the prior mean is zero, so it contributes nothing here).
            mu_new = tf.linalg.solve(
                tf.cast(new_Sigma_inv, dtype=tf.float64),
                tf.cast(alpha * q_var_x_q_mu, dtype=tf.float64))

            # Since Sigma_new^{-1} * mu_new = alpha * Sigma_q^{-1} * mu_q,
            #   mu_new^T Sigma_new^{-1} mu_new = mu_new^T (alpha * Sigma_q^{-1} mu_q).
            mu_new_x_new_Sigma_inv = tf.reduce_sum(alpha * q_var_x_q_mu * mu_new)

            # Using log|Sigma| = -log|Sigma^{-1}|, assemble the log-normalizer
            # of the new exponential-family member.
            logZnew = 0.5 * mu_new_x_new_Sigma_inv + 0.5 * log_det

            # AR divergence between the Gaussians:
            #   (1 / (alpha * (1 - alpha)))
            #       * (logZnew - alpha * logZq - (1 - alpha) * logZpi)
            AR = (1.0 / (alpha * (1.0 - alpha))) * (
                logZnew - alpha * logZq - (1.0 - alpha) * logZpi)
            return self.weight * AR
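# ---------------------------------------------------------------------------
# Numerical sanity check (illustrative only, not part of the original module):
# the AR regularizer above rests on the exponential-family identity
#     log \int q(x)^alpha p(x)^(1-alpha) dx
#         = log Z[alpha*eta_q + (1-alpha)*eta_0]
#           - alpha*log Z[eta_q] - (1-alpha)*log Z[eta_0],
# which can be verified by quadrature for one-dimensional Gaussians
# q = N(m_q, s_q^2) and p = N(0, s_p^2). The helper name
# `_check_renyi_log_normalizer` and its default arguments are hypothetical.
def _check_renyi_log_normalizer(m_q=0.7, s_q=0.8, s_p=1.5, alpha=0.5):
    import numpy as np
    from scipy.integrate import quad

    # log-normalizer of N(mu, var) in natural parameterisation; the 2*pi
    # terms cancel in the three-term combination below, so they are omitted
    def logZ(mu, var):
        return 0.5 * mu ** 2 / var + 0.5 * np.log(var)

    # convex combination of natural parameters (Sigma^-1 mu, -0.5 Sigma^-1)
    prec_new = alpha / s_q ** 2 + (1.0 - alpha) / s_p ** 2
    mean_new = (alpha * m_q / s_q ** 2) / prec_new
    via_logZ = (logZ(mean_new, 1.0 / prec_new)
                - alpha * logZ(m_q, s_q ** 2)
                - (1.0 - alpha) * logZ(0.0, s_p ** 2))

    # direct quadrature of log \int q^alpha p^(1-alpha) dx
    def q(x):
        return np.exp(-0.5 * (x - m_q) ** 2 / s_q ** 2) / np.sqrt(2 * np.pi * s_q ** 2)

    def p(x):
        return np.exp(-0.5 * x ** 2 / s_p ** 2) / np.sqrt(2 * np.pi * s_p ** 2)

    via_quad = np.log(quad(lambda x: q(x) ** alpha * p(x) ** (1.0 - alpha), -20, 20)[0])

    assert np.allclose(via_logZ, via_quad, atol=1e-6)
    return via_logZ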