def test_mixed_mok_with_Id_vs_independent_mok(session_tf):
    data = DataMixedKernelWithEye

    # Independent model
    k1 = mk.SharedIndependentMok(RBF(data.D, variance=0.5, lengthscales=1.2), data.L)
    f1 = InducingPoints(data.X[:data.M, ...].copy())
    m1 = SVGP(data.X, data.Y, k1, Gaussian(), f1,
              q_mu=data.mu_data_full, q_sqrt=data.sqrt_data_full)
    m1.set_trainable(False)
    m1.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m1, maxiter=data.MAXITER)

    # Mixed Model
    kern_list = [RBF(data.D, variance=0.5, lengthscales=1.2) for _ in range(data.L)]
    k2 = mk.SeparateMixedMok(kern_list, data.W)
    f2 = InducingPoints(data.X[:data.M, ...].copy())
    m2 = SVGP(data.X, data.Y, k2, Gaussian(), f2,
              q_mu=data.mu_data_full, q_sqrt=data.sqrt_data_full)
    m2.set_trainable(False)
    m2.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m2, maxiter=data.MAXITER)

    check_equality_predictions(session_tf, [m1, m2])
def test_compare_mixed_kernel(session_tf):
    data = DataMixedKernel

    kern_list = [RBF(data.D) for _ in range(data.L)]
    k1 = mk.SeparateMixedMok(kern_list, W=data.W)
    f1 = mf.SharedIndependentMof(InducingPoints(data.X[:data.M, ...].copy()))
    m1 = SVGP(data.X, data.Y, k1, Gaussian(), feat=f1, q_mu=data.mu_data, q_sqrt=data.sqrt_data)

    kern_list = [RBF(data.D) for _ in range(data.L)]
    k2 = mk.SeparateMixedMok(kern_list, W=data.W)
    f2 = mf.MixedKernelSharedMof(InducingPoints(data.X[:data.M, ...].copy()))
    m2 = SVGP(data.X, data.Y, k2, Gaussian(), feat=f2, q_mu=data.mu_data, q_sqrt=data.sqrt_data)

    check_equality_predictions(session_tf, [m1, m2])
def test_multioutput_with_diag_q_sqrt(session_tf):
    data = DataMixedKernel

    q_sqrt_diag = np.ones((data.M, data.L)) * 2
    q_sqrt = np.repeat(np.eye(data.M)[None, ...], data.L, axis=0) * 2  # L x M x M

    kern_list = [RBF(data.D) for _ in range(data.L)]
    k1 = mk.SeparateMixedMok(kern_list, W=data.W)
    f1 = mf.SharedIndependentMof(InducingPoints(data.X[:data.M, ...].copy()))
    m1 = SVGP(data.X, data.Y, k1, Gaussian(), feat=f1,
              q_mu=data.mu_data, q_sqrt=q_sqrt_diag, q_diag=True)

    kern_list = [RBF(data.D) for _ in range(data.L)]
    k2 = mk.SeparateMixedMok(kern_list, W=data.W)
    f2 = mf.SharedIndependentMof(InducingPoints(data.X[:data.M, ...].copy()))
    m2 = SVGP(data.X, data.Y, k2, Gaussian(), feat=f2,
              q_mu=data.mu_data, q_sqrt=q_sqrt, q_diag=False)

    check_equality_predictions(session_tf, [m1, m2])
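# The tests above (and several further down) rely on a `check_equality_predictions`
# helper defined elsewhere in the test module.  The sketch below is a minimal
# stand-in, assuming GPflow 1.x models with `compute_log_likelihood` and `predict_f`;
# it is illustrative only, not the original helper (`session` is kept only to mirror
# the real helper's signature).
import numpy as np

def check_equality_predictions_sketch(session, models, Xs, decimal=5):
    # all models should reach (approximately) the same log likelihood ...
    log_liks = [m.compute_log_likelihood() for m in models]
    np.testing.assert_array_almost_equal(log_liks, [log_liks[0]] * len(models), decimal=decimal)

    # ... and produce (approximately) the same predictive mean and variance at Xs
    reference_mean, reference_var = models[0].predict_f(Xs)
    for m in models[1:]:
        mean, var = m.predict_f(Xs)
        np.testing.assert_array_almost_equal(mean, reference_mean, decimal=decimal)
        np.testing.assert_array_almost_equal(var, reference_var, decimal=decimal)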
def __init__(self, layer_id, kern, U, Z, num_outputs, mean_function, white=False, **kwargs): """ A sparse variational GP layer in whitened representation. This layer holds the kernel, variational parameters, inducing points and mean function. The underlying model at inputs X is f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X) The variational distribution over the inducing points is q(v) = N(q_mu, q_sqrt q_sqrt^T) The layer holds D_out independent GPs with the same kernel and inducing points. :param kern: The kernel for the layer (input_dim = D_in) :param Z: Inducing points (M, D_in) :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs)) :param mean_function: The mean function :return: """ Layer.__init__(self, layer_id, U, num_outputs, **kwargs) #Initialize using kmeans self.dim_in = U[0].shape[1] if layer_id == 0 else num_outputs self.Z = Z if Z is not None else np.random.normal( 0, 0.01, (100, self.dim_in)) self.num_inducing = self.Z.shape[0] q_mu = np.zeros((self.num_inducing, num_outputs)) self.q_mu = Parameter(q_mu) q_sqrt = np.tile( np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1]) transform = transforms.LowerTriangular(self.num_inducing, num_matrices=num_outputs) self.q_sqrt = Parameter(q_sqrt, transform=transform) self.feature = InducingPoints(self.Z) self.kern = kern self.mean_function = mean_function self.num_outputs = num_outputs self.white = white if not self.white: # initialize to prior Ku = self.kern.compute_K_symm(self.Z) Lu = np.linalg.cholesky(Ku + np.eye(self.Z.shape[0]) * settings.jitter) self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1]) self.needs_build_cholesky = True
def test_separate_independent_mok(session_tf): """ We use different independent kernels for each of the output dimensions. We can achieve this in two ways: 1) efficient: SeparateIndependentMok with Shared/SeparateIndependentMof 2) inefficient: SeparateIndependentMok with InducingPoints However, both methods should return the same conditional, and after optimization return the same log likelihood. """ # Model 1 (INefficient) q_mu_1 = np.random.randn(Data.M * Data.P, 1) q_sqrt_1 = np.tril(np.random.randn(Data.M * Data.P, Data.M * Data.P))[None, ...] # 1 x MP x MP kern_list_1 = [ RBF(Data.D, variance=0.5, lengthscales=1.2) for _ in range(Data.P) ] kernel_1 = mk.SeparateIndependentMok(kern_list_1) feature_1 = InducingPoints(Data.X[:Data.M, ...].copy()) m1 = SVGP(Data.X, Data.Y, kernel_1, Gaussian(), feature_1, q_mu=q_mu_1, q_sqrt=q_sqrt_1) m1.set_trainable(False) m1.q_sqrt.set_trainable(True) m1.q_mu.set_trainable(True) gpflow.training.ScipyOptimizer().minimize(m1, maxiter=Data.MAXITER) # Model 2 (efficient) q_mu_2 = np.random.randn(Data.M, Data.P) q_sqrt_2 = np.array([ np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P) ]) # P x M x M kern_list_2 = [ RBF(Data.D, variance=0.5, lengthscales=1.2) for _ in range(Data.P) ] kernel_2 = mk.SeparateIndependentMok(kern_list_2) feature_2 = mf.SharedIndependentMof( InducingPoints(Data.X[:Data.M, ...].copy())) m2 = SVGP(Data.X, Data.Y, kernel_2, Gaussian(), feature_2, q_mu=q_mu_2, q_sqrt=q_sqrt_2) m2.set_trainable(False) m2.q_sqrt.set_trainable(True) m2.q_mu.set_trainable(True) gpflow.training.ScipyOptimizer().minimize(m2, maxiter=Data.MAXITER) check_equality_predictions(session_tf, [m1, m2])
def __init__(self, kern, Z, num_outputs, mean_function):
    """
    A sparse variational GP layer in whitened representation. This layer holds the
    kernel, variational parameters, inducing points and mean function.

    The underlying model at inputs X is
    f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X)

    The variational distribution over the inducing points is
    q(v) = N(q_mu, q_sqrt q_sqrt^T)

    The layer holds D_out independent GPs with the same kernel and inducing points.
    q_mu is initialized to zeros of shape (M, D_out) and q_sqrt to identity matrices
    of shape (D_out, M, M).

    :param kern: The kernel for the layer (input_dim = D_in)
    :param Z: Inducing points (M, D_in)
    :param num_outputs: The number of GP outputs D_out
    :param mean_function: The mean function
    :return:
    """
    Parameterized.__init__(self)
    M = Z.shape[0]

    q_mu = np.zeros((M, num_outputs))
    self.q_mu = Parameter(q_mu)

    q_sqrt = np.tile(np.eye(M)[None, :, :], [num_outputs, 1, 1])
    transform = transforms.LowerTriangular(M, num_matrices=num_outputs)
    self.q_sqrt = Parameter(q_sqrt, transform=transform)

    self.feature = InducingPoints(Z)
    self.kern = kern
    self.mean_function = mean_function
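# A small NumPy sketch (illustrative only, not part of the layer) of the whitened
# conditional that the docstring above describes: with u = L v, q(v) = N(q_mu, S),
# S = q_sqrt q_sqrt^T and L L^T = Kuu, the marginal predictive at test points is
#     mean = A^T q_mu,                        A = L^{-1} Kuf
#     var  = diag(Kff) - sum(A*A, 0) + diag(A^T S A)   per output, ignoring the mean function.
# All names below are assumptions for the sketch.
import numpy as np
from scipy.linalg import solve_triangular

def whitened_conditional(Kuu, Kuf, Kff_diag, q_mu, q_sqrt, jitter=1e-6):
    """Kuu: (M, M), Kuf: (M, N), Kff_diag: (N,), q_mu: (M, P), q_sqrt: (P, M, M)."""
    M = Kuu.shape[0]
    L = np.linalg.cholesky(Kuu + jitter * np.eye(M))
    A = solve_triangular(L, Kuf, lower=True)               # M x N
    mean = A.T @ q_mu                                      # N x P
    var = []
    for p in range(q_sqrt.shape[0]):
        SA = q_sqrt[p].T @ A                               # since S = q_sqrt q_sqrt^T
        var.append(Kff_diag - np.sum(A * A, axis=0) + np.sum(SA * SA, axis=0))
    return mean, np.stack(var, axis=-1)                    # (N, P) each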
def test_sample_conditional_mixedkernel(session_tf): q_mu = np.random.randn(Data.M, Data.L) # M x L q_sqrt = np.array([ np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.L) ]) # L x M x M Z = Data.X[:Data.M, ...] # M x D N = int(10e5) Xs = np.ones((N, Data.D), dtype=float_type) values = {"Xnew": Xs, "q_mu": q_mu, "q_sqrt": q_sqrt} placeholders = _create_placeholder_dict(values) feed_dict = _create_feed_dict(placeholders, values) # Path 1: mixed kernel: most efficient route W = np.random.randn(Data.P, Data.L) mixed_kernel = mk.SeparateMixedMok([RBF(Data.D) for _ in range(Data.L)], W) mixed_feature = mf.MixedKernelSharedMof(InducingPoints(Z.copy())) sample = sample_conditional(placeholders["Xnew"], mixed_feature, mixed_kernel, placeholders["q_mu"], q_sqrt=placeholders["q_sqrt"], white=True) value = session_tf.run(sample, feed_dict=feed_dict) # Path 2: independent kernels, mixed later separate_kernel = mk.SeparateIndependentMok( [RBF(Data.D) for _ in range(Data.L)]) shared_feature = mf.SharedIndependentMof(InducingPoints(Z.copy())) sample2 = sample_conditional(placeholders["Xnew"], shared_feature, separate_kernel, placeholders["q_mu"], q_sqrt=placeholders["q_sqrt"], white=True) value2 = session_tf.run(sample2, feed_dict=feed_dict) value2 = np.matmul(value2, W.T) # check if mean and covariance of samples are similar np.testing.assert_array_almost_equal(np.mean(value, axis=0), np.mean(value2, axis=0), decimal=1) np.testing.assert_array_almost_equal(np.cov(value, rowvar=False), np.cov(value2, rowvar=False), decimal=1)
def _build_model(self, Y_var, freqs, X, Y, kern_params=None, Z=None, q_mu = None, q_sqrt = None, M=None, P=None, L=None, W=None, num_data=None, jitter=1e-6, tec_scale=None, W_trainable=False, use_mc=False, **kwargs): """ Build the model from the data. X,Y: tensors the X and Y of data Returns: gpflow.models.Model """ settings.numerics.jitter = jitter with gp.defer_build(): # Define the likelihood likelihood = ComplexHarmonicPhaseOnlyGaussianEncodedHetero(tec_scale=tec_scale) # likelihood.variance = 0.3**2#(5.*np.pi/180.)**2 # likelihood_var = log_normal_solve((5.*np.pi/180.)**2, 0.5*(5.*np.pi/180.)**2) # likelihood.variance.prior = LogNormal(likelihood_var[0],likelihood_var[1]**2) # likelihood.variance.transform = gp.transforms.positiveRescale(np.exp(likelihood_var[0])) likelihood.variance.trainable = False q_mu = q_mu/tec_scale #M, L q_sqrt = q_sqrt/tec_scale# L, M, M kern = mk.SeparateMixedMok([self._build_kernel(None, None, None, #kern_params[l].w, kern_params[l].mu, kern_params[l].v, kern_var = np.var(q_mu[:,l]), **kwargs.get("priors",{})) for l in range(L)], W) kern.W.trainable = W_trainable kern.W.prior = gp.priors.Gaussian(W, 0.01**2) feature = mf.MixedKernelSeparateMof([InducingPoints(Z) for _ in range(L)]) mean = Zero() model = HeteroscedasticPhaseOnlySVGP(Y_var, freqs, X, Y, kern, likelihood, feat = feature, mean_function=mean, minibatch_size=None, num_latent = P, num_data = num_data, whiten = False, q_mu = None, q_sqrt = None, q_diag = True) for feat in feature.feat_list: feat.Z.trainable = True #True model.q_mu.trainable = True model.q_mu.prior = gp.priors.Gaussian(0., 0.05**2) model.q_sqrt.trainable = True # model.q_sqrt.prior = gp.priors.Gaussian(0., (0.005/tec_scale)**2) model.compile() tf.summary.image('W',kern.W.constrained_tensor[None,:,:,None]) tf.summary.image('q_mu',model.q_mu.constrained_tensor[None,:,:,None]) # tf.summary.image('q_sqrt',model.q_sqrt.constrained_tensor[:,:,:,None]) return model
def test_sample_conditional(session_tf, whiten, full_cov, full_output_cov): q_mu = np.random.randn(Data.M, Data.P) # M x P q_sqrt = np.array([ np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P) ]) # P x M x M Z = Data.X[:Data.M, ...] # M x D Xs = np.ones((Data.N, Data.D), dtype=float_type) feature = InducingPoints(Z.copy()) kernel = RBF(Data.D) values = {"Z": Z, "Xnew": Xs, "q_mu": q_mu, "q_sqrt": q_sqrt} placeholders = _create_placeholder_dict(values) feed_dict = _create_feed_dict(placeholders, values) # Path 1 sample_f = sample_conditional(placeholders["Xnew"], feature, kernel, placeholders["q_mu"], q_sqrt=placeholders["q_sqrt"], white=whiten, full_cov=full_cov, full_output_cov=full_output_cov, num_samples=int(1e5)) value_f, mean_f, var_f = session_tf.run(sample_f, feed_dict=feed_dict) value_f = value_f.reshape((-1, ) + value_f.shape[2:]) # Path 2 if full_output_cov: pytest.skip( "sample_conditional with X instead of feature does not support full_output_cov" ) sample_x = sample_conditional(placeholders["Xnew"], placeholders["Z"], kernel, placeholders["q_mu"], q_sqrt=placeholders["q_sqrt"], white=whiten, full_cov=full_cov, full_output_cov=full_output_cov, num_samples=int(1e5)) value_x, mean_x, var_x = session_tf.run(sample_x, feed_dict=feed_dict) value_x = value_x.reshape((-1, ) + value_x.shape[2:]) # check if mean and covariance of samples are similar np.testing.assert_array_almost_equal(np.mean(value_x, axis=0), np.mean(value_f, axis=0), decimal=1) np.testing.assert_array_almost_equal(np.cov(value_x, rowvar=False), np.cov(value_f, rowvar=False), decimal=1) np.testing.assert_allclose(mean_x, mean_f) np.testing.assert_allclose(var_x, var_f)
def __init__(self, kern, num_outputs, mean_function, Z=None, feature=None, white=False, input_prop_dim=None, q_mu=None, q_sqrt=None, **kwargs): """ A sparse variational GP layer in whitened representation. This layer holds the kernel, variational parameters, inducing points and mean function. The underlying model at inputs X is f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X) The variational distribution over the inducing points is q(v) = N(q_mu, q_sqrt q_sqrt^T) The layer holds D_out independent GPs with the same kernel and inducing points. :param kern: The kernel for the layer (input_dim = D_in) :param Z: Inducing points (M, D_in) :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs)) :param mean_function: The mean function :return: """ Layer.__init__(self, input_prop_dim, **kwargs) if feature is None: feature = InducingPoints(Z) self.num_inducing = len(feature) self.feature = feature self.kern = kern self.mean_function = mean_function self.num_outputs = num_outputs self.white = white if q_mu is None: q_mu = np.zeros((self.num_inducing, num_outputs), dtype=settings.float_type) self.q_mu = Parameter(q_mu) if q_sqrt is None: if not self.white: # initialize to prior with gpflow.params_as_tensors_for(feature): Ku = conditionals.Kuu(feature, self.kern, jitter=settings.jitter) Lu = tf.linalg.cholesky(Ku) Lu = self.enquire_session().run(Lu) q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1]) else: q_sqrt = np.tile(np.eye(self.num_inducing, dtype=settings.float_type)[None, :, :], [num_outputs, 1, 1]) transform = transforms.LowerTriangular(self.num_inducing, num_matrices=num_outputs) self.q_sqrt = Parameter(q_sqrt, transform=transform) self.needs_build_cholesky = True
def _build_model(self, Y_var, X, Y, Z=None, q_mu = None, q_sqrt = None, M=None, P=None, L=None, W=None, num_data=None, jitter=1e-6, tec_scale=None, W_diag=False, **kwargs): """ Build the model from the data. X,Y: tensors the X and Y of data Returns: gpflow.models.Model """ settings.numerics.jitter = jitter with gp.defer_build(): # Define the likelihood likelihood = GaussianTecHetero(tec_scale=tec_scale) q_mu = q_mu/tec_scale #M, L q_sqrt = q_sqrt/tec_scale# L, M, M kern = mk.SeparateMixedMok([self._build_kernel(kern_var = np.var(q_mu[:,l]), **kwargs.get("priors",{})) for l in range(L)], W) if W_diag: # kern.W.transform = Reshape(W.shape,(P,L,L))(gp.transforms.DiagMatrix(L)(gp.transforms.positive)) kern.W.trainable = False else: kern.W.transform = Reshape(W.shape,(P//L,L,L))(MatrixSquare()(gp.transforms.LowerTriangular(L,P//L))) kern.W.trainable = True feature = mf.MixedKernelSeparateMof([InducingPoints(Z) for _ in range(L)]) mean = Zero() model = HeteroscedasticTecSVGP(Y_var, X, Y, kern, likelihood, feat = feature, mean_function=mean, minibatch_size=None, num_latent = P, num_data = num_data, whiten = False, q_mu = q_mu, q_sqrt = q_sqrt) for feat in feature.feat_list: feat.Z.trainable = True model.q_mu.trainable = True model.q_sqrt.trainable = True # model.q_sqrt.prior = gp.priors.Gaussian(q_sqrt, 0.005**2) model.compile() tf.summary.image('W',kern.W.constrained_tensor[None,:,:,None]) tf.summary.image('q_mu',model.q_mu.constrained_tensor[None,:,:,None]) tf.summary.image('q_sqrt',model.q_sqrt.constrained_tensor[:,:,:,None]) return model
def __init__(self, kern, Z, num_outputs, mean_function, **kwargs): """ A sparse variational GP layer with a Gaussian likelihood, where the GP is integrated out :kern: The kernel for the layer (input_dim = D_in) :param Z: Inducing points (M, D_in) :param mean_function: The mean function :return: """ Collapsed_Layer.__init__(self, **kwargs) self.feature = InducingPoints(Z) self.kern = kern self.mean_function = mean_function self.num_outputs = num_outputs
def __init__(self, Z, mean_function, kern, num_latent=1, whiten=True, name=None): super(Latent, self).__init__(name=name) self.mean_function = mean_function self.kern = kern self.num_latent = num_latent M = Z.shape[0] # M = tf.print(M,[M,'any thing i want'],message='Debug message:',summarize=100) self.feature = InducingPoints(Z) num_inducing = len(self.feature) self.whiten = whiten self.q_mu = Parameter(np.zeros((num_inducing, self.num_latent), dtype=settings.float_type)) q_sqrt = np.tile(np.eye(M)[None, :, :], [self.num_latent, 1, 1]) transform = transforms.LowerTriangular(M, num_matrices=self.num_latent) self.q_sqrt = Parameter(q_sqrt, transform=transform)
def test_sample_conditional(session_tf, whiten):
    q_mu = np.random.randn(Data.M, Data.P)  # M x P
    q_sqrt = np.array([np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)])  # P x M x M
    Z = Data.X[:Data.M, ...]  # M x D
    Xs = np.ones((int(10e5), Data.D), dtype=float_type)

    feature = InducingPoints(Z.copy())
    kernel = RBF(Data.D)

    values = {"Z": Z, "Xnew": Xs, "q_mu": q_mu, "q_sqrt": q_sqrt}
    placeholders = _create_placeholder_dict(values)
    feed_dict = _create_feed_dict(placeholders, values)

    # Path 1
    sample = sample_conditional(placeholders["Xnew"], placeholders["Z"], kernel,
                                placeholders["q_mu"], q_sqrt=placeholders["q_sqrt"], white=whiten)
    value = session_tf.run(sample, feed_dict=feed_dict)

    # Path 2
    sample2 = sample_conditional(placeholders["Xnew"], feature, kernel,
                                 placeholders["q_mu"], q_sqrt=placeholders["q_sqrt"], white=whiten)
    value2 = session_tf.run(sample2, feed_dict=feed_dict)

    # check if mean and covariance of samples are similar
    np.testing.assert_array_almost_equal(np.mean(value, axis=0), np.mean(value2, axis=0), decimal=1)
    np.testing.assert_array_almost_equal(np.cov(value, rowvar=False), np.cov(value2, rowvar=False), decimal=1)
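# test_sample_conditional above (and its variants) rely on two small helpers,
# `_create_placeholder_dict` and `_create_feed_dict`, defined in the same test
# module.  A plausible minimal sketch is given below; it is an assumption about
# their behaviour, not the original implementation.
import tensorflow as tf

def _create_placeholder_dict_sketch(values, dtype=tf.float64):
    # one placeholder per named array, with a matching static shape
    return {name: tf.placeholder(dtype, shape=arr.shape) for name, arr in values.items()}

def _create_feed_dict_sketch(placeholders, values):
    # map each placeholder to its concrete NumPy value
    return {placeholders[name]: values[name] for name in values}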
class SVGP_Layer(Layer):
    def __init__(self, layer_id, kern, U, Z, num_outputs, mean_function,
                 white=False, **kwargs):
        """
        A sparse variational GP layer in whitened representation. This layer holds the
        kernel, variational parameters, inducing points and mean function.

        The underlying model at inputs X is
        f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X)

        The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)

        The layer holds D_out independent GPs with the same kernel and inducing points.

        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs))
        :param mean_function: The mean function
        :return:
        """
        Layer.__init__(self, layer_id, U, num_outputs, **kwargs)

        # Initialize using kmeans
        self.dim_in = U[0].shape[1] if layer_id == 0 else num_outputs
        self.Z = Z if Z is not None else np.random.normal(0, 0.01, (100, self.dim_in))
        self.num_inducing = self.Z.shape[0]

        q_mu = np.zeros((self.num_inducing, num_outputs))
        self.q_mu = Parameter(q_mu)

        q_sqrt = np.tile(np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
        transform = transforms.LowerTriangular(self.num_inducing, num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.feature = InducingPoints(self.Z)
        self.kern = kern
        self.mean_function = mean_function

        self.num_outputs = num_outputs
        self.white = white

        if not self.white:  # initialize to prior
            Ku = self.kern.compute_K_symm(self.Z)
            Lu = np.linalg.cholesky(Ku + np.eye(self.Z.shape[0]) * settings.jitter)
            self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1])

        self.needs_build_cholesky = True

    @params_as_tensors
    def build_cholesky_if_needed(self):
        # make sure we only compute this once
        if self.needs_build_cholesky:
            self.Ku = self.feature.Kuu(self.kern, jitter=settings.jitter)
            self.Lu = tf.cholesky(self.Ku)
            self.Ku_tiled = tf.tile(self.Ku[None, :, :], [self.num_outputs, 1, 1])
            self.Lu_tiled = tf.tile(self.Lu[None, :, :], [self.num_outputs, 1, 1])
            self.needs_build_cholesky = False

    def conditional_ND(self, X, full_cov=False):
        self.build_cholesky_if_needed()

        # mmean, vvar = conditional(X, self.feature.Z, self.kern,
        #                           self.q_mu, q_sqrt=self.q_sqrt,
        #                           full_cov=full_cov, white=self.white)
        Kuf = self.feature.Kuf(self.kern, X)

        A = tf.matrix_triangular_solve(self.Lu, Kuf, lower=True)
        if not self.white:
            A = tf.matrix_triangular_solve(tf.transpose(self.Lu), A, lower=False)

        mean = tf.matmul(A, self.q_mu, transpose_a=True)

        A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
        I = tf.eye(self.num_inducing, dtype=settings.float_type)[None, :, :]

        if self.white:
            SK = -I
        else:
            SK = -self.Ku_tiled

        if self.q_sqrt is not None:
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

        B = tf.matmul(SK, A_tiled)

        if full_cov:
            # (num_latent, num_X, num_X)
            delta_cov = tf.matmul(A_tiled, B, transpose_a=True)
            Kff = self.kern.K(X)
        else:
            # (num_latent, num_X)
            delta_cov = tf.reduce_sum(A_tiled * B, 1)
            Kff = self.kern.Kdiag(X)

        # either (1, num_X) + (num_latent, num_X) or (1, num_X, num_X) + (num_latent, num_X, num_X)
        var = tf.expand_dims(Kff, 0) + delta_cov
        var = tf.transpose(var)

        return mean + self.mean_function(X), var

    def KL(self):
        """
        The KL divergence from the variational distribution to the prior

        :return: KL divergence from N(q_mu, q_sqrt) to N(0, I), independently for each GP
        """
        # if self.white:
        #     return gauss_kl(self.q_mu, self.q_sqrt)
        # else:
        #     return gauss_kl(self.q_mu, self.q_sqrt, self.Ku)
        self.build_cholesky_if_needed()

        KL = -0.5 * self.num_outputs * self.num_inducing
        KL -= 0.5 * tf.reduce_sum(tf.log(tf.matrix_diag_part(self.q_sqrt) ** 2))

        if not self.white:
            KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(self.Lu))) * self.num_outputs
            KL += 0.5 * tf.reduce_sum(tf.square(
                tf.matrix_triangular_solve(self.Lu_tiled, self.q_sqrt, lower=True)))
            Kinv_m = tf.cholesky_solve(self.Lu, self.q_mu)
            KL += 0.5 * tf.reduce_sum(self.q_mu * Kinv_m)
        else:
            KL += 0.5 * tf.reduce_sum(tf.square(self.q_sqrt))
            KL += 0.5 * tf.reduce_sum(self.q_mu ** 2)
        return KL
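# A small NumPy sanity check (not part of the original code) for the non-whitened
# KL term computed in SVGP_Layer.KL() above.  For each output p, with
# S_p = q_sqrt_p q_sqrt_p^T and prior N(0, Ku), the closed form is
#     KL_p = 0.5 * [ tr(Ku^{-1} S_p) + m_p^T Ku^{-1} m_p - M + log|Ku| - log|S_p| ],
# and the layer sums KL_p over outputs.  Names below (M, P, etc.) are illustrative
# assumptions, not attributes of the layer.
import numpy as np
from scipy.linalg import cho_factor, cho_solve, solve_triangular

def kl_to_prior(q_mu, q_sqrt, Ku):
    """q_mu: (M, P), q_sqrt: (P, M, M) lower-triangular, Ku: (M, M)."""
    M, P = q_mu.shape
    Lu = np.linalg.cholesky(Ku)
    kl = -0.5 * P * M
    kl -= 0.5 * np.sum(np.log(np.square(np.diagonal(q_sqrt, axis1=-2, axis2=-1))))
    kl += P * np.sum(np.log(np.diag(Lu)))                  # 0.5 * P * log|Ku|
    for p in range(P):
        A = solve_triangular(Lu, q_sqrt[p], lower=True)    # Lu^{-1} q_sqrt_p
        kl += 0.5 * np.sum(np.square(A))                   # 0.5 * tr(Ku^{-1} S_p)
    Kinv_m = cho_solve(cho_factor(Ku, lower=True), q_mu)
    kl += 0.5 * np.sum(q_mu * Kinv_m)                      # 0.5 * sum_p m_p^T Ku^{-1} m_p
    return kl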
def init_layers(graph_adj, node_feature, kernels, n_layers, all_layers_dim, num_inducing,
                gc_kernel=True, mean_function="linear", white=False, q_diag=False):
    assert mean_function in ["linear", "zero"]  # mean function must be linear or zero
    layers = []

    # get initial Z
    sparse_adj = tuple_to_sparse_matrix(graph_adj[0], graph_adj[1], graph_adj[2])
    X_running = node_feature.copy()

    for i in range(n_layers):
        tf.logging.info("initialize {}th layer".format(i + 1))
        dim_in = all_layers_dim[i]
        dim_out = all_layers_dim[i + 1]
        conv_X = sparse_adj.dot(X_running)
        Z_running = kmeans2(conv_X, num_inducing[i], minit="points")[0]

        kernel = kernels[i]
        if gc_kernel and kernel.gc_weight:
            # Z_running = pca(Z_running, kernel.base_kernel.input_dim)  # reduce the dimension to match the output dimension
            X_dim = X_running.shape[1]
            kernel_input_dim = kernel.base_kernel.input_dim
            if X_dim > kernel_input_dim:
                # reduce the dimension to match the kernel input dimension
                Z_running = pca(Z_running, kernel.base_kernel.input_dim)
            elif X_dim < kernel_input_dim:
                Z_running = np.concatenate(
                    [Z_running, np.zeros((Z_running.shape[0], kernel_input_dim - X_dim))], axis=1)

        # print(type(Z_running))
        # print(Z_running)

        if dim_in > dim_out:
            _, _, V = np.linalg.svd(X_running, full_matrices=False)
            W = V[:dim_out, :].T
        elif dim_in < dim_out:
            W = np.concatenate([np.eye(dim_in), np.zeros((dim_in, dim_out - dim_in))], 1)

        if mean_function == "zero":
            mf = Zero()
        else:
            if dim_in == dim_out:
                mf = Identity()
            else:
                mf = Linear(W)
                mf.set_trainable(False)

        # self.Ku = Kuu(GraphConvolutionInducingpoints(Z_running), kernel, jitter=settings.jitter)
        # print("successfully calculate Ku")
        if gc_kernel:
            feature = GraphConvolutionInducingpoints(Z_running)
        else:
            feature = InducingPoints(Z_running)

        layers.append(svgp_layer(kernel, Z_running, feature, dim_out, mf, gc_kernel,
                                 white=white, q_diag=q_diag))

        if dim_in != dim_out:
            # Z_running = Z_running.dot(W)
            X_running = X_running.dot(W)

    return layers
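# Illustrative-only sketch (not from the original repo) of how the fixed linear mean
# function's weight matrix W is formed in init_layers above: when a layer shrinks the
# dimension, W projects onto the top right-singular vectors of the running inputs;
# when it grows, W pads an identity with zero columns.
import numpy as np

def linear_mean_weights(X_running, dim_in, dim_out):
    if dim_in > dim_out:
        # top-dim_out right singular vectors of X_running, shape (dim_in, dim_out)
        _, _, V = np.linalg.svd(X_running, full_matrices=False)
        return V[:dim_out, :].T
    elif dim_in < dim_out:
        # identity padded with zeros, shape (dim_in, dim_out)
        return np.concatenate([np.eye(dim_in), np.zeros((dim_in, dim_out - dim_in))], axis=1)
    return np.eye(dim_in)  # dim_in == dim_out: an Identity mean function is used instead

X_demo = np.random.randn(50, 8)
W_shrink = linear_mean_weights(X_demo, dim_in=8, dim_out=3)   # (8, 3)
W_grow = linear_mean_weights(X_demo, dim_in=8, dim_out=12)    # (8, 12)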
def build_model(self, ARGS, X, Y, conditioning=False, apply_name=True, noise_var=None, mean_function=None):
    if conditioning == False:
        N, D = X.shape

        # first layer inducing points
        if N > ARGS.M:
            Z = kmeans2(X, ARGS.M, minit='points')[0]
        else:
            # This is the old way of initializing Zs
            # M_pad = ARGS.M - N
            # Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

            # This is the new way of initializing Zs
            min_x, max_x = self.bounds[0]
            min_x = (min_x - self.x_mean) / self.x_std
            max_x = (max_x - self.x_mean) / self.x_std

            Z = np.linspace(min_x, max_x, num=ARGS.M)  # * X.shape[1])
            Z = Z.reshape((-1, X.shape[1]))
            #print(min_x)
            #print(max_x)
            #print(Z)

        #################################### layers
        P = np.linalg.svd(X, full_matrices=False)[2]
        # PX = P.copy()

        layers = []
        # quad_layers = []

        DX = D
        DY = 1

        D_in = D
        D_out = D

        with defer_build():
            # variance initialization
            lik = Gaussian()
            lik.variance = ARGS.likelihood_variance

            if len(ARGS.configuration) > 0:
                for c, d in ARGS.configuration.split('_'):
                    if c == 'G':
                        num_gps = int(d)
                        A = np.zeros((D_in, D_out))
                        D_min = min(D_in, D_out)
                        A[:D_min, :D_min] = np.eye(D_min)
                        mf = Linear(A=A)
                        mf.b.set_trainable(False)

                        def make_kern():
                            k = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1., ARD=True)
                            k.variance.set_trainable(False)
                            return k

                        PP = np.zeros((D_out, num_gps))
                        PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                        ZZ = np.random.randn(ARGS.M, D_in)
                        # print(Z.shape)
                        # print(ZZ.shape)
                        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                        kern = SharedMixedMok(make_kern(), W=PP)
                        inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                        l = GPLayer(kern, inducing, num_gps, mean_function=mf)
                        if ARGS.fix_linear is True:
                            kern.W.set_trainable(False)
                            mf.set_trainable(False)

                        layers.append(l)

                        D_in = D_out

                    elif c == 'L':
                        d = int(d)
                        D_in += d
                        layers.append(LatentVariableLayer(d, XY_dim=DX + 1))

            # kernel initialization
            kern = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1., ARD=True)
            ZZ = np.random.randn(ARGS.M, D_in)
            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
            layers.append(GPLayer(kern, InducingPoints(ZZ), DY))

            self.layers = layers
            self.lik = lik

        # global_step = tf.Variable(0, dtype=tf.int32)
        # self.global_step = global_step

    else:
        lik = self._gp.likelihood
        layers = self._gp.layers._list
        # val = self.session.run(self.global_step)
        # global_step = tf.Variable(val, dtype=tf.int32)
        # self.global_step = global_step
        self._gp.clear()

    with defer_build():
        #################################### model
        name = 'Model' if apply_name else None

        if ARGS.mode == 'VI':
            model = DGP_VI(X, Y, layers, lik,
                           minibatch_size=ARGS.minibatch_size,
                           name=name)

        elif ARGS.mode == 'SGHMC':
            for layer in layers:
                if hasattr(layer, 'q_sqrt'):
                    del layer.q_sqrt
                    layer.q_sqrt = None
                    layer.q_mu.set_trainable(False)

            model = DGP_VI(X, Y, layers, lik,
                           minibatch_size=ARGS.minibatch_size,
                           name=name)

        elif ARGS.mode == 'IWAE':
            model = DGP_IWVI(X, Y, layers, lik,
                             minibatch_size=ARGS.minibatch_size,
                             num_samples=ARGS.num_IW_samples,
                             name=name)

    global_step = tf.Variable(0, dtype=tf.int32)
    op_increment = tf.assign_add(global_step, 1)

    if not ('SGHMC' == ARGS.mode):
        for layer in model.layers[:-1]:
            if isinstance(layer, GPLayer):
                layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

        model.compile()

        #################################### optimization
        var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]

        model.layers[-1].q_mu.set_trainable(False)
        model.layers[-1].q_sqrt.set_trainable(False)

        gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay, staircase=True),
                        dtype=tf.float64)
        lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, ARGS.lr_decay, staircase=True),
                     dtype=tf.float64)

        op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=var_list)
        op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

        def train(s):
            s.run(op_increment)
            s.run(op_ng)
            s.run(op_adam)

        model.train_op = train
        model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
        model.global_step = global_step

    else:
        model.compile()

        sghmc_vars = []
        for layer in layers:
            if hasattr(layer, 'q_mu'):
                sghmc_vars.append(layer.q_mu.unconstrained_tensor)

        hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)

        self.sghmc_optimizer = SGHMC(model, sghmc_vars, hyper_train_op, 100)

        def train_op(s):
            s.run(op_increment), self.sghmc_optimizer.sghmc_step(s), self.sghmc_optimizer.train_hypers(s)

        model.train_op = train_op
        model.sghmc_optimizer = self.sghmc_optimizer

        def init_op(s):
            epsilon = 0.01
            mdecay = 0.05
            with tf.variable_scope('sghmc'):
                self.sghmc_optimizer.generate_update_step(epsilon, mdecay)
            v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='sghmc')
            s.run(tf.variables_initializer(v))
            s.run(tf.variables_initializer([global_step]))

        # Added jitter due to input matrix invertibility problems
        custom_config = gpflow.settings.get_settings()
        custom_config.numerics.jitter_level = 1e-8

        model.init_op = init_op
        model.global_step = global_step

    # build the computation graph for the gradient
    self.X_placeholder = tf.placeholder(tf.float64, shape=[None, X.shape[1]])
    self.Fs, Fmu, Fvar = model._build_predict(self.X_placeholder)
    self.mean_grad = tf.gradients(Fmu, self.X_placeholder)
    self.var_grad = tf.gradients(Fvar, self.X_placeholder)

    # calculate the gradient of the mean for the quantile-filtered distribution
    # print(Fs)
    # q = np.quantile(Fs, self.quantile, axis=0)
    # qFs = [f for f in Fs if f < q]
    # q_mean = np.mean(qFs, axis=0)
    # q_var = np.var(qFs, axis=0)
    # self.qmean_grad = tf.gradients(q_mean, self.X_placeholder)
    # self.qvar_grad = tf.gradients(q_var, self.X_placeholder)

    return model
def test_separate_independent_mof(session_tf):
    """
    Same test as above but we use different (i.e. separate) inducing features
    for each of the output dimensions.
    """
    np.random.seed(0)

    # Model 1 (inefficient)
    q_mu_1 = np.random.randn(Data.M * Data.P, 1)
    q_sqrt_1 = np.tril(np.random.randn(Data.M * Data.P, Data.M * Data.P))[None, ...]  # 1 x MP x MP
    kernel_1 = mk.SharedIndependentMok(RBF(Data.D, variance=0.5, lengthscales=1.2), Data.P)
    feature_1 = InducingPoints(Data.X[:Data.M, ...].copy())
    m1 = SVGP(Data.X, Data.Y, kernel_1, Gaussian(), feature_1, q_mu=q_mu_1, q_sqrt=q_sqrt_1)
    m1.set_trainable(False)
    m1.q_sqrt.set_trainable(True)
    m1.q_mu.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m1, maxiter=Data.MAXITER)

    # Model 2 (efficient)
    q_mu_2 = np.random.randn(Data.M, Data.P)
    q_sqrt_2 = np.array([np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)])  # P x M x M
    kernel_2 = mk.SharedIndependentMok(RBF(Data.D, variance=0.5, lengthscales=1.2), Data.P)
    feat_list_2 = [InducingPoints(Data.X[:Data.M, ...].copy()) for _ in range(Data.P)]
    feature_2 = mf.SeparateIndependentMof(feat_list_2)
    m2 = SVGP(Data.X, Data.Y, kernel_2, Gaussian(), feature_2, q_mu=q_mu_2, q_sqrt=q_sqrt_2)
    m2.set_trainable(False)
    m2.q_sqrt.set_trainable(True)
    m2.q_mu.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m2, maxiter=Data.MAXITER)

    # Model 3 (inefficient): an identical feature is used P times,
    # and treated as a separate feature.
    q_mu_3 = np.random.randn(Data.M, Data.P)
    q_sqrt_3 = np.array([np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)])  # P x M x M
    kern_list = [RBF(Data.D, variance=0.5, lengthscales=1.2) for _ in range(Data.P)]
    kernel_3 = mk.SeparateIndependentMok(kern_list)
    feat_list_3 = [InducingPoints(Data.X[:Data.M, ...].copy()) for _ in range(Data.P)]
    feature_3 = mf.SeparateIndependentMof(feat_list_3)
    m3 = SVGP(Data.X, Data.Y, kernel_3, Gaussian(), feature_3, q_mu=q_mu_3, q_sqrt=q_sqrt_3)
    m3.set_trainable(False)
    m3.q_sqrt.set_trainable(True)
    m3.q_mu.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m3, maxiter=Data.MAXITER)

    check_equality_predictions(session_tf, [m1, m2, m3])
def test_shared_independent_mok(session_tf):
    """
    In this test we use the same kernel and the same inducing features
    for each of the outputs. The outputs are considered to be uncorrelated.
    This is how GPflow handled multiple outputs before the multioutput framework was added.
    We compare three models here:
    1) an inefficient one, where we use a SharedIndependentMok with InducingPoints.
       This combination uses a Kff of size N x P x N x P and a Kfu of size N x P x M x P,
       which is extremely inefficient as most of the elements are zero.
    2) efficient: SharedIndependentMok and SharedIndependentMof.
       This combination uses the most efficient form of the matrices.
    3) the old but also efficient way: using a plain Kernel with InducingPoints.
    Models 2) and 3) follow more or less the same code path.
    """
    # Model 1
    q_mu_1 = np.random.randn(Data.M * Data.P, 1)  # MP x 1
    q_sqrt_1 = np.tril(np.random.randn(Data.M * Data.P, Data.M * Data.P))[None, ...]  # 1 x MP x MP
    kernel_1 = mk.SharedIndependentMok(RBF(Data.D, variance=0.5, lengthscales=1.2), Data.P)
    feature_1 = InducingPoints(Data.X[:Data.M, ...].copy())
    m1 = SVGP(Data.X, Data.Y, kernel_1, Gaussian(), feature_1, q_mu=q_mu_1, q_sqrt=q_sqrt_1)
    m1.set_trainable(False)
    m1.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m1, maxiter=Data.MAXITER)

    # Model 2
    q_mu_2 = np.reshape(q_mu_1, [Data.M, Data.P])  # M x P
    q_sqrt_2 = np.array([np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)])  # P x M x M
    kernel_2 = RBF(Data.D, variance=0.5, lengthscales=1.2)
    feature_2 = InducingPoints(Data.X[:Data.M, ...].copy())
    m2 = SVGP(Data.X, Data.Y, kernel_2, Gaussian(), feature_2, q_mu=q_mu_2, q_sqrt=q_sqrt_2)
    m2.set_trainable(False)
    m2.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m2, maxiter=Data.MAXITER)

    # Model 3
    q_mu_3 = np.reshape(q_mu_1, [Data.M, Data.P])  # M x P
    q_sqrt_3 = np.array([np.tril(np.random.randn(Data.M, Data.M)) for _ in range(Data.P)])  # P x M x M
    kernel_3 = mk.SharedIndependentMok(RBF(Data.D, variance=0.5, lengthscales=1.2), Data.P)
    feature_3 = mf.SharedIndependentMof(InducingPoints(Data.X[:Data.M, ...].copy()))
    m3 = SVGP(Data.X, Data.Y, kernel_3, Gaussian(), feature_3, q_mu=q_mu_3, q_sqrt=q_sqrt_3)
    m3.set_trainable(False)
    m3.q_sqrt.set_trainable(True)
    gpflow.training.ScipyOptimizer().minimize(m3, maxiter=Data.MAXITER)

    check_equality_predictions(session_tf, [m1, m2, m3])
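# Models 2 and 3 above initialise their M x P variational mean from Model 1's
# MP x 1 vector via a plain np.reshape.  A tiny illustration (with assumed toy
# sizes) of the layout that reshape produces:
import numpy as np

M, P = 3, 2
q_mu_flat = np.arange(M * P, dtype=float).reshape(M * P, 1)  # MP x 1, as in Model 1
q_mu_grid = np.reshape(q_mu_flat, [M, P])                    # M x P, as in Models 2 and 3
# row m of the M x P layout holds entries m*P, ..., m*P + P - 1 of the flat vector
assert np.allclose(q_mu_grid[0, :], q_mu_flat[:P, 0])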
class SVGP_Layer(Layer): def __init__(self, kern, Z, num_outputs, mean_function, white=False, input_prop_dim=None, **kwargs): """ A sparse variational GP layer in whitened representation. This layer holds the kernel, variational parameters, inducing points and mean function. The underlying model at inputs X is f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X) The variational distribution over the inducing points is q(v) = N(q_mu, q_sqrt q_sqrt^T) The layer holds D_out independent GPs with the same kernel and inducing points. :param kern: The kernel for the layer (input_dim = D_in) :param Z: Inducing points (M, D_in) :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs)) :param mean_function: The mean function :return: """ Layer.__init__(self, input_prop_dim, **kwargs) self.num_inducing = Z.shape[0] q_mu = np.zeros((self.num_inducing, num_outputs)) self.q_mu = Parameter(q_mu) q_sqrt = np.tile( np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1]) transform = transforms.LowerTriangular(self.num_inducing, num_matrices=num_outputs) self.q_sqrt = Parameter(q_sqrt, transform=transform) self.feature = InducingPoints(Z) self.kern = kern self.mean_function = mean_function self.num_outputs = num_outputs self.white = white #tf.constant(white, shape=(), dtype = tf.bool) #white # if not self.white: # initialize to prior Ku = self.kern.compute_K_symm(Z) Lu = np.linalg.cholesky(Ku + np.eye(Z.shape[0]) * settings.jitter) self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1]) self.needs_build_cholesky = True @params_as_tensors def build_cholesky_if_needed(self): # make sure we only compute this once if self.needs_build_cholesky: self.Ku = self.feature.Kuu(self.kern, jitter=settings.jitter) self.Lu = tf.cholesky(self.Ku) self.Ku_tiled = tf.tile(self.Ku[None, :, :], [self.num_outputs, 1, 1]) self.Lu_tiled = tf.tile(self.Lu[None, :, :], [self.num_outputs, 1, 1]) #also compute K_inverse and it's det if not self.white: inp_ = (self.Ku + tf.eye(self.num_inducing, dtype=tf.float64) * settings.jitter * 10) self.K_inv = tf.linalg.inv(tf.cast(inp_, dtype=tf.float64)) self.needs_build_cholesky = False def conditional_ND(self, X, full_cov=False): self.build_cholesky_if_needed() # mmean, vvar = conditional(X, self.feature.Z, self.kern, # self.q_mu, q_sqrt=self.q_sqrt, # full_cov=full_cov, white=self.white) Kuf = self.feature.Kuf(self.kern, X) A = tf.matrix_triangular_solve(self.Lu, Kuf, lower=True) if not self.white: A = tf.matrix_triangular_solve(tf.transpose(self.Lu), A, lower=False) mean = tf.matmul(A, self.q_mu, transpose_a=True) A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1]) I = tf.eye(self.num_inducing, dtype=settings.float_type)[None, :, :] if self.white: SK = -I else: SK = -self.Ku_tiled if self.q_sqrt is not None: SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True) B = tf.matmul(SK, A_tiled) if full_cov: # (num_latent, num_X, num_X) delta_cov = tf.matmul(A_tiled, B, transpose_a=True) Kff = self.kern.K(X) else: # (num_latent, num_X) delta_cov = tf.reduce_sum(A_tiled * B, 1) Kff = self.kern.Kdiag(X) # either (1, num_X) + (num_latent, num_X) or (1, num_X, num_X) + (num_latent, num_X, num_X) var = tf.expand_dims(Kff, 0) + delta_cov var = tf.transpose(var) return mean + self.mean_function(X), var def KL(self): """ The KL divergence from the variational distribution to the prior. Notation in paper is KL[q(u)||p(u)]. 
OR the alpha-renyi divergence from variational distribution to the prior :return: KL divergence from N(q_mu, q_sqrt * q_sqrt^T) to N(0, I) (if whitened) and to N(mu(Z), K(Z)) otherwise, independently for each GP """ # if self.white: # return gauss_kl(self.q_mu, self.q_sqrt) # else: # return gauss_kl(self.q_mu, self.q_sqrt, self.Ku) # self.build_cholesky_if_needed() if self.alpha is None: """Get KL regularizer""" KL = -0.5 * self.num_outputs * self.num_inducing KL -= 0.5 * tf.reduce_sum( tf.log(tf.matrix_diag_part(self.q_sqrt)**2)) if not self.white: # Whitening is relative to the prior. Here, the prior is NOT # whitened, meaning that we have N(0, K(Z,Z)) as prior. KL += tf.reduce_sum(tf.log(tf.matrix_diag_part( self.Lu))) * self.num_outputs KL += 0.5 * tf.reduce_sum( tf.square( tf.matrix_triangular_solve( self.Lu_tiled, self.q_sqrt, lower=True))) Kinv_m = tf.cholesky_solve(self.Lu, self.q_mu) KL += 0.5 * tf.reduce_sum(self.q_mu * Kinv_m) else: KL += 0.5 * tf.reduce_sum(tf.square(self.q_sqrt)) KL += 0.5 * tf.reduce_sum(self.q_mu**2) return self.weight * KL else: """Get AR regularizer. For the normal, this means log(Normalizing Constant[alpha * eta_q + (1-alpha) * eta_0 ]) - alpha*log(Normalizing Constant[eta_q]) - (1-alpha)*log(Normalizing Constant[eta_0]). NOTE: the 2*pi factor will cancel, as well as the 0.5 * factor. NOTE: q_strt is s.t. q_sqrt * q_sqrt^T = variational variance, i.e. q(v) = N(q_mu, q_sqrt q_sqrt^T). NOTE: self.Lu is cholesky decomp of self.Ku NOTE: self.feature are the inducing points Z, and self.Ku = self.feature.Kuu(kernel), meaning that self.Ku is the kernel matrix computed at the inducing points Z. NOTE: We need the alpha-renyi div between prior and GP-variational posterior for EACH of the GPs in this layer. Shapes: q_sqrt: 13 x 100 x 100 q_mu: 100 x 13 tf.matrix_diag_part(self.q_sqrt): 13 x 100 q_sqrt_inv: 13 x 100 x 100 Ku, Lu: 100 x 100 num_inducing: 100 num_outputs: 13 """ #convenience alpha = self.alpha #INEFFICIENT, can probably be done much better with cholesky solve inp_ = (tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True) + tf.eye(self.num_inducing, dtype=tf.float64) * settings.jitter * 100) q_inv = tf.linalg.inv(tf.cast(inp_, dtype=tf.float64)) #gives Sigma_q^-1 * mu_q q_var_x_q_mu = tf.matmul( q_inv, tf.reshape(self.q_mu, shape=(self.num_outputs, self.num_inducing, 1))) #Get the two log-normalizers for the variational posteriors q_component_1 = 0.5 * tf.reduce_sum( tf.log(tf.matrix_diag_part(self.q_sqrt)**2)) q_component_2 = 0.5 * tf.reduce_sum(q_var_x_q_mu * self.q_mu) logZq = (q_component_1 + q_component_2) if not self.white: #prior using self.Lu, still 0 mean fct logZpi = 0.5 * tf.reduce_sum( tf.log(tf.matrix_diag_part(self.Lu)**2)) * self.num_outputs new_Sigma_inv = (alpha * q_inv + (1.0 - alpha) * self.K_inv + tf.eye(self.num_inducing, dtype=tf.float64) * settings.jitter) # + else: logZpi = 0.0 #* self.num_outputs * self.num_inducing - but that is still 0. new_Sigma_inv = (alpha * q_inv + (1.0 - alpha + settings.jitter) * tf.eye(self.num_inducing, dtype=tf.float64)) new_Sigma_inv_chol = tf.cholesky(tf.cast(new_Sigma_inv, tf.float64)) log_det = -tf.reduce_sum( tf.log(tf.matrix_diag_part(new_Sigma_inv_chol)**2)) #Get the new inverse variance of the exponential family member #corresponding to alpha * eta_q + (1-alpha) * eta_0. #var_inv_new = tf.matmul(chol_var_inv_new, chol_var_inv_new, transpose_b=True) #Compute mu_new: Compute (Sigma^-1*mu) = A via #A = alpha* Sigma_q^-1 * q_mu + (1-alpha * 0) and then multiply #both sides by Sigma! 
=> Problem: I don't know sigma! mu_new = tf.linalg.solve( tf.cast(new_Sigma_inv, dtype=tf.float64), tf.cast(alpha * q_var_x_q_mu, dtype=tf.float64)) #Note: Sigma^{-1}_new * mu_new = Sigma^{-1}_q * mu_q, so # mu_new' * Sigma^{-1}_new * mu_new = mu_new' * (Sigma^{-1}_q * mu_q) mu_new_x_new_Sigma_inv = tf.reduce_sum(alpha * q_var_x_q_mu * mu_new) #Observing that log(|Sigma|) = - log(|Sigma|^-1), we can now get #the normalizing constant of the new exp. fam member. logZnew = (0.5 * mu_new_x_new_Sigma_inv + 0.5 * log_det) #return the log of the AR-div between the normals, i.e. # (1/(alpha * (1-alpha))) * log(D), where D = #new normalizer / (q_normalizer^alpha * prior_normalizer^(1-alpha)) AR = (1.0 / (alpha * (1.0 - alpha))) * (logZnew - alpha * logZq - (1.0 - alpha) * logZpi) return self.weight * AR
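# Illustrative-only NumPy sketch of the exponential-family identity the alpha-Renyi
# ("AR") regularizer above relies on, for one pair of full Gaussians q = N(mu_q, S_q)
# and p = N(0, K).  Dropping the 2*pi factor (it cancels, as the docstring notes),
# the log-normalizer of a Gaussian with natural parameters (S^-1 mu, -0.5 S^-1) is
#     log Z = 0.5 * mu^T S^-1 mu + 0.5 * log|S|,
# and the regularizer is assembled from
#     log Z[alpha*eta_q + (1-alpha)*eta_p] - alpha*log Z_q - (1-alpha)*log Z_p.
# All names here are assumptions for illustration, not the layer's attributes.
import numpy as np

def gauss_log_normalizer(Sigma_inv_mu, Sigma_inv):
    """log Z from natural parameters, up to the constant 2*pi factor."""
    Sigma = np.linalg.inv(Sigma_inv)
    mu = Sigma @ Sigma_inv_mu
    _, logdet = np.linalg.slogdet(Sigma)
    return 0.5 * mu @ Sigma_inv @ mu + 0.5 * logdet

def ar_term(mu_q, S_q, K, alpha):
    # natural parameters (the -0.5 sign on the precision is folded into the helper)
    eta1_q, eta1_p = np.linalg.solve(S_q, mu_q), np.zeros_like(mu_q)
    eta2_q, eta2_p = np.linalg.inv(S_q), np.linalg.inv(K)
    logZ_q = gauss_log_normalizer(eta1_q, eta2_q)
    logZ_p = gauss_log_normalizer(eta1_p, eta2_p)
    logZ_new = gauss_log_normalizer(alpha * eta1_q + (1 - alpha) * eta1_p,
                                    alpha * eta2_q + (1 - alpha) * eta2_p)
    return (logZ_new - alpha * logZ_q - (1 - alpha) * logZ_p) / (alpha * (1 - alpha))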
def build_model(ARGS, X, Y, apply_name=True): if ARGS.mode == 'CVAE': layers = [] for l in ARGS.configuration.split('_'): try: layers.append(int(l)) except: pass with defer_build(): name = 'CVAE' if apply_name else None model = CVAE(X, Y, 1, layers, batch_size=ARGS.minibatch_size, name=name) model.compile() global_step = tf.Variable(0, dtype=tf.int32) op_increment = tf.assign_add(global_step, 1) lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, 0.98, staircase=True), dtype=tf.float64) op_adam = AdamOptimizer(lr).make_optimize_tensor(model) model.train_op = lambda s: s.run([op_adam, op_increment]) model.init_op = lambda s: s.run(tf.variables_initializer([global_step])) model.global_step = global_step model.compile() else: N, D = X.shape # first layer inducing points if N > ARGS.M: Z = kmeans2(X, ARGS.M, minit='points')[0] else: M_pad = ARGS.M - N Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0) #################################### layers P = np.linalg.svd(X, full_matrices=False)[2] # PX = P.copy() layers = [] # quad_layers = [] DX = D DY = 1 D_in = D D_out = D with defer_build(): lik = Gaussian() lik.variance = ARGS.likelihood_variance if len(ARGS.configuration) > 0: for c, d in ARGS.configuration.split('_'): if c == 'G': num_gps = int(d) A = np.zeros((D_in, D_out)) D_min = min(D_in, D_out) A[:D_min, :D_min] = np.eye(D_min) mf = Linear(A=A) mf.b.set_trainable(False) def make_kern(): k = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True) k.variance.set_trainable(False) return k PP = np.zeros((D_out, num_gps)) PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)] ZZ = np.random.randn(ARGS.M, D_in) ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)] kern = SharedMixedMok(make_kern(), W=PP) inducing = MixedKernelSharedMof(InducingPoints(ZZ)) l = GPLayer(kern, inducing, num_gps, mean_function=mf) if ARGS.fix_linear is True: kern.W.set_trainable(False) mf.set_trainable(False) layers.append(l) D_in = D_out elif c == 'L': d = int(d) D_in += d layers.append(LatentVariableLayer(d, XY_dim=DX+1)) kern = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1., ARD=True) ZZ = np.random.randn(ARGS.M, D_in) ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)] layers.append(GPLayer(kern, InducingPoints(ZZ), DY)) #################################### model name = 'Model' if apply_name else None if ARGS.mode == 'VI': model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name) elif ARGS.mode == 'SGHMC': for layer in layers: if hasattr(layer, 'q_sqrt'): del layer.q_sqrt layer.q_sqrt = None layer.q_mu.set_trainable(False) model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name) elif ARGS.mode == 'IWAE': model = DGP_IWVI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, num_samples=ARGS.num_IW_samples, name=name) global_step = tf.Variable(0, dtype=tf.int32) op_increment = tf.assign_add(global_step, 1) if not ('SGHMC' == ARGS.mode): for layer in model.layers[:-1]: if isinstance(layer, GPLayer): layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5 model.compile() #################################### optimization var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]] model.layers[-1].q_mu.set_trainable(False) model.layers[-1].q_sqrt.set_trainable(False) gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay, staircase=True), dtype=tf.float64) lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, ARGS.lr_decay, staircase=True), dtype=tf.float64) op_ng = 
NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=var_list) op_adam = AdamOptimizer(lr).make_optimize_tensor(model) def train(s): s.run(op_increment) s.run(op_ng) s.run(op_adam) model.train_op = train model.init_op = lambda s: s.run(tf.variables_initializer([global_step])) model.global_step = global_step else: model.compile() hmc_vars = [] for layer in layers: if hasattr(layer, 'q_mu'): hmc_vars.append(layer.q_mu.unconstrained_tensor) hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model) sghmc_optimizer = SGHMC(model, hmc_vars, hyper_train_op, 100) def train_op(s): s.run(op_increment), sghmc_optimizer.sghmc_step(s), sghmc_optimizer.train_hypers(s) model.train_op = train_op model.sghmc_optimizer = sghmc_optimizer def init_op(s): epsilon = 0.01 mdecay = 0.05 with tf.variable_scope('hmc'): sghmc_optimizer.generate_update_step(epsilon, mdecay) v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='hmc') s.run(tf.variables_initializer(v)) s.run(tf.variables_initializer([global_step])) model.init_op = init_op model.global_step = global_step return model
def build_model(ARGS, X, Y, apply_name=True): N, D = X.shape # first layer inducing points if N > ARGS.M: Z = kmeans2(X, ARGS.M, minit="points")[0] else: M_pad = ARGS.M - N Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0) #################################### layers P = np.linalg.svd(X, full_matrices=False)[2] layers = [] DX = D DY = 1 D_in = D D_out = D with defer_build(): lik = Gaussian() lik.variance = ARGS.likelihood_variance if len(ARGS.configuration) > 0: for c, d in ARGS.configuration.split("_"): if c == "G": num_gps = int(d) A = np.zeros((D_in, D_out)) D_min = min(D_in, D_out) A[:D_min, :D_min] = np.eye(D_min) mf = Linear(A=A) mf.b.set_trainable(False) def make_kern(): k = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1.0, ARD=True) k.variance.set_trainable(False) return k PP = np.zeros((D_out, num_gps)) PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)] ZZ = np.random.randn(ARGS.M, D_in) ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)] kern = SharedMixedMok(make_kern(), W=PP) inducing = MixedKernelSharedMof(InducingPoints(ZZ)) l = GPLayer(kern, inducing, num_gps, layer_num=len(layers), mean_function=mf) if ARGS.fix_linear is True: kern.W.set_trainable(False) mf.set_trainable(False) layers.append(l) D_in = D_out elif c == "L": d = int(d) D_in += d encoder_dims = [ int(dim.strip()) for dim in ARGS.encoder_dims.split(",") ] layers.append( LatentVariableLayer(d, XY_dim=DX + 1, encoder_dims=encoder_dims, qz_mode=ARGS.qz_mode)) kern = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1.0, ARD=True) ZZ = np.random.randn(ARGS.M, D_in) ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)] layers.append(GPLayer(kern, InducingPoints(ZZ), DY)) #################################### model name = "Model" if apply_name else None if ARGS.mode == "VI": model = DGP_VI(X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, name=name) elif ARGS.mode == "IWAE": model = DGP_IWVI( X=X, Y=Y, layers=layers, likelihood=lik, minibatch_size=ARGS.minibatch_size, num_samples=ARGS.num_IW_samples, name=name, encoder_minibatch_size=ARGS.encoder_minibatch_size, ) elif ARGS.mode == "CIWAE": model = DGP_CIWAE( X, Y, layers, lik, minibatch_size=ARGS.minibatch_size, num_samples=ARGS.num_IW_samples, name=name, beta=ARGS.beta, ) else: raise ValueError(f"Unknown mode {ARGS.mode}.") global_step = tf.Variable(0, dtype=tf.int32) op_increment = tf.assign_add(global_step, 1) for layer in model.layers[:-1]: if isinstance(layer, GPLayer): layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5 model.compile() #################################### optimization # Whether to train the final layer with the other parameters, using Adam, or by itself, using natural # gradients. if ARGS.use_nat_grad_for_final_layer: # Turn off training so the parameters are not optimised by Adam. We pass them directly to the natgrad # optimiser, which bypasses this flag. 
model.layers[-1].q_mu.set_trainable(False) model.layers[-1].q_sqrt.set_trainable(False) gamma = tf.cast( tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay, staircase=True), dtype=tf.float64, ) final_layer_vars = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]] final_layer_opt_op = NatGradOptimizer( gamma=gamma).make_optimize_tensor(model, var_list=final_layer_vars) else: final_layer_opt_op = NoOp() lr = tf.cast( tf.train.exponential_decay(ARGS.lr, global_step, decay_steps=1000, decay_rate=ARGS.lr_decay, staircase=True), dtype=tf.float64, ) encoder_lr = tf.cast( tf.train.exponential_decay( ARGS.encoder_lr, global_step, decay_steps=1000, decay_rate=ARGS.encoder_lr_decay, staircase=True, ), dtype=tf.float64, ) dreg_optimizer = DregOptimizer( enable_dreg=ARGS.use_dreg, optimizer=ARGS.optimizer, encoder_optimizer=ARGS.encoder_optimizer, learning_rate=lr, encoder_learning_rate=encoder_lr, assert_no_nans=ARGS.assert_no_nans, encoder_grad_clip_value=ARGS.clip_encoder_grads, ) other_layers_opt_op = dreg_optimizer.make_optimize_tensor(model) model.lr = lr model.train_op = tf.group(op_increment, final_layer_opt_op, other_layers_opt_op) model.init_op = lambda s: s.run(tf.variables_initializer([global_step])) model.global_step = global_step return model
def _make_part_model(self, X, Y, weights, Z, q_mu, q_sqrt, W, freqs, minibatch_size=None, priors=None): """ Create a gpflow model for a selection of data X: array (N, Din) Y: array (N, P, Nf) weights: array like Y the statistical weights of each datapoint minibatch_size : int Z: list of array (M, Din) The inducing points mean locations. q_mu: list of array (M, L) q_sqrt: list of array (L, M, M) W: array [P,L] freqs: array [Nf,] the freqs priors : dict of priors for the global model Returns: model : gpflow.models.Model """ N, P, Nf = Y.shape _, Din = X.shape assert priors is not None likelihood_var = priors['likelihood_var'] tec_kern_time_ls = priors['tec_kern_time_ls'] tec_kern_dir_ls = priors['tec_kern_dir_ls'] tec_kern_var = priors['tec_kern_var'] tec_mean = priors['tec_mean'] Z_var = priors['Z_var'] P, L = W.shape with defer_build(): # Define the likelihood likelihood = WrappedPhaseGaussianMulti( tec_scale=priors['tec_scale'], freqs=freqs) likelihood.variance = np.exp(likelihood_var[0]) #median as initial likelihood.variance.prior = LogNormal(likelihood_var[0], likelihood_var[1]**2) likelihood.variance.set_trainable(True) def _kern(): kern_thin_layer = ThinLayer(np.array([0., 0., 0.]), priors['tec_scale'], active_dims=slice(2, 6, 1)) kern_time = Matern32(1, active_dims=slice(6, 7, 1)) kern_dir = Matern32(2, active_dims=slice(0, 2, 1)) ### # time kern kern_time.lengthscales = np.exp(tec_kern_time_ls[0]) kern_time.lengthscales.prior = LogNormal( tec_kern_time_ls[0], tec_kern_time_ls[1]**2) kern_time.lengthscales.set_trainable(True) kern_time.variance = 1. #np.exp(tec_kern_var[0]) #kern_time.variance.prior = LogNormal(tec_kern_var[0],tec_kern_var[1]**2) kern_time.variance.set_trainable(False) # ### # directional kern kern_dir.variance = np.exp(tec_kern_var[0]) kern_dir.variance.prior = LogNormal(tec_kern_var[0], tec_kern_var[1]**2) kern_dir.variance.set_trainable(True) kern_dir.lengthscales = np.exp(tec_kern_dir_ls[0]) kern_dir.lengthscales.prior = LogNormal( tec_kern_dir_ls[0], tec_kern_dir_ls[1]**2) kern_dir.lengthscales.set_trainable(True) kern = kern_dir * kern_time #(kern_thin_layer + kern_dir)*kern_time return kern kern = mk.SeparateMixedMok([_kern() for _ in range(L)], W) feature_list = [] for _ in range(L): feat = InducingPoints(Z) #feat.Z.prior = Gaussian(Z,Z_var) feature_list.append(feat) feature = mf.MixedKernelSeparateMof(feature_list) mean = Zero() model = HomoscedasticPhaseOnlySVGP(weights, X, Y, kern, likelihood, feat=feature, mean_function=mean, minibatch_size=minibatch_size, num_latent=P, num_data=N, whiten=False, q_mu=q_mu, q_sqrt=q_sqrt) model.compile() return model