def test(self):
    kern1 = RBF(1)
    kern2 = RBF(2)
    lik = Gaussian()
    X = np.zeros((1, 1))
    model = DGP(X, X, X, [kern1, kern2], lik)
    model.compute_log_likelihood()
def make_dgp(X, Y, Z, L):
    D = X.shape[1]

    # the layer shapes are defined by the kernel dims, so here all hidden layers are D dimensional
    kernels = []
    #for l in range(L):
    kernels.append(RBF(5))
    kernels.append(RBF(2))
    kernels.append(RBF(9))

    # between layer noise (doesn't actually make much difference but we include it anyway)
    #for kernel in kernels[:-1]:
    #    kernel += White(D, variance=1e-5)

    mb = 1000 if X.shape[0] > 1000 else None
    model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=10, minibatch_size=mb)

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model
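# Hypothetical usage sketch for make_dgp (not from the original script): the toy data,
# the 100 k-means inducing points, and the Adam settings are illustrative assumptions.
# Note that the L argument is unused as make_dgp is written above (the loop is commented out).
import numpy as np
from scipy.cluster.vq import kmeans2
from gpflow.training import AdamOptimizer

X = np.random.uniform(-3, 3, (2000, 5))
Y = np.sin(X[:, :1]) + 0.1 * np.random.randn(2000, 1)
Xs = np.random.uniform(-3, 3, (100, 5))

Z = kmeans2(X, 100, minit='points')[0]            # k-means initialised inducing inputs
model = make_dgp(X, Y, Z, L=3)
AdamOptimizer(0.01).minimize(model, maxiter=5000)
mean, var = model.predict_y(Xs, 100)              # 100 posterior samples at the test inputs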
def compare_to_single_layer(self, Y, Ys, lik, L, num_outputs=None):
    kern = Matern52(self.X.shape[1], lengthscales=0.1)

    m_svgp = SVGP(self.X, Y, kern, lik, Z=self.X, num_latent=num_outputs)
    m_svgp.q_mu = self.q_mu
    m_svgp.q_sqrt = self.q_sqrt

    L_svgp = m_svgp.compute_log_likelihood()
    mean_svgp, var_svgp = m_svgp.predict_y(self.Xs)
    test_lik_svgp = m_svgp.predict_density(self.Xs, Ys)
    pred_m_svgp, pred_v_svgp = m_svgp.predict_f(self.Xs)
    pred_mfull_svgp, pred_vfull_svgp = m_svgp.predict_f_full_cov(self.Xs)

    kerns = []
    for _ in range(L - 1):
        kerns.append(Matern52(self.X.shape[1], lengthscales=0.1, variance=2e-6))
    kerns.append(Matern52(self.X.shape[1], lengthscales=0.1))

    m_dgp = DGP(self.X, Y, self.X, kerns, lik, num_samples=2, num_outputs=num_outputs)
    m_dgp.layers[-1].q_mu = self.q_mu
    m_dgp.layers[-1].q_sqrt = self.q_sqrt

    L_dgp = m_dgp.compute_log_likelihood()
    mean_dgp, var_dgp = m_dgp.predict_y(self.Xs, 1)
    test_lik_dgp = m_dgp.predict_density(self.Xs, Ys, 1)
    pred_m_dgp, pred_v_dgp = m_dgp.predict_f(self.Xs, 1)
    pred_mfull_dgp, pred_vfull_dgp = m_dgp.predict_f_full_cov(self.Xs, 1)

    if L == 1:  # these should all be exactly the same
        atol = 1e-7
        rtol = 1e-7
    else:  # jitter makes these not exactly equal
        atol = 1e-1
        rtol = 1e-2

    assert_allclose(L_svgp, L_dgp, rtol=rtol, atol=atol)
    assert_allclose(mean_svgp, mean_dgp[0], rtol=rtol, atol=atol)
    assert_allclose(var_svgp, var_dgp[0], rtol=rtol, atol=atol)
    assert_allclose(test_lik_svgp, test_lik_dgp, rtol=rtol, atol=atol)
    assert_allclose(pred_m_dgp[0], pred_m_svgp, rtol=rtol, atol=atol)
    assert_allclose(pred_v_dgp[0], pred_v_svgp, rtol=rtol, atol=atol)
    assert_allclose(pred_mfull_dgp[0], pred_mfull_svgp, rtol=rtol, atol=atol)
    assert_allclose(pred_vfull_dgp[0], pred_vfull_svgp, rtol=rtol, atol=atol)
def make_DGP(L, D_problem, D_hidden, X, Y, Z):
    kernels = []

    # First layer
    kernels.append(RBF(D_problem, lengthscales=0.2, variance=1.) +
                   White(D_problem, variance=1e-5))

    for l in range(L - 1):
        k = RBF(D_hidden, lengthscales=0.2, variance=1.) + White(D_hidden, variance=1e-5)
        kernels.append(k)

    m_dgp = DGP(X, Y, Z, kernels, Gaussian(), num_samples=10)

    # init the inner layers to be nearly deterministic
    for layer in m_dgp.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return m_dgp
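# Hypothetical usage sketch for make_DGP (not from the original script): the toy 1-D data,
# the two-layer depth, and the optimiser settings are assumptions for illustration only.
import numpy as np
from scipy.cluster.vq import kmeans2
from gpflow.training import AdamOptimizer

X = np.random.uniform(-3, 3, (500, 1))
Y = np.sin(X) + 0.1 * np.random.randn(500, 1)
Z = kmeans2(X, 50, minit='points')[0]

m_dgp = make_DGP(L=2, D_problem=1, D_hidden=1, X=X, Y=Y, Z=Z)
AdamOptimizer(0.01).minimize(m_dgp, maxiter=2000)
mean, var = m_dgp.predict_y(X, 25)                # 25 posterior samples at the training inputs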
def make_dgp_as_sgp(kernels):
    m_dgp = DGP(X, Y, Z, kernels, Gaussian())

    # set the final layer to match the sparse GP
    m_dgp.layers[-1].kern.lengthscales = ls
    m_dgp.layers[-1].kern.variance = s
    m_dgp.likelihood.variance = noise
    m_dgp.layers[-1].q_mu = q_mu
    m_dgp.layers[-1].q_sqrt = q_sqrt

    # set the other layers to identity
    for layer in m_dgp.layers[:-1]:
        # 1e-6 gives errors of 1e-3, so the variance needs to be set right down
        layer.kern.variance.transform._lower = 1e-18
        layer.kern.variance = 1e-18

    return m_dgp
class RegressionModel(object):
    def __init__(self):
        self.model = None

    def fit(self, X, Y):
        class Lik(gpflow.likelihoods.Gaussian):
            def __init__(self):
                gpflow.likelihoods.Gaussian.__init__(self)
                self.variance = initial_likelihood_var
        return self._fit(X, Y, Lik)

    def _fit(self, X, Y, Lik, **kwargs):
        if X.shape[0] > num_inducing:
            Z = kmeans2(X, num_inducing, minit='points')[0]
        else:
            # pad with random values
            Z = np.concatenate([X, np.random.randn(num_inducing - X.shape[0], X.shape[1])], 0)

        if not self.model:
            kerns = []
            for _ in range(2):
                kerns.append(gpflow.kernels.RBF(X.shape[1], lengthscales=float(X.shape[1])**0.5))

            mb_size = minibatch_size if X.shape[0] > 5000 else None
            self.model = DGP(X, Y, Z, kerns, Lik(), minibatch_size=mb_size, **kwargs)

            self.model.layers[0].q_sqrt = self.model.layers[0].q_sqrt.read_value() * 1e-5

            if isinstance(self.model.likelihood, gpflow.likelihoods.Gaussian):
                var_list = [[self.model.layers[-1].q_mu, self.model.layers[-1].q_sqrt]]
                self.model.layers[-1].q_mu.set_trainable(False)
                self.model.layers[-1].q_sqrt.set_trainable(False)
                self.ng = gpflow.train.NatGradOptimizer(gamma=gamma).make_optimize_tensor(
                    self.model, var_list=var_list)
            else:
                self.ng = None

            self.adam = gpflow.train.AdamOptimizer(adam_lr).make_optimize_tensor(self.model)
            iters = iterations
            self.sess = self.model.enquire_session()
        else:
            iters = small_iterations  # after the first fit, use fewer iterations

            # we might have new data
            self.model.X.assign(X, session=self.sess)
            self.model.Y.assign(Y, session=self.sess)
            self.model.layers[0].feature.Z.assign(Z, session=self.sess)
            self.model.layers[0].q_mu.assign(np.zeros((num_inducing, X.shape[1])), session=self.sess)
            self.model.layers[0].q_sqrt.assign(
                1e-5 * np.tile(np.eye(num_inducing)[None], [X.shape[1], 1, 1]), session=self.sess)

            self.model.layers[1].feature.Z.assign(Z, session=self.sess)
            num_outputs = self.model.layers[1].q_sqrt.shape[0]
            self.model.layers[1].q_mu.assign(np.zeros((num_inducing, num_outputs)), session=self.sess)
            self.model.layers[1].q_sqrt.assign(
                np.tile(np.eye(num_inducing)[None], [num_outputs, 1, 1]), session=self.sess)

        try:
            for _ in range(iters):
                if _ % 100 == 0:
                    print('{} {}'.format(_, self.sess.run(self.model.likelihood_tensor)))
                if self.ng:
                    self.sess.run(self.ng)
                self.sess.run(self.adam)
        except KeyboardInterrupt:
            pass

        self.model.anchor(session=self.sess)

    def _predict(self, Xs, S):
        ms, vs = [], []
        n = max(len(Xs) // 100, 1)  # predict in small batches
        for xs in np.array_split(Xs, n):
            m, v = self.model.predict_y(xs, S, session=self.sess)
            ms.append(m)
            vs.append(v)
        return np.concatenate(ms, 1), np.concatenate(vs, 1)  # num_posterior_samples, N_test, D_y

    def predict(self, Xs):
        ms, vs = self._predict(Xs, num_posterior_samples)

        # the first two moments
        m = np.average(ms, 0)
        v = np.average(vs + ms**2, 0) - m**2
        return m, v

    def sample(self, Xs, S):
        ms, vs = self._predict(Xs, S)
        return ms + vs**0.5 * np.random.randn(*ms.shape)
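# Hypothetical usage sketch for RegressionModel (not from the original script): assumes the
# module-level settings it reads (num_inducing, minibatch_size, initial_likelihood_var, gamma,
# adam_lr, iterations, small_iterations, num_posterior_samples) are defined elsewhere, as in
# the original benchmark code; the toy data below is an illustrative assumption.
import numpy as np

X = np.random.uniform(-3, 3, (200, 2))
Y = np.sin(X[:, :1]) + 0.1 * np.random.randn(200, 1)
Xs = np.random.uniform(-3, 3, (50, 2))

model = RegressionModel()
model.fit(X, Y)                      # builds the two-layer DGP and runs NatGrad + Adam
mean, var = model.predict(Xs)        # moment-matched predictive mean and variance
samples = model.sample(Xs, 10)       # 10 approximate posterior draws at Xs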
def _build_dgp_model(self, depth, sess, weight, X, Y, ls_scale, y_scale,
                     minibatch_size=500, Z=None, M=100, feature_trainable=False,
                     ls_init=(None, None, None), ls_trainable=(True, True, True),
                     likelihood_var_trainable=True, verbose=False):
    """Build the DGP model."""
    N, num_latent = Y.shape
    Z = kmeans2(X, M, minit='points')[0]

    with gp.defer_build():
        k_time = gp.kernels.RBF(
            1, active_dims=[0],
            lengthscales=[0.3 if ls_init[0] is None else ls_init[0] / ls_scale[0]])
        k_space = gp.kernels.RBF(
            2, active_dims=[1, 2],
            lengthscales=[0.3 if ls_init[1] is None else ls_init[1] / ls_scale[1]])
        k_freq = gp.kernels.RBF(
            1, active_dims=[3],
            lengthscales=[10. if ls_init[2] is None else ls_init[2] / ls_scale[2]])

        for k, f in zip([k_time, k_space, k_freq], ls_trainable):
            k.lengthscales.set_trainable(f)
            if not f:
                logging.warning("Setting {} non-trainable".format(k))

        k_time.lengthscales.prior = gp.priors.Gaussian(0, 1. / 3.)
        k_space.lengthscales.prior = gp.priors.Gaussian(1. / ls_scale[1], 0.5 / ls_scale[1])

        kern = k_time * k_space * k_freq
        mean = gp.mean_functions.Zero()

        kernels = [kern]
        for l in range(1, depth):
            kernels.append(RBF(4 - l, lengthscales=2., variance=2., ARD=True))
            #kernels[-1].lengthscales.prior = gp.priors.Gaussian(0, 1. / 3.)

        m = DGP(X, Y, Z, kernels, gp.likelihoods.Gaussian(),
                minibatch_size=minibatch_size,
                num_outputs=num_latent,
                num_samples=1)

        # start things deterministic
        for layer in m.layers[:-1]:
            layer.q_sqrt = layer.q_sqrt.value * 1e-5

        for layer in m.layers:
            layer.feature.Z.set_trainable(feature_trainable)

    m.compile()
    if verbose:
        logging.warning(m)
    return m
def get_test_error(i, dataset, alpha, learning_rate=0.001, iterations=20000,
                   white=True, normalized=True, num_inducing=100,
                   beta=None, gamma=None, div_weights=None):

    """STEP (1): Read in the data via the helpful 'Dataset' object."""
    data = datasets.all_datasets[dataset].get_data(seed=0, split=i, prop=0.9)
    X_train, Y_train, X_test, Y_test, Y_std = [
        data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
    ]
    print('N: {}, D: {}, Ns: {}, Y_std: {}'.format(
        X_train.shape[0], X_train.shape[1], X_test.shape[0], Y_std))

    Z = kmeans2(X_train, num_inducing, minit='points')[0]

    # Dimensionality of X
    D = X_train.shape[1]

    # the layer shapes are defined by the kernel dims, so here all
    # hidden layers are D dimensional
    kernels = []
    for l in range(L):
        kernels.append(RBF(D))

    # between layer noise (doesn't actually make much difference but we include it anyway)
    for kernel in kernels[:-1]:
        kernel += White(D, variance=1e-5)

    mb = 1000 if X_train.shape[0] > 1000 else None

    # get the likelihood model (possibly a robust one)
    if gamma is None and beta is None:
        # standard likelihood
        lklh = Gaussian()
    elif beta is not None and gamma is None:
        # beta-divergence robustified likelihood
        lklh = betaDivGaussian(beta)
    elif gamma is not None and beta is None:
        # gamma-divergence robustified likelihood
        lklh = gammaDivGaussian(gamma)
    else:
        print("ERROR! You have specified both beta and gamma. Either specify "
              "both as None (for standard Gaussian likelihood) or one of them "
              "as None (to use the other).")
        sys.exit()

    """STEP (2): Call 'DGP' for split i, which together with ADAM is responsible for the inference."""
    model = DGP(X_train, Y_train, Z, kernels, lklh,
                num_samples=K, minibatch_size=mb, alpha=alpha,
                white=white, div_weights=div_weights)

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    # Build functions for evaluating the test errors
    S = 100

    def batch_assess(model, assess_model, X, Y):
        n_batches = max(int(X.shape[0] / 1000.), 1)
        lik, sq_diff = [], []
        for X_batch, Y_batch in zip(np.array_split(X, n_batches),
                                    np.array_split(Y, n_batches)):
            l, sq = assess_model(model, X_batch, Y_batch)
            lik.append(l)
            sq_diff.append(sq)
        lik = np.concatenate(lik, 0)
        sq_diff = np.array(np.concatenate(sq_diff, 0), dtype=float)
        return np.average(lik), np.average(sq_diff)**0.5

    def assess_single_layer(model, X_batch, Y_batch):
        m, v = model.predict_y(X_batch)
        lik = np.sum(norm.logpdf(Y_batch * Y_std, loc=m * Y_std, scale=Y_std * v**0.5), 1)
        sq_diff = Y_std**2 * ((m - Y_batch)**2)
        return lik, sq_diff

    def assess_sampled(model, X_batch, Y_batch):
        m, v = model.predict_y(X_batch, S)
        S_lik = np.sum(norm.logpdf(Y_batch * Y_std, loc=m * Y_std, scale=Y_std * v**0.5), 2)
        lik = logsumexp(S_lik, 0, b=1 / float(S))

        mean = np.average(m, 0)
        sq_diff = Y_std**2 * ((mean - Y_batch)**2)
        return lik, sq_diff

    # Get start time
    start_time = time.time()

    # Fit to the training set via ADAM
    np.random.seed(1)
    AdamOptimizer(learning_rate).minimize(model, maxiter=iterations)

    # get running time
    running_time = time.time() - start_time

    s = 'time: {:.4f}, lik: {:.4f}, rmse: {:.4f}'

    """STEP (3): Extract and return test performance metrics to 'main'."""
    # Get test errors
    lik, rmse = batch_assess(model, assess_sampled, X_test, Y_test)
    print(s.format(running_time, lik, rmse))

    return -lik, rmse, running_time
#np.save('plots/svgp', m_sgp)

Yp_mat = np.reshape(m_sgp, (n, n))          # doing this for the heat map
Yp_mat_tr = np.transpose(np.fliplr(Yp_mat))
diff_sgp = m_sgp - Y_truth                  # difference between truth and predicted values
diff_mat = Yt_mat_tr - Yp_mat_tr

############################################################################
# Prediction using a single-layer DGP

kernels = [RBF(d, lengthscales=0.2, variance=1.)]

# Making a single-layer DGP model using DGP from the doubly_stochastic_dgp package
m_dgp = DGP(X, Y, Z, kernels, Gaussian_lik(), minibatch_size=None)
m_dgp.likelihood.likelihood.variance = 1e-2

# initializing the covariance of the variational distribution q(u) in each inner layer
for layer in m_dgp.layers[:-1]:
    layer.q_sqrt = layer.q_sqrt.value * 1e-5

iterations = 2000
start_time = time.time()
ScipyOptimizer().minimize(m_dgp, maxiter=iterations)
#AdamOptimizer(0.001).minimize(m_dgp, maxiter=iterations)
# Scipy is much better than Adam here for a single layer
t_op = time.time() - start_time
print("--- %s seconds ---" % (t_op))  # --- 21.741 seconds ---

m_dgp.read_trainables()
    dataset_name, L, split))
print('N: {}, D: {}, Ns: {}'.format(X.shape[0], X.shape[1], Xs.shape[0]))

Z = kmeans2(X, 100, minit='points')[0]

D = X.shape[1]
kernels = []
for l in range(L):
    kernels.append(RBF(D))

for kernel in kernels[:-1]:
    kernel += White(D, variance=2e-6)

mb = minibatch_size if X.shape[0] > minibatch_size else None
model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

# start the inner layers almost deterministically
for layer in model.layers[:-1]:
    layer.q_sqrt = layer.q_sqrt.value * 1e-5

model.likelihood.variance = 0.05

global_step = tf.Variable(0, trainable=False, name="global_step")
model.enquire_session().run(global_step.initializer)

s = "{}/{}_L{}_split{}".format(results_path, dataset_name, L, split)
fw = tf.summary.FileWriter(os.path.join(s.format(dataset_name, L)),
                           model.enquire_session().graph)

opt_method = gpflow_monitor.ManagedOptimisation(model, AdamOptimizer(0.01), global_step)
def test_vs_DGP2(self):
    lik = Gaussian()
    lik_var = 0.1
    lik.variance = lik_var

    N, Ns, D_Y, D_X = self.X.shape[0], self.Xs.shape[0], self.D_Y, self.X.shape[1]
    q_mu = np.random.randn(N, D_X)

    Y = np.random.randn(N, D_Y)
    Ys = np.random.randn(Ns, D_Y)

    kern1 = Matern52(self.X.shape[1], lengthscales=0.5)
    kern2 = Matern52(self.X.shape[1], lengthscales=0.5)
    kerns = [kern1, kern2]
    # mf = Linear(A=np.random.randn(D_X, D_Y), b=np.random.randn(D_Y))
    mf = Zero()

    m_dgp = DGP(self.X, Y, self.X, kerns, lik, mean_function=mf, white=True)
    m_dgp.layers[0].q_mu = q_mu
    m_dgp.layers[0].q_sqrt = m_dgp.layers[0].q_sqrt.read_value() * 1e-24

    Fs, ms, vs = m_dgp.predict_all_layers(self.Xs, 1)
    Z = self.X.copy()
    Z[:len(self.Xs)] = ms[0][0]
    m_dgp.layers[1].feature.Z = Z  # need to put the inducing points in the right place

    var_list = [[m_dgp.layers[1].q_mu, m_dgp.layers[1].q_sqrt]]
    NatGradOptimizer(gamma=1).minimize(m_dgp, var_list=var_list, maxiter=1)

    mean_dgp, var_dgp = m_dgp.predict_y(self.Xs, 1)
    test_lik_dgp = m_dgp.predict_density(self.Xs, Ys, 1)
    pred_m_dgp, pred_v_gpr = m_dgp.predict_f(self.Xs, 1)
    pred_mfull_dgp, pred_vfull_dgp = m_dgp.predict_f_full_cov(self.Xs, 1)

    # mean_functions = [Identity(), mf]
    layer0 = GPMC_Layer(kerns[0], self.X.copy(), D_X, Identity())
    layer1 = GPR_Layer(kerns[1], mf, D_Y)
    m_heinonen = DGP_Heinonen(self.X, Y, lik, [layer0, layer1])

    m_heinonen.layers[0].q_mu = q_mu

    mean_heinonen, var_heinonen = m_heinonen.predict_y(self.Xs, 1)
    test_lik_heinonen = m_heinonen.predict_density(self.Xs, Ys, 1)
    pred_m_heinonen, pred_v_heinonen = m_heinonen.predict_f(self.Xs, 1)
    pred_mfull_heinonen, pred_vfull_heinonen = m_heinonen.predict_f_full_cov(self.Xs, 1)

    tol = 1e-4
    assert_allclose(mean_dgp, mean_heinonen, atol=tol, rtol=tol)
    assert_allclose(test_lik_dgp, test_lik_heinonen, atol=tol, rtol=tol)
    assert_allclose(pred_m_dgp, pred_m_heinonen, atol=tol, rtol=tol)
    assert_allclose(pred_mfull_dgp, pred_mfull_heinonen, atol=tol, rtol=tol)
    assert_allclose(pred_vfull_dgp, pred_vfull_heinonen, atol=tol, rtol=tol)
def compare_to_single_layer(self, Y, Ys, lik, L, white, num_outputs=None):
    kern = Matern52(self.X.shape[1], lengthscales=0.5)

    m_svgp = SVGP(self.X, Y, kern, lik, Z=self.X, whiten=white, num_latent=num_outputs)
    m_svgp.q_mu = self.q_mu
    m_svgp.q_sqrt = self.q_sqrt

    L_svgp = m_svgp.compute_log_likelihood()
    mean_svgp, var_svgp = m_svgp.predict_y(self.Xs)
    test_lik_svgp = m_svgp.predict_density(self.Xs, Ys)
    pred_m_svgp, pred_v_svgp = m_svgp.predict_f(self.Xs)
    pred_mfull_svgp, pred_vfull_svgp = m_svgp.predict_f_full_cov(self.Xs)

    kerns = []
    for _ in range(L - 1):
        class NoTransformMatern52(Matern52):
            def __init__(self, *args, variance=1., **kwargs):
                Matern52.__init__(self, *args, **kwargs)
                del self.variance
                self.variance = Parameter(variance)

        kerns.append(NoTransformMatern52(self.X.shape[1], variance=1e-24, lengthscales=0.5))
    kerns.append(kern)

    m_dgp = DGP(self.X, Y, self.X, kerns, lik, white=white, num_samples=2, num_outputs=num_outputs)
    m_dgp.layers[-1].q_mu = self.q_mu
    m_dgp.layers[-1].q_sqrt = self.q_sqrt

    L_dgp = m_dgp.compute_log_likelihood()
    mean_dgp, var_dgp = m_dgp.predict_y(self.Xs, 1)
    test_lik_dgp = m_dgp.predict_density(self.Xs, Ys, 1)
    pred_m_dgp, pred_v_dgp = m_dgp.predict_f(self.Xs, 1)
    pred_mfull_dgp, pred_vfull_dgp = m_dgp.predict_f_full_cov(self.Xs, 1)

    if L == 1:  # these should all be exactly the same
        atol = 1e-7
        rtol = 1e-7
    else:  # jitter makes these not exactly equal
        atol = 1e-6
        rtol = 1e-6

    assert_allclose(L_svgp, L_dgp, rtol=rtol, atol=atol)
    assert_allclose(mean_svgp, mean_dgp[0], rtol=rtol, atol=atol)
    assert_allclose(var_svgp, var_dgp[0], rtol=rtol, atol=atol)
    assert_allclose(test_lik_svgp, test_lik_dgp, rtol=rtol, atol=atol)
    assert_allclose(pred_m_dgp[0], pred_m_svgp, rtol=rtol, atol=atol)
    assert_allclose(pred_v_dgp[0], pred_v_svgp, rtol=rtol, atol=atol)
    assert_allclose(pred_mfull_dgp[0], pred_mfull_svgp, rtol=rtol, atol=atol)
    assert_allclose(pred_vfull_dgp[0], pred_vfull_svgp, rtol=rtol, atol=atol)