def make_dgp(X, Y, Z, L):
    D = X.shape[1]
    Y_mean, Y_std = np.average(Y), np.std(Y)

    # the layer shapes are defined by the kernel dims, so here all hidden layers are D-dimensional
    kernels = []
    for l in range(L):
        kernels.append(RBF(D, lengthscales=1., variance=1.))

    # between-layer noise (doesn't actually make much difference, but we include it anyway);
    # the Sum kernel must be written back into the list, otherwise the loop has no effect
    for i in range(L - 1):
        kernels[i] += White(D, variance=1e-5)

    mb = 10000 if X.shape[0] > 10000 else None
    model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

    # same final-layer inits we used for the single-layer model
    model.layers[-1].kern.variance = Y_std**2
    model.likelihood.variance = Y_std * 0.1
    model.layers[-1].mean_function = Constant(Y_mean)
    model.layers[-1].mean_function.fixed = True

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model
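# A minimal usage sketch for the constructor above, assuming GPflow 1.x and the
# Doubly-Stochastic-DGP package; the toy data, the k-means inducing-point
# initialisation, and the optimiser settings are illustrative, not part of the
# original snippet.
import numpy as np
from scipy.cluster.vq import kmeans2
from gpflow.train import AdamOptimizer

X = np.random.randn(500, 4)
Y = np.sin(X[:, :1]) + 0.1 * np.random.randn(500, 1)
Z = kmeans2(X, 50, minit='points')[0]  # inducing inputs via k-means

model = make_dgp(X, Y, Z, L=2)
AdamOptimizer(0.01).minimize(model, maxiter=1000)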
def make_dgp(L):
    # kernels = [ckern.WeightedColourPatchConv(RBF(25 * 1, lengthscales=10., variance=10.),
    #                                          [28, 28], [5, 5], colour_channels=1)]
    kernels = [RBF(784, lengthscales=10., variance=10.)]
    for l in range(L - 1):
        kernels.append(RBF(50, lengthscales=10., variance=10.))

    model = DGP(X, Y, Z, kernels,
                gpflow.likelihoods.MultiClass(num_classes),
                minibatch_size=minibatch_size,
                num_outputs=num_classes,
                dropout=0.0)

    # start things deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model
def make_dgp_as_sgp(kernels):
    m_dgp = DGP(X, Y, Z, kernels, Gaussian())

    # set the final layer to the fitted single-layer SGP parameters
    m_dgp.layers[-1].kern.lengthscales = ls
    m_dgp.layers[-1].kern.variance = s
    m_dgp.likelihood.variance = noise
    m_dgp.layers[-1].q_mu = q_mu
    m_dgp.layers[-1].q_sqrt = q_sqrt

    # set the other layers to the identity
    # (a variance of 1e-6 still gives errors of ~1e-3, so it needs to go right down)
    for layer in m_dgp.layers[:-1]:
        layer.kern.variance.transform._lower = 1e-18
        layer.kern.variance = 1e-18

    return m_dgp
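# The globals ls, s, noise, q_mu, and q_sqrt above come from a fitted
# single-layer sparse GP. A minimal sketch of how they might be obtained,
# assuming GPflow 1.x and the X, Y, Z, RBF, Gaussian from the surrounding
# snippets; the SVGP model m_sgp and its training call are hypothetical,
# not part of the original snippet.
import gpflow

m_sgp = gpflow.models.SVGP(X, Y, RBF(X.shape[1]), Gaussian(), Z=Z)
gpflow.train.ScipyOptimizer().minimize(m_sgp)

ls = m_sgp.kern.lengthscales.value
s = m_sgp.kern.variance.value
noise = m_sgp.likelihood.variance.value
q_mu = m_sgp.q_mu.value
q_sqrt = m_sgp.q_sqrt.value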
def make_dgp(X, Y, Z, L):
    Y_mean, Y_std = np.average(Y), np.std(Y)

    # the layer shapes are defined by the kernel dims; here every layer is
    # hard-coded to the 17-dimensional input
    kernels = []
    for l in range(L):
        kernels.append(RBF(input_dim=17, ARD=True))

    mb = 128 if X.shape[0] > 128 else None
    model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

    # same final-layer inits we used for the single-layer model
    # model.layers[-1].kern.variance = Y_std**2
    model.likelihood.variance = 0.01
    # model.layers[-1].mean_function = Constant(Y_mean)
    # model.layers[-1].mean_function.fixed = True

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model
def main(args):
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.nll'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.time'
    outfile2 = open(outname2, 'w')

    running_loss = 0
    running_time = 0
    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]

        # inducing points via k-means
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
            .prefetch(X.shape[0] // 2)\
            .shuffle(buffer_size=(X.shape[0] // 2))\
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        for l in range(args.num_layers):
            kernels.append(SquaredExponential() + White(variance=1e-5))

        dgp_model = DGP(X.shape[1],
                        kernels,
                        Gaussian(variance=0.05),
                        Z,
                        num_outputs=Y.shape[1],
                        num_samples=args.num_samples,
                        num_data=X.shape[0])

        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        optimiser = tf.optimizers.Adam(args.learning_rate)

        def optimisation_step(model, X, Y):
            with tf.GradientTape() as tape:
                tape.watch(model.trainable_variables)
                obj = -model.elbo(X, Y, full_cov=False)
            grad = tape.gradient(obj, model.trainable_variables)
            optimiser.apply_gradients(zip(grad, model.trainable_variables))

        def monitored_training_loop(model, train_dataset, logdir, iterations,
                                    logging_iter_freq):
            # TODO: use tensorboard to log trainables and performance
            tf_optimisation_step = tf.function(optimisation_step)
            batches = iter(train_dataset)
            for i in range(iterations):
                X, Y = next(batches)
                tf_optimisation_step(model, X, Y)
                iter_id = i + 1
                if iter_id % logging_iter_freq == 0:
                    tf.print(
                        f'Epoch {iter_id}: ELBO (batch) {model.elbo(X, Y)}')

        print('Training DGP model...')
        t0 = time.time()
        monitored_training_loop(dgp_model,
                                train_dataset,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        print('Time taken to train: {}'.format(t1 - t0))
        outfile2.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_time += t1 - t0

        m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, m * Y_std, v**0.5 * Y_std),
                      0,
                      b=1 / float(args.test_samples)))
        print('Average test log likelihood: {}'.format(test_nll))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Average: {}\n'.format(running_time / args.splits))
    outfile1.close()
    outfile2.close()
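# The test metric above averages the predictive density over S posterior
# samples before taking the log, i.e. log (1/S) sum_s N(y; m_s, v_s),
# computed stably with logsumexp. A standalone sketch of that computation
# with toy shapes (all names here are illustrative):
import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

S, N, D = 10, 5, 1
m = np.random.randn(S, N, D)           # per-sample predictive means
v = np.exp(np.random.randn(S, N, D))   # per-sample predictive variances
Ys = np.random.randn(N, D)             # test targets

log_p = logsumexp(norm.logpdf(Ys, m, np.sqrt(v)), axis=0, b=1.0 / S)
test_nll = np.mean(log_p)              # higher is better: it is a log-likelihood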
kernels = [
    RBF(1, lengthscales=0.2, variance=1),
    RBF(2, lengthscales=0.2, variance=1)
]

N, M = 50, 25
X = np.random.uniform(0, 1, N)[:, None]
Z = np.random.uniform(0, 1, M)[:, None]
f = lambda x: 0. if x < 0.5 else 1.
Y = np.reshape([f(x) for x in X], X.shape) + np.random.randn(*X.shape) * 1e-2

m_dgp = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1)

# start the inner layers almost deterministically
for layer in m_dgp.layers[:-1]:
    layer.q_sqrt = layer.q_sqrt.value * 1e-5


class CB(object):
    def __init__(self, model, record_every=10):
        self.model = model
        self.i = 0
        self.res = []
        self.record_every = record_every

    def cb(self, x):
        self.i += 1
        if self.i % self.record_every == 0:
            # restore the model parameters from the optimiser's flat vector
            self.model.set_state(x)
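# A sketch of how the callback above might be wired in, assuming the
# GPflow 0.x Model.optimize(callback=...) interface that set_state implies;
# the maxiter value is illustrative.
cb = CB(m_dgp, record_every=10)
m_dgp.optimize(maxiter=2000, callback=cb.cb)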
def main(args):
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')
    outname3 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    # =========================================================================
    # CROSS-VALIDATION LOOP
    # =========================================================================
    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)

    for i in range(args.splits):
        # =====================================================================
        # MODEL CONSTRUCTION
        # =====================================================================
        print('Split: {}'.format(i))
        print('Getting dataset...')

        # get dataset
        data = datasets.all_datasets[args.dataset].get_data(
            i, normalize=args.normalize_data)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]

        # inducing points via k-means
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
            .prefetch(X.shape[0] // 2)\
            .shuffle(buffer_size=(X.shape[0] // 2))\
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        dims = []
        # hidden_dim = min(args.max_dim, X.shape[1])
        hidden_dim = X.shape[1] if X.shape[1] < args.max_dim else args.max_dim
        for l in range(args.num_layers):
            if l == 0:
                dim = X.shape[1]
            else:
                dim = hidden_dim
            dims.append(dim)
            if args.ard:
                # SE kernel with a lengthscale per dimension
                kernels.append(
                    SquaredExponential(lengthscale=[1.] * dim) +
                    White(variance=1e-5))
            else:
                # SE kernel with a single lengthscale
                kernels.append(
                    SquaredExponential(lengthscale=1.) +
                    White(variance=1e-5))

        # output dim
        dims.append(Y.shape[1])

        dgp_model = DGP(X,
                        Y,
                        Z,
                        dims,
                        kernels,
                        Gaussian(variance=0.05),
                        num_samples=args.num_samples,
                        num_data=X.shape[0])

        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        # =====================================================================
        # TRAINING
        # =====================================================================
        optimiser = tf.optimizers.Adam(args.learning_rate)

        print('Training DGP model...')
        t0 = time.time()
        monitored_training_loop(dgp_model,
                                train_dataset,
                                optimiser=optimiser,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        # =====================================================================
        # TESTING
        # =====================================================================
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # minibatch test predictions
        means, variances = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            for mb in range(-(-len(Xs) // test_batch_size)):  # ceil division
                m, v = dgp_model.predict_y(
                    Xs[mb * test_batch_size:(mb + 1) * test_batch_size, :],
                    num_samples=args.test_samples)
                means.append(m)
                variances.append(v)
        else:
            m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
            means.append(m)
            variances.append(v)

        mean_SND = np.concatenate(means, 1)      # [S, N, D]
        var_SND = np.concatenate(variances, 1)   # [S, N, D]
        mean_ND = np.mean(mean_SND, 0)           # [N, D]

        # rmse
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND)**2.0)**0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        # nll
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, mean_SND * Y_std,
                                  var_SND**0.5 * Y_std),
                      0,
                      b=1 / float(args.test_samples)))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()
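# monitored_training_loop is called above but not defined in this snippet.
# A minimal sketch consistent with the call site and with the nested version
# in the earlier main(), assuming the model exposes elbo(X, Y); the logging
# format is illustrative.
def monitored_training_loop(model, train_dataset, optimiser, logdir,
                            iterations, logging_iter_freq):
    @tf.function
    def optimisation_step(X, Y):
        with tf.GradientTape() as tape:
            obj = -model.elbo(X, Y, full_cov=False)
        grads = tape.gradient(obj, model.trainable_variables)
        optimiser.apply_gradients(zip(grads, model.trainable_variables))

    batches = iter(train_dataset)
    for i in range(iterations):
        X, Y = next(batches)
        optimisation_step(X, Y)
        if (i + 1) % logging_iter_freq == 0:
            tf.print(f'Iteration {i + 1}: ELBO (batch) {model.elbo(X, Y)}')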
class RegressionModel(object):
    def __init__(self,
                 lr,
                 max_iterations,
                 n_layers=5,
                 num_inducing=128,
                 minibatch_size=10000,
                 n_posterior_samples=100,
                 ard=True):
        tf.reset_default_graph()
        ARGS = {
            "n_layers": n_layers,
            "num_inducing": num_inducing,
            "iterations": max_iterations,
            "minibatch_size": minibatch_size,
            "n_posterior_samples": n_posterior_samples,
            "ard": ard,
            "lr": lr
        }
        self.ARGS = ARGS
        self.model = None
        print("================ Regression Model =================")
        print("ARD is {}".format(self.ARGS["ard"]))

    def fit(self, X, Y, Xs, Ys, Y_std):
        lik = Gaussian(np.var(Y, 0))  # initialise with the variance of Y
        return self._fit(X, Y, Xs, Ys, Y_std, lik)

    def _fit(self, X, Y, Xs, Ys, Y_std, lik, **kwargs):
        if len(Y.shape) == 1:
            Y = Y[:, None]
        kerns = []
        if not self.model:
            with tf.variable_scope('theta'):
                for _ in range(self.ARGS["n_layers"]):
                    kerns.append(
                        SquaredExponential(
                            X.shape[1],
                            ARD=self.ARGS["ard"],
                            lengthscales=float(X.shape[1])**0.5))
                minibatch_size = self.ARGS["minibatch_size"] \
                    if X.shape[0] > self.ARGS["minibatch_size"] else X.shape[0]
                self.model = DGP(X=X,
                                 Y=Y,
                                 n_inducing=self.ARGS["num_inducing"],
                                 kernels=kerns,
                                 likelihood=lik,
                                 minibatch_size=minibatch_size,
                                 adam_lr=self.ARGS["lr"],
                                 **kwargs)
        self.model.reset(X, Y)
        try:
            for _ in range(self.ARGS["iterations"]):
                self.model.train_hypers()
                if _ % 50 == 1:
                    print('Iteration {}:'.format(_))
                    self.model.print_sample_performance()
                    m, v = self.predict(Xs)
                    print(
                        '######## Test set MLL:',
                        np.mean(
                            norm.logpdf(Y_std * Ys, Y_std * m,
                                        Y_std * np.sqrt(v))))
        except KeyboardInterrupt:  # pragma: no cover
            pass

    def _predict(self, Xs, S):
        ms, vs = [], []
        n = max(len(Xs) // 100, 1)  # predict in small batches
        for xs in np.array_split(Xs, n):
            m, v = self.model.predict_y(xs, S)
            ms.append(m)
            vs.append(v)
        return np.concatenate(ms, 1), np.concatenate(vs, 1)

    def predict(self, Xs):
        ms, vs = self._predict(Xs, self.ARGS["n_posterior_samples"])
        # moment-match the mixture over posterior samples
        m = np.average(ms, 0)
        v = np.average(vs + ms**2, 0) - m**2
        return m, v
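# Usage sketch for the class above; the data arrays and hyperparameter values
# are illustrative.
model = RegressionModel(lr=0.01, max_iterations=5000)
model.fit(X, Y, Xs, Ys, Y_std)  # prints the test-set MLL every 50 iterations
m, v = model.predict(Xs)        # moment-matched predictive mean and variance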
class ClassificationModel(object):
    def __init__(self, layers, inducing):
        class ARGS:
            n_layers = layers
            iterations = 1001
            minibatch_size = 256
            n_posterior_samples = 100
            n_inducing = inducing
            inter_dim = 98

        self.ARGS = ARGS
        self.model = None

    def fit(self, X, Y):
        # lik = Gaussian(np.var(Y, 0))  # initialise with the variance of Y
        lik = None  # the multi-class likelihood is built inside _fit
        return self._fit(X, Y, lik)

    def _fit(self, X, Y, lik, **kwargs):
        if len(Y.shape) == 1:
            Y = Y[:, None]
        kerns = []
        if not self.model:
            with tf.variable_scope('theta'):
                for _ in range(self.ARGS.n_layers):
                    if _ == 0:
                        kerns.append(
                            SquaredExponential(
                                X.shape[1],
                                ARD=True,
                                lengthscales=float(X.shape[1])**0.5))
                    else:
                        kerns.append(
                            SquaredExponential(
                                self.ARGS.inter_dim,
                                ARD=True,
                                lengthscales=float(self.ARGS.inter_dim)**0.5))
                lik = MultiClass(10)
                minibatch_size = self.ARGS.minibatch_size \
                    if X.shape[0] > self.ARGS.minibatch_size else X.shape[0]
                self.model = DGP(X=X,
                                 Y=Y,
                                 n_inducing=self.ARGS.n_inducing,
                                 kernels=kerns,
                                 likelihood=lik,
                                 minibatch_size=minibatch_size,
                                 inter_dim=self.ARGS.inter_dim,
                                 **kwargs)
        self.model.reset(X, Y)
        try:
            for _ in range(self.ARGS.iterations):
                self.model.train_hypers()
                if _ % 50 == 1:
                    print('Iteration {}'.format(_))
                    self.model.print_sample_performance()
        except KeyboardInterrupt:  # pragma: no cover
            pass

    def _predict(self, Xs, S):
        ms, vs = [], []
        n = max(len(Xs) // 100, 1)  # predict in small batches
        for xs in np.array_split(Xs, n):
            m, v = self.model.predict_y(xs, S)
            ms.append(m)
            vs.append(v)
        # [n_posterior_samples, N_test, D_y]
        return np.concatenate(ms, 1), np.concatenate(vs, 1)

    def predict(self, Xs):
        ms, vs = self._predict(Xs, self.ARGS.n_posterior_samples)
        # the first two moments of the mixture over posterior samples
        m = np.average(ms, 0)
        v = np.average(vs + ms**2, 0) - m**2
        return m, v
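# Usage sketch for the class above; note that MultiClass(10) hard-codes ten
# classes, so Y should contain integer labels in {0, ..., 9}. The data names
# are illustrative.
model = ClassificationModel(layers=2, inducing=100)
model.fit(X_train, Y_train)
probs, _ = model.predict(X_test)   # [N_test, 10] class probabilities
y_pred = np.argmax(probs, axis=1)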