def train_scipy(m, maxiter=2000, step=True):
    """Train a GPflow model with the Scipy BFGS optimizer.

    :param m: GPflow model exposing ``training_loss``, ``elbo`` and
        ``trainable_variables``.
    :param maxiter: maximum optimizer iterations (capped by ``ci_niter``).
    :param step: if True, log the ELBO after every optimizer step.
    :return: list of logged ELBO values (empty when ``step`` is False).
    """
    log_elbo = []
    # log_pi = []

    def step_callback(step_id, variables, values):
        # Renamed first arg from `step` to `step_id` — it shadowed the
        # outer `step` flag.
        elbo = m.elbo()
        print('step {} elbo: {}'.format(step_id, elbo))
        log_elbo.append(elbo)
        # log_pi.append(m.pi.numpy())

    opt = gpflow.optimizers.Scipy()
    # Fix: the original duplicated the whole minimize() call just to toggle
    # step_callback; attach the callback conditionally instead.
    extra = dict(step_callback=step_callback) if step else {}
    _ = opt.minimize(
        m.training_loss,
        method="BFGS",
        variables=m.trainable_variables,
        options=dict(maxiter=ci_niter(maxiter), disp=True),
        compile=True,
        **extra,
    )
    # return (log_elbo, log_pi)
    return log_elbo
def _optimize_model_with_gradienttape(self, train_data, **kwargs):
    """
    Optimize model using the TensorFlow GradientTape with batch optimization.

    Keyword args consumed: ``batch_size`` (32), ``optimizer`` (Adam),
    ``epochs`` (100), ``logging_epoch_freq`` (1), ``test_data`` (None).
    """
    # Build an infinitely repeating, shuffled, batched dataset.
    num_train_data = train_data[0].shape[0]
    train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
    batch_size = kwargs.pop('batch_size', 32)
    prefetch_size = tf.data.experimental.AUTOTUNE
    shuffle_buffer_size = num_train_data // 2
    num_batches_per_epoch = num_train_data // batch_size
    # Fix: prefetch must be the LAST pipeline stage (shuffle -> batch ->
    # prefetch). The original prefetched raw elements before shuffling and
    # batching, which defeats the purpose of prefetching prepared batches.
    train_dataset = (
        train_dataset.repeat()
        .shuffle(buffer_size=shuffle_buffer_size)
        .batch(batch_size)
        .prefetch(prefetch_size)
    )
    batches = iter(train_dataset)
    optimizer = kwargs.pop('optimizer', tf.optimizers.Adam())
    epochs = kwargs.pop('epochs', 100)
    logging_epoch_freq = kwargs.pop('logging_epoch_freq', 1)
    test_data = kwargs.pop('test_data', None)
    for epoch in range(epochs):
        for _ in range(ci_niter(num_batches_per_epoch)):
            grads = self.stochastic_gradient(next(batches))
            optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        epoch_id = epoch + 1
        if epoch_id % logging_epoch_freq == 0:
            if test_data is None:
                tf.print(f"Epoch {epoch_id}: ELBO (train) {self.model.elbo(train_data)}")
            else:
                tf.print(f"Epoch {epoch_id}: ELBO (train) {self.model.elbo(train_data)}; ELBO (test) {self.model.elbo(test_data)}")
def train_exact_heteroskedastic(
        model: gpflow.models.VGP,
        optimizer: tf.optimizers = None,
        natgrad_opt: gpflow.optimizers = None,
        epochs: int = 100,
        logging_epoch_freq: int = 10):
    """Training loop for heteroskedastic GP.

    Alternates natural-gradient steps on (q_mu, q_sqrt) with Adam steps on
    the remaining trainable variables, then plots the loss trace.

    Fixes:
    - The optimizers were instantiated in the ``def`` line, so one shared
      Adam/NaturalGradient object (with its accumulated slot state) was
      reused across every call. ``None`` defaults keep the interface
      compatible while creating fresh optimizers per call.
    - ``plt.plot(range(epochs), loss)`` crashed when ``ci_niter`` capped the
      epoch count; plot against ``len(loss)`` instead.
    """
    if optimizer is None:
        optimizer = tf.optimizers.Adam(learning_rate=0.1)
    if natgrad_opt is None:
        natgrad_opt = gpflow.optimizers.NaturalGradient(gamma=1.0)
    # q_mu / q_sqrt are updated by natural gradients, not Adam.
    set_trainable(model.q_mu, False)
    set_trainable(model.q_sqrt, False)
    set_trainable(model.mean_function, False)
    loss = list()
    for epoch in range(ci_niter(epochs)):
        epoch_id = epoch + 1
        natgrad_opt.minimize(model.training_loss, [(model.q_mu, model.q_sqrt)])
        optimizer.minimize(model.training_loss, model.trainable_variables)
        loss.append(model.training_loss())
        if epoch_id % logging_epoch_freq == 0:
            tf.print(f"Epoch {epoch_id}: LOSS (train) {model.training_loss()}")
    plt.plot(range(len(loss)), loss)
    plt.xlabel('Epoch', fontsize=25)
    plt.ylabel('Loss', fontsize=25)
    plt.tight_layout()
def train_model(self, model, t, x, t_hourly_out):
    """Train a sparse-variational GP on (t, x) and predict on an hourly grid.

    :param model: GPflow model passed to ``SVGaussianProcess.run_optimization``.
    :param t: training inputs; ``t.shape[0]`` sizes the shuffle buffer.
    :param x: training targets.
    :param t_hourly_out: prediction inputs (hourly grid).
    :return: (mean, std, iter_loglikelihood) — mean/std as (N, 1) numpy arrays.
    """
    # Dataset: repeat forever, shuffle over the full training set with a
    # fixed seed for reproducibility.
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (t, x)).repeat().shuffle(buffer_size=t.shape[0], seed=Const.RANDOM_SEED)
    # Training
    start = time.time()
    iter_loglikelihood = SVGaussianProcess.run_optimization(
        model=model,
        iterations=ci_niter(self.max_iterations),
        train_dataset=train_dataset,
        minibatch_size=self.minibatch_size)
    end = time.time()
    logging.info("Training finished after: {:>10} sec".format(end - start))
    logging.info("Trained model.\n\n" + str(get_summary(model)) + "\n")
    # Prediction: predict_y returns mean and variance; convert variance to
    # a standard deviation and flatten to 1-D before the final reshape.
    signal_hourly_out, signal_var_hourly_out = model.predict_y(
        t_hourly_out)
    signal_std_hourly_out = tf.sqrt(signal_var_hourly_out)
    signal_hourly_out = tf.reshape(signal_hourly_out, [-1]).numpy()
    signal_std_hourly_out = tf.reshape(signal_std_hourly_out, [-1]).numpy()
    return signal_hourly_out.reshape(-1, 1), signal_std_hourly_out.reshape(
        -1, 1), iter_loglikelihood
def checkpointing_training_loop(
    model: gpflow.models.SVGP,
    batch_size: int,
    epochs: int,
    manager: tf.train.CheckpointManager,
    logging_epoch_freq: int = 100,
    epoch_var: Optional[tf.Variable] = None,
    step_var: Optional[tf.Variable] = None,
):
    """Mini-batch training loop that checkpoints every ``logging_epoch_freq`` epochs.

    Relies on module-level globals: ``optimization_step``, ``train_dataset``,
    ``num_batches_per_epoch`` and ``data``. If given, ``epoch_var`` and
    ``step_var`` are assigned in place so training progress is stored inside
    the checkpoints.

    NOTE(review): ``batch_size`` is never read here — the batch size is baked
    into the global ``train_dataset``; confirm before removing the parameter.
    """
    tf_optimization_step = tf.function(optimization_step)  # compile once
    batches = iter(train_dataset)
    for epoch in range(epochs):
        for step in range(ci_niter(num_batches_per_epoch)):
            tf_optimization_step(model, next(batches))
            if step_var is not None:
                # Global step counter across epochs (1-based).
                step_var.assign(epoch * num_batches_per_epoch + step + 1)
        if epoch_var is not None:
            epoch_var.assign(epoch + 1)
        epoch_id = epoch + 1
        if epoch_id % logging_epoch_freq == 0:
            ckpt_path = manager.save()
            tf.print(
                f"Epoch {epoch_id}: ELBO (train) {model.elbo(data)}, saved at {ckpt_path}"
            )
def analyze(f, title="Plot"):
    """Fit a two-group SwitchedLikelihood VGP to data from ``f`` and save plots.

    :param f: callable returning (X, Y, groups); groups index the likelihood.
    :param title: plot title, also used as the saved-figure filename stem.

    Fix: removed the unused ``adam = tf.optimizers.Adam()`` local (only
    natural gradients are used) and dead commented-out plotting code.
    """
    X, Y, groups = f()
    Y_data = np.hstack([Y, groups])
    likelihood = gpflow.likelihoods.SwitchedLikelihood([
        gpflow.likelihoods.Gaussian(variance=1.0),
        gpflow.likelihoods.Gaussian(variance=1.0)
    ])
    # model construction (notice that num_latent_gps is 1)
    natgrad = NaturalGradient(gamma=1.0)
    kernel = gpflow.kernels.Matern52(lengthscales=0.5)
    model = gpflow.models.VGP((X, Y_data),
                              kernel=kernel,
                              likelihood=likelihood,
                              num_latent_gps=1)
    # here's a plot of the raw data.
    fig, ax = plt.subplots(1, 1, figsize=(12, 6))
    _ = ax.plot(X, Y_data, "kx")
    plt.xlabel("Minutes")
    plt.ylabel("Value")
    plt.title(title)
    plt.savefig(title + '.png')
    # With a Gaussian likelihood, natural-gradient steps on (q_mu, q_sqrt)
    # optimize the variational distribution directly.
    for _ in range(ci_niter(1000)):
        natgrad.minimize(model.training_loss, [(model.q_mu, model.q_sqrt)])
    # let's do some plotting!
    xx = np.linspace(0, 30, 200)[:, None]
    mu, var = model.predict_f(xx)
    plt.figure(figsize=(12, 6))
    plt.plot(xx, mu, "C0")
    plt.plot(xx, mu + 2 * np.sqrt(var), "C0", lw=0.5)
    plt.plot(xx, mu - 2 * np.sqrt(var), "C0", lw=0.5)
    plt.plot(X, Y, "C1x", mew=2)
    plt.xlabel("Minutes")
    plt.ylabel("Value")
    plt.title(title)
    plt.savefig(title + ' GP model.png')
    print_summary(model)
    return
def simple_training_loop(model: gpflow.models.SVGP,
                         epochs: int = 1,
                         logging_epoch_freq: int = 10):
    """Minimal mini-batch training loop, logging the ELBO periodically.

    Relies on module-level globals: ``optimization_step``, ``train_dataset``,
    ``num_batches_per_epoch`` and ``data``.
    """
    tf_optimization_step = tf.function(optimization_step)  # compile once
    batches = iter(train_dataset)
    for epoch in range(epochs):
        for _ in range(ci_niter(num_batches_per_epoch)):
            tf_optimization_step(model, next(batches))
        epoch_id = epoch + 1
        if epoch_id % logging_epoch_freq == 0:
            tf.print(f"Epoch {epoch_id}: ELBO (train) {model.elbo(data)}")
def hmcmc(self, model):
    """Sample the model's hyperparameter posterior with adaptive HMC.

    Gamma(1, 1) priors are placed on the kernel and likelihood
    hyperparameters (as float64, since tfp.distributions infer dtype from
    their parameters).

    :param model: GPflow model with kernel/likelihood hyperparameters.
    :return: ``(samples, traces)`` from ``tfp.mcmc.sample_chain``.

    Fix: the original computed the chain and silently discarded it
    (implicitly returning None); the results are now returned.
    """
    if logger.isEnabledFor(logging.INFO):
        logger.info('here in the hmcmc method')
    # we add priors to the hyperparameters.
    # tfp.distributions dtype is inferred from parameters - so convert to 64-bit
    model.kernel.lengthscales.prior = tfd.Gamma(f64(1.0), f64(1.0))
    model.kernel.variance.prior = tfd.Gamma(f64(1.0), f64(1.0))
    model.likelihood.variance.prior = tfd.Gamma(f64(1.0), f64(1.0))
    # model.mean_function.A.prior = tfd.Normal(f64(0.0), f64(10.0))
    # model.mean_function.b.prior = tfd.Normal(f64(0.0), f64(10.0))
    num_burnin_steps = ci_niter(500)
    num_samples = ci_niter(1000)
    # Note that here we need model.trainable_parameters, not trainable_variables - only parameters can have priors!
    hmc_helper = gpflow.optimizers.SamplingHelper(
        model.log_posterior_density, model.trainable_parameters)
    hmc = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=hmc_helper.target_log_prob_fn,
        num_leapfrog_steps=10,
        step_size=0.01)
    adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation(
        hmc,
        num_adaptation_steps=10,
        target_accept_prob=f64(0.75),
        adaptation_rate=0.1)

    @tf.function
    def run_chain_fn():
        return tfp.mcmc.sample_chain(
            num_results=num_samples,
            num_burnin_steps=num_burnin_steps,
            current_state=hmc_helper.current_state,
            kernel=adaptive_hmc,
            trace_fn=lambda _, pkr: pkr.inner_results.is_accepted,
        )

    samples, traces = run_chain_fn()
    return samples, traces
def sample_f(self): """ Runs MCMC to sample posterior functions. """ # add priors to the hyperparameters. self.model.kernel.lengthscales.prior = tfd.Gamma(f64(1.0), f64(1.0)) self.model.kernel.variance.prior = tfd.Gamma(f64(1.0), f64(1.0)) self.model.likelihood.variance.prior = tfd.Gamma(f64(1.0), f64(1.0)) if self.mean_function is not None: self.model.mean_function.A.prior = tfd.Normal(f64(0.0), f64(10.0)) self.model.mean_function.b.prior = tfd.Normal(f64(0.0), f64(10.0)) # sample from the posterior using HMC (required to estimate epistemic uncertainty) num_burnin_steps = ci_niter(300) num_samples = ci_niter(self.num_samples) # Note that here we need model.trainable_parameters, not trainable_variables - only parameters can have priors! self.hmc_helper = gpflow.optimizers.SamplingHelper( self.model.log_posterior_density, self.model.trainable_parameters) hmc = tfp.mcmc.HamiltonianMonteCarlo( target_log_prob_fn=self.hmc_helper.target_log_prob_fn, num_leapfrog_steps=10, step_size=0.01) adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation( hmc, num_adaptation_steps=10, target_accept_prob=f64(0.75), adaptation_rate=0.1) @tf.function def run_chain_fn(): return tfp.mcmc.sample_chain( num_results=num_samples, num_burnin_steps=num_burnin_steps, current_state=self.hmc_helper.current_state, kernel=adaptive_hmc, trace_fn=lambda _, pkr: pkr.inner_results.is_accepted, ) self.samples, traces = run_chain_fn()
def monitored_training_loop(epochs: int):
    """Mini-batch training loop that calls the global ``monitor`` once per epoch.

    Relies on module-level globals: ``optimization_step``, ``train_dataset``,
    ``num_batches_per_epoch``, ``model``, ``data`` and ``monitor``.
    """
    tf_optimization_step = tf.function(optimization_step)  # compile once
    batches = iter(train_dataset)
    for epoch in range(epochs):
        for _ in range(ci_niter(num_batches_per_epoch)):
            batch = next(batches)
            tf_optimization_step(model, batch)
        epoch_id = epoch + 1
        # Monitor receives both the 0-based epoch and the 1-based epoch_id.
        monitor(epoch, epoch_id=epoch_id, data=data)
def _optimize(self):
    """Optimize the cost and revenue GP models with separate Adam optimizers.

    Runs a fixed number of Adam steps on ``self.model_cost`` and then on
    ``self.model_rev``, logging the final model summaries.

    NOTE(review): the ``tf.function``-wrapped step closures return nothing,
    so ``opt_logs`` / ``opt_logs_rev`` are always None — the INFO log lines
    below print None. The loop keeps only the last value on purpose.
    """
    if logger.isEnabledFor(logging.INFO):
        logger.info('entering _optimize method')
        logger.info(
            f'trainable variables {self.model_cost.trainable_variables}')
    adam_learning_rate = 0.01
    iterations = ci_niter(1000)
    # Separate optimizers so each model keeps its own Adam slot state.
    opt = tf.optimizers.Adam(adam_learning_rate)
    opt_rev = tf.optimizers.Adam(adam_learning_rate)

    @tf.function
    def cost_optimization_step():
        opt.minimize(self.model_cost.training_loss,
                     self.model_cost.trainable_variables)

    @tf.function
    def rev_optimization_step():
        opt_rev.minimize(self.model_rev.training_loss,
                         self.model_rev.trainable_variables)

    for i in range(iterations):
        opt_logs = cost_optimization_step()
    # for i in range(iterations):
        opt_logs_rev = rev_optimization_step()
    # Commented-out alternative: a single gpflow.optimizers.Scipy().minimize
    # per model with method='BFGS' and options=dict(maxiter=ci_niter(2000)).
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'opt_logs:\n{opt_logs}')
        logger.info(f'opt_logs_rev:\n{opt_logs_rev}')
    if logger.isEnabledFor(logging.WARNING):
        logger.warning(f'summary cost gp model')
        logger.warning(f'{tabulate_module_summary(self.model_cost)}')
        logger.warning(f'summary rev gp model')
        logger.warning(f'({tabulate_module_summary(self.model_rev)})')
def _optimize_model_with_scipy(self, train_data, **kwargs):
    """
    Optimize model using the Scipy optimizer in a single call.

    Recognised keyword args: ``method`` ("l-bfgs-b"), ``disp`` (True) and
    ``maxiter`` (ci_niter(200)); anything left in ``kwargs`` is forwarded
    to ``Scipy.minimize``.
    """
    # Pull optimizer settings out of kwargs first so the remainder can be
    # passed straight through.
    method = kwargs.pop('method', "l-bfgs-b")
    disp = kwargs.pop("disp", True)
    maxiter = kwargs.pop("maxiter", ci_niter(200))
    scipy_options = {"disp": disp, "maxiter": maxiter}

    scipy_optimizer = gpf.optimizers.Scipy()
    scipy_optimizer.minimize(
        self.model.training_loss_closure(train_data),
        variables=self.model.trainable_variables,
        method=method,
        options=scipy_options,
        **kwargs,
    )
def gp_model(x_train, y_train, x_test, num_classes):
    """This function instantiates the gp model and gets the predictions from
    the model.

    :param x_train: The training dataset.
    :param y_train: The training dataset labels.
    :param x_test: The test dataset.
    :param num_classes: The number of classes in the dataset.
    :return: predictions, the predictions from the gp model.
    :return time_taken: The time taken to train the model."""
    data = (x_train, y_train)
    # Sum of three stationary kernels; multiclass likelihood via RobustMax.
    kernel = gpflow.kernels.SquaredExponential() + gpflow.kernels.Matern12(
    ) + gpflow.kernels.Exponential()
    invlink = gpflow.likelihoods.RobustMax(num_classes)
    likelihood = gpflow.likelihoods.MultiClass(num_classes, invlink=invlink)
    # Every 5th training point is used as an (untrained) inducing point.
    z = x_train[::5].copy()
    model = gpflow.models.SVGP(kernel=kernel,
                               likelihood=likelihood,
                               inducing_variable=z,
                               num_latent_gps=num_classes,
                               whiten=True,
                               q_diag=True)
    set_trainable(model.inducing_variable, False)
    print('\nInitial parameters:')
    print_summary(model, fmt="notebook")
    start = time.time()
    opt = gpflow.optimizers.Scipy()
    opt.minimize(model.training_loss_closure(data),
                 model.trainable_variables,
                 options=dict(maxiter=ci_niter(1000)))
    print('\nParameters after optimization:')
    print_summary(model, fmt="notebook")
    end = time.time()
    time_taken = round(end - start, 2)
    print('Optimization took {:.2f} seconds'.format(time_taken))
    # predict_y returns (mean, variance); only the mean is kept.
    predictions = model.predict_y(x_test)[0]
    return predictions, time_taken
def repeatMinimization(model, Xtest, Ytest):
    """Minimize the model ``nRepeats`` times with L-BFGS-B, recording progress.

    Relies on the module-level global ``nRepeats``; the ``Callback`` object
    (project-defined) is invoked at every optimizer step and returned so the
    caller can inspect the recorded trace.
    """
    callback = Callback(model, Xtest, Ytest)
    opt = gpflow.optimizers.Scipy()
    # print("Optimising for {} repetitions".format(nRepeats))
    for repeatIndex in range(nRepeats):
        # print(repeatIndex)
        opt.minimize(
            model.training_loss,
            model.trainable_variables,
            method="L-BFGS-B",
            tol=1e-11,  # tight tolerance so runs are limited by maxiter
            options=dict(disp=False, maxiter=ci_niter(2000)),
            step_callback=callback,
            compile=True,
        )
    return callback
def snelsonDemo():
    """Run the Snelson demo: exact GPR vs sparse VFE/FITC approximations.

    Fix: the original used the GPflow 1.x API
    ``gpflow.train.ScipyOptimizer().minimize(model, maxiter=...)``, which no
    longer exists in GPflow 2; replaced with ``gpflow.optimizers.Scipy``,
    consistent with the rest of this file.
    """
    from matplotlib import pyplot as plt
    from IPython import embed
    xtrain, ytrain, xtest, ytest = getTrainingTestData()

    # run exact inference on training data.
    exact_model = getRegressionModel(xtrain, ytrain)
    opt = gpflow.optimizers.Scipy()
    opt.minimize(exact_model.training_loss,
                 exact_model.trainable_variables,
                 options=dict(maxiter=ci_niter(2000000)))

    figA, axes = plt.subplots(1, 1)
    inds = np.argsort(xtrain.flatten())
    axes.plot(xtrain[inds, :], ytrain[inds, :], 'ro')
    plotPredictions(axes, exact_model, 'g', None)

    figB, axes = plt.subplots(3, 2)
    # run sparse model on training data initialized from exact optimal solution.
    VFEmodel, VFEcb = trainSparseModel(xtrain, ytrain, exact_model, False,
                                       xtest, ytest)
    FITCmodel, FITCcb = trainSparseModel(xtrain, ytrain, exact_model, True,
                                         xtest, ytest)

    print("Exact model parameters \n")
    printModelParameters(exact_model)
    print("Sparse model parameters for VFE optimization \n")
    printModelParameters(VFEmodel)
    print("Sparse model parameters for FITC optimization \n")
    printModelParameters(FITCmodel)

    # Stretch the VFE trace onto the FITC iteration grid for comparison.
    VFEiters = FITCcb.n_iters
    VFElog_likelihoods = stretch(len(VFEiters), VFEcb.log_likelihoods)
    VFEhold_out_likelihood = stretch(len(VFEiters), VFEcb.hold_out_likelihood)

    plotComparisonFigure(xtrain, VFEmodel, exact_model, axes[0, 0],
                         axes[1, 0], axes[2, 0], VFEiters,
                         VFElog_likelihoods, VFEhold_out_likelihood, "VFE")
    plotComparisonFigure(xtrain, FITCmodel, exact_model, axes[0, 1],
                         axes[1, 1], axes[2, 1], FITCcb.n_iters,
                         FITCcb.log_likelihoods, FITCcb.hold_out_likelihood,
                         "FITC")

    axes[0, 0].set_title('VFE', loc='center', fontdict={'fontsize': 22})
    axes[0, 1].set_title('FITC', loc='center', fontdict={'fontsize': 22})

    embed()
def train_temporal(self, X, Y, iteration):
    """Initial Adam training of the temporal layers on the (negative) ELBO.

    :param X: training inputs shared by all temporal layers.
    :param Y: training targets; column i belongs to output i and may contain
        NaN for missing observations.
    :param iteration: number of Adam steps (capped by ``ci_niter``).
    """

    def temporal_elbo(X, Y, full_cov=False):
        # Sum per-output variational expectations, masking NaN targets.
        var_exp, kl_priors = [], []
        for i, layer, likelihood in zip(list(range(self.num_outputs)),
                                        self.temporal_layers,
                                        self.likelihoods):
            meani, vari = layer.conditional(X, full_cov=full_cov)
            available = ~tf.math.is_nan(Y[:, i])
            # Replace missing targets with 0 so the likelihood call is
            # well-defined...
            y = tf.where(available, Y[:, i], 0.)
            var_expi = likelihood.variational_expectations(
                meani, vari[:, None], y[:, None])
            # ...then zero out their contribution to the objective.
            var_expi = var_expi * tf.cast(tf.where(available, 1., 0.),
                                          dtype=var_expi.dtype)
            var_exp.append(tf.reduce_sum(var_expi))
            kl_priors.append(layer.KL())
        L, KL = tf.reduce_sum(var_exp), tf.reduce_sum(kl_priors)
        # Rescale the data term when training on minibatches so the ELBO is
        # an unbiased estimate over the full dataset.
        if self.minibatch_size is not None:
            num_data = tf.cast(self.num_data, KL.dtype)
            minibatch_size = tf.cast(self.minibatch_size, KL.dtype)
            scale = num_data / minibatch_size
        else:
            scale = tf.cast(1.0, KL.dtype)
        return L * scale - KL

    @tf.function(autograph=False)
    def optimization_step(optimizer, data):
        # Watch only self.trainable_variables (watch_accessed_variables=False
        # avoids taping everything touched inside the ELBO).
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(self.trainable_variables)
            objective = -temporal_elbo(*data)
        grads = tape.gradient(objective, self.trainable_variables)
        optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return objective

    def run_adam(data, iterations):
        adam = tf.optimizers.Adam(0.001)
        for step in range(iterations):
            neg_elbo = optimization_step(adam, data)
            elbo = -neg_elbo
            if step % 1000 == 0:
                print(elbo.numpy())

    print("Start initial temporal training.")
    maxiter = ci_niter(iteration)
    run_adam((X, Y), maxiter)
    print("Done initial temporal training.")
def repeatMinimization(model, xtest, ytest):
    """Minimize the negative log marginal likelihood ``nRepeats`` times.

    Relies on the module-level global ``nRepeats``; the project-defined
    ``Callback`` records progress at each optimizer step and is returned.
    """
    callback = Callback(model, xtest, ytest)

    @tf.function(autograph=False)
    def objective_closure():
        # Scipy minimizes, so negate the log marginal likelihood.
        return -model.log_marginal_likelihood()

    opt = gpflow.optimizers.Scipy()
    # print("Optimising for {} repetitions".format(nRepeats))
    for repeatIndex in range(nRepeats):
        # print(repeatIndex)
        opt.minimize(objective_closure,
                     model.trainable_variables,
                     method='L-BFGS-B',
                     tol=1e-11,
                     options=dict(disp=False, maxiter=ci_niter(2000)),
                     step_callback=callback)
    return callback
def analyze(f, title="Plot", rawplot=True, modelplot=True, summary=True):
    """Fit a two-group SwitchedLikelihood VGP to data from ``f``.

    :param f: callable returning (X, Y, groups); groups select the likelihood.
    :param title: plot title and saved-figure filename stem.
    :param rawplot: if True, save a scatter plot of the raw data.
    :param modelplot: if True, save a plot of the GP posterior.
    :param summary: if True, print the model summary.
    :return: the trained VGP model.

    Fix: removed the unused ``adam = tf.optimizers.Adam()`` local — only
    natural gradients are used for training.
    """
    # Obtain randomly generated data
    X, Y, groups = f()
    Y_data = np.hstack([Y, groups])
    # Model construction (notice that num_latent_gps is 1)
    likelihood = gpflow.likelihoods.SwitchedLikelihood(
        [gpflow.likelihoods.Gaussian(variance=1.0),
         gpflow.likelihoods.Gaussian(variance=1.0)]
    )
    natgrad = NaturalGradient(gamma=1.0)
    kernel = gpflow.kernels.Matern52(lengthscales=0.5)
    model = gpflow.models.VGP((X, Y_data), kernel=kernel,
                              likelihood=likelihood, num_latent_gps=1)
    # With a Gaussian likelihood, natural-gradient steps on (q_mu, q_sqrt)
    # optimize the variational distribution directly.
    for _ in range(ci_niter(1000)):
        natgrad.minimize(model.training_loss, [(model.q_mu, model.q_sqrt)])
    # Plot of the raw data.
    if rawplot:
        fig, ax = plt.subplots(1, 1, figsize=(12, 6))
        _ = ax.plot(X, Y_data, "kx")
        plt.xlabel("Minutes")
        plt.ylabel("Value")
        plt.title(title)
        plt.savefig(title + '.png')
    # Plot of GP model
    if modelplot:
        xx = np.linspace(0, 30, 200)[:, None]
        mu, var = model.predict_f(xx)
        plt.figure(figsize=(12, 6))
        plt.plot(xx, mu, "C0")
        plt.plot(xx, mu + 2 * np.sqrt(var), "C0", lw=0.5)
        plt.plot(xx, mu - 2 * np.sqrt(var), "C0", lw=0.5)
        plt.plot(X, Y, "C1x", mew=2)
        plt.xlabel("Minutes")
        plt.ylabel("Value")
        plt.title(title)
        plt.savefig(title + ' GP model.png')
    if summary:
        print_summary(model)
    return model
def train_loop(meta_tasks, num_iter=5):
    """
    Metalearning training loop

    :param meta_tasks: list of metatasks.
    :param num_iter: number of iterations of tasks set
    :returns: a mean function object
    """
    # Initialize mean function
    mean_function = build_mean_function()
    # Iterate for several passes over the tasks set
    for iteration in range(num_iter):
        ts = time.time()
        print("Currently in meta-iteration {}".format(iteration))
        # Iterate over tasks
        for i, task in enumerate(meta_tasks):
            data = task  # (X, Y)
            model = build_model(data, mean_function=mean_function)
            run_adam(model, ci_niter(100))
        # Fix: time.time() differences are SECONDS, but the original message
        # claimed milliseconds.
        print(">>>> iteration took {:.2f} s".format(time.time() - ts))
    return mean_function
def fit(self, X, Y, variance=0.001, optimize=True, maxiter=100):
    """Fit the GP after z-scoring X and Y; stores normalization stats on self.

    :param X: training inputs (normalized in place to zero mean, unit std).
    :param Y: training targets (normalized likewise).
    :param variance: initial noise variance for the GPR model.
    :param optimize: if True, run Scipy L-BFGS-B (SVGP branch only).
    :param maxiter: NOTE(review): currently unused — the GPR branch has its
        optimization commented out, and the SVGP branch uses a hard-coded
        ``ci_niter(2000)`` instead; confirm intent.
    """
    print('-- fitting gaussian process on ' + str(X.shape[0]) + ' data --')
    opt = gpflow.optimizers.Scipy()
    # Z-score both inputs and outputs; the statistics are kept on self so
    # predictions can be transformed back.
    mean_X = X.mean()
    std_X = X.std()
    mean_Y = Y.mean()
    std_Y = Y.std()
    X = (X - mean_X) / std_X
    Y = (Y - mean_Y) / std_Y
    self.mean_X = mean_X
    self.std_X = std_X
    self.mean_Y = mean_Y
    self.std_Y = std_Y
    if self.gp_model == 'GPR':
        # Exact GP regression; hyperparameter optimization is disabled here.
        model = gpflow.models.GPR(data=(np.array(X, dtype=float),
                                        np.array(Y, dtype=float)),
                                  kernel=self.k,
                                  mean_function=self.mean_function,
                                  noise_variance=variance)
        # model.likelihood.variance.assign(variance)
        # model.likelihood.variance.fixed = True
        # if optimize:
        #     opt_logs = opt.minimize(model.training_loss, model.trainable_variables, options=dict(maxiter=maxiter))
        self.model = model
    elif self.gp_model == 'SVGP':
        # Sparse variational GP: optimize the pre-built self.model in place.
        data = X, Y
        MAXITER = ci_niter(2000)
        # self.model.likelihood.variance.assign(variance)
        if optimize:
            opt.minimize(
                self.model.training_loss_closure(data),
                variables=self.model.trainable_variables,
                method="l-bfgs-b",
                options={"maxiter": MAXITER},
            )
    return
import gpflow
from gpflow.ci_utils import ci_niter
import tensorflow as tf
import numpy as np

# Experiment-wide constants; ci_niter caps repetition counts on CI runs.
nRepeats = ci_niter(50)
predict_limits = [-4.0, 4.0]
inducing_points_limits = [-1.0, 9]
hold_out_limits = [0.20, 0.60]
optimization_limits = [18.0, 25.0]


def readCsvFile(fileName):
    """Load a whitespace-delimited numeric file as an (N, 1) column array."""
    return np.loadtxt(fileName).reshape(-1, 1)


def getTrainingTestData():
    """Load the Snelson data and split indices: every 4th point is training."""
    overallX = readCsvFile("data/snelson_train_inputs.dat")
    overallY = readCsvFile("data/snelson_train_outputs.dat")
    trainIndices = []
    testIndices = []
    nPoints = overallX.shape[0]
    for index in range(nPoints):
        if index % 4 == 0:
            trainIndices.append(index)
        else:
            testIndices.append(index)
    # NOTE(review): the remainder of this function (building and returning
    # the four arrays) lies outside this excerpt.
pY[:, 0] - two_sigma, pY[:, 0] + two_sigma, alpha=0.15) lml = m.maximum_log_likelihood_objective().numpy() plt.title("%s (lml = %f)" % (name, lml)) return lml # %% [markdown] # ## Full model # %% gpr = gpflow.models.GPR((X, Y), gpflow.kernels.SquaredExponential()) gpflow.optimizers.Scipy().minimize(gpr.training_loss, gpr.trainable_variables, options=dict(maxiter=ci_niter(1000))) full_lml = plot_model(gpr) # %% [markdown] # ## Upper bounds for sparse variational models # As a first investigation, we compute the upper bound for models trained using the sparse variational GP approximation. # %% Ms = np.arange(4, ci_niter(20, test_n=6), 1) vfe_lml = [] vupper_lml = [] vfe_hyps = [] for M in Ms: Zinit = X[:M, :].copy() vfe = gpflow.models.SGPR((X, Y), gpflow.kernels.SquaredExponential(),
Xtrain, Ytrain, Xtest, Ytest = getTrainingTestData() def getKernel(): return gpflow.kernels.SquaredExponential() # Run exact inference on training data: exact_model = gpflow.models.GPR((Xtrain, Ytrain), kernel=getKernel()) opt = gpflow.optimizers.Scipy() opt.minimize( exact_model.training_loss, exact_model.trainable_variables, method="L-BFGS-B", options=dict(maxiter=ci_niter(20000)), tol=1e-11, ) print("Exact model parameters:") printModelParameters(exact_model) figA, ax = plt.subplots(1, 1) ax.plot(Xtrain, Ytrain, "ro") plotPredictions(ax, exact_model, color="g") # %% def initializeHyperparametersFromExactSolution(sparse_model): sparse_model.likelihood.variance.assign(exact_model.likelihood.variance) sparse_model.kernel.variance.assign(exact_model.kernel.variance)
def run_adam(model, iterations): """ Utility function running the Adam optimiser :param model: GPflow model :param interations: number of iterations """ # Create an Adam Optimiser action logf = [] train_it = iter(train_dataset.batch(minibatch_size)) adam = tf.optimizers.Adam() for step in range(iterations): elbo = -optimization_step(adam, model, next(train_it)) if step % 10 == 0: logf.append(elbo.numpy()) return logf maxiter = ci_niter(10000) logf = run_adam(m, maxiter) plt.figure() plt.plot(np.arange(maxiter)[::10], logf) plt.xlabel('iteration') plt.ylabel('ELBO') plot("Predictions after training") print_summary(m)
# Fix RNG seeds for reproducibility (both NumPy and TensorFlow).
np.random.seed(0)
tf.random.set_seed(0)

# Toy problem: N points in D dimensions, M inducing points.
N, D = 100, 2
batch_size = 50
# inducing points
M = 10

x = np.random.uniform(size=(N, D))
# Smooth nonlinear target built from both input dimensions.
y = np.sin(10 * x[:, :1]) + 5 * x[:, 1:]**2
data = (x, y)
inducing_variable = tf.random.uniform((M, D))
adam_learning_rate = 0.01
iterations = ci_niter(5)

# %% [markdown]
# ### VGP is a GPR

# %% [markdown]
# The following section demonstrates how natural gradients can turn VGP into GPR *in a single step, if the likelihood is Gaussian*.

# %% [markdown]
# Let's start by first creating a standard GPR model with Gaussian likelihood:

# %%
gpr = GPR(data, kernel=gpflow.kernels.Matern52())

# %% [markdown]
# The log marginal likelihood of the exact GP model is:
def run_gpr(nout, iterations, ds_single, ages, k1len, k2len, k3len, k4len,
            df_place):
    """Two-pass GPR fit of residual RSL with age-error (NIGP-style) inflation.

    First optimization ignores age errors; the fitted posterior's time
    gradient is then used to convert age uncertainty into extra noise
    variance, and the model is re-fit.
    """
    # Input space, rsl normalized to zero mean, unit variance
    X = np.stack((df_place.lon, df_place.lat, df_place.age), 1)
    RSL = normalize(df_place.rsl_realresid)
    # define kernels with bounds
    k1 = gpf.kernels.Matern32(active_dims=[0, 1])
    k1.lengthscales = bounded_parameter(1, 10, k1len)
    k1.variance = bounded_parameter(0.02, 100, 2)
    k2 = gpf.kernels.Matern32(active_dims=[2])
    k2.lengthscales = bounded_parameter(1, 100000, k2len)
    k2.variance = bounded_parameter(0.02, 100, 1)
    k3 = gpf.kernels.Matern32(active_dims=[0, 1])
    k3.lengthscales = bounded_parameter(10, 100, k3len)
    k3.variance = bounded_parameter(0.01, 100, 1)
    k4 = gpf.kernels.Matern32(active_dims=[2])
    k4.lengthscales = bounded_parameter(1, 100000, k4len)
    k4.variance = bounded_parameter(0.01, 100, 1)
    k5 = gpf.kernels.White(active_dims=[0, 1, 2])
    k5.variance = bounded_parameter(0.01, 100, 1)
    # (space * time) + (space * time) + white noise
    kernel = (k1 * k2) + (k3 * k4) + k5
    ################## BUILD AND TRAIN MODELS #######################
    noise_variance = (df_place.rsl_er.ravel())**2
    m = GPR_new((X, RSL), kernel=kernel, noise_variance=noise_variance)
    # Sandwich age of each lat/lon to enable gradient calculation
    lonlat = df_place[['lon', 'lat']]
    agetile = np.stack([df_place.age - 10, df_place.age, df_place.age + 10],
                       axis=-1).flatten()
    xyt_it = np.column_stack([lonlat.loc[lonlat.index.repeat(3)], agetile])
    # hardcode indices for speed (softcoded alternative commented out):
    # every middle point of each age triple.
    indices = np.arange(1, len(df_place)*3, 3)
    # indices = np.where(np.in1d(df_place.age, agetile))[0]
    iterations = ci_niter(iterations)
    learning_rate = 0.05
    logging_freq = 100
    opt = tf.optimizers.Adam(learning_rate)
    # first optimize without age errs to get slope
    tf.print('___First optimization___')
    likelihood = -10000
    for i in range(iterations):
        opt.minimize(m.training_loss, var_list=m.trainable_variables)
        likelihood_new = m.log_marginal_likelihood()
        if i % logging_freq == 0:
            tf.print(f"iteration {i + 1} likelihood {m.log_marginal_likelihood():.04f}")
        # Early stop on likelihood convergence.
        if abs(likelihood_new - likelihood) < 0.001:
            break
        likelihood = likelihood_new
    # Calculate posterior at training points + adjacent age points
    mean, _ = m.predict_f(xyt_it)
    # make diagonal matrix of age slope at training points
    Xgrad = np.diag(np.gradient(mean.numpy(), axis=0)[indices][:, 0])
    # multipy age errors by gradient
    Xnigp = np.diag(Xgrad @ np.diag((df_place.age_er/2)**2) @ Xgrad.T)
    # Refit with age-induced noise added to the measurement variance.
    m = GPR_new((X, RSL), kernel=kernel,
                noise_variance=noise_variance + Xnigp)
    # reoptimize
    tf.print('___Second optimization___')
    opt = tf.optimizers.Adam(learning_rate)
    # NOTE(review): `likelihood` is carried over from the first loop rather
    # than reset — the second loop may stop on its first iteration; confirm.
    for i in range(iterations):
        opt.minimize(m.training_loss, var_list=m.trainable_variables)
        likelihood_new = m.log_marginal_likelihood()
        if i % logging_freq == 0:
            tf.print(f"iteration {i + 1} likelihood {m.log_marginal_likelihood():.04f}")
        if abs(likelihood_new - likelihood) < 0.001:
            break
        likelihood = likelihood_new
    ################## INTERPOLATE MODELS #######################
    ################## -------------------- ######################
    # output space
    da_zp, da_varp = predict_post_f(nout, ages, ds_single, df_place, m)
    # interpolate all models onto GPR grid
    ds_giapriorinterp = interp_likegpr(ds_single, da_zp)
    # add total prior RSL back into GPR
    ds_priorplusgpr = da_zp + ds_giapriorinterp
    ds_varp = da_varp.to_dataset(name='rsl')
    ds_zp = da_zp.to_dataset(name='rsl')
    # Calculate data-model misfits & GPR vals at RSL data locations
    df_place['gpr_posterior'] = df_place.apply(
        lambda row: ds_select(ds_priorplusgpr, row), axis=1)
    df_place['gprpost_std'] = df_place.apply(
        lambda row: ds_select(ds_varp, row), axis=1)
    df_place['gpr_diff'] = df_place.apply(
        lambda row: row.rsl - row.gpr_posterior, axis=1)
    df_place['diffdiv'] = df_place.gpr_diff / df_place.rsl_er
    # Learned lengthscales of the four non-white component kernels.
    k1_l = m.kernel.kernels[0].kernels[0].lengthscales.numpy()
    k2_l = m.kernel.kernels[0].kernels[1].lengthscales.numpy()
    k3_l = m.kernel.kernels[1].kernels[0].lengthscales.numpy()
    k4_l = m.kernel.kernels[1].kernels[1].lengthscales.numpy()
    return ds_giapriorinterp, ds_zp, ds_priorplusgpr, ds_varp, \
        m.log_marginal_likelihood().numpy(), m, df_place


def interp_ds(ds):
    """Interpolate a dataset onto the global ``ds_single`` grid."""
    return ds.interp(age=ds_single.age, lat=ds_single.lat, lon=ds_single.lon)


def slice_dataset(ds):
    """Select RSL at the global ``site`` location, ages 11500-0."""
    return ds.rsl.sel(lat=site[1].lat.unique(),
                      lon=site[1].lon.unique(),
                      method='nearest').sel(age=slice(11500, 0))


def ds_ageselect(ds, row):
    """Return the dataset age coordinate nearest to the row's age."""
    return ds.rsl.interp(age=[row.age]).age.values[0]
# %% import numpy as np import tensorflow as tf from matplotlib import pyplot as plt import gpflow from gpflow.ci_utils import ci_niter from scipy.cluster.vq import kmeans2 from typing import Dict, Optional, Tuple import tensorflow as tf import tensorflow_datasets as tfds import gpflow from gpflow.utilities import to_default_float iterations = ci_niter(100) # %% [markdown] # ## Convolutional network inside a GPflow model # %% original_dataset, info = tfds.load(name="mnist", split=tfds.Split.TRAIN, with_info=True) total_num_data = info.splits["train"].num_examples image_shape = info.features["image"].shape image_size = tf.reduce_prod(image_shape) batch_size = 32 def map_fn(input_slice: Dict[str, tf.Tensor]):
@tf.function(autograph=False) def objective(): return -model.log_marginal_likelihood() optimizer.minimize(objective, variables=model.trainable_variables, options={'maxiter': 20}) print(f'log likelihood at optimum: {model.log_likelihood()}') # %% [markdown] # Sampling starts with a 'burn in' period. # %% burn = ci_niter(100) thin = ci_niter(10) # %% num_samples = 500 hmc_helper = gpflow.optimizers.SamplingHelper(model.log_marginal_likelihood, model.trainable_parameters) hmc = tfp.mcmc.HamiltonianMonteCarlo( target_log_prob_fn=hmc_helper.target_log_prob_fn, num_leapfrog_steps=10, step_size=0.01) adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation(hmc, num_adaptation_steps=10,
def main(path, representation):
    """
    Train a 4-output coregionalized Tanimoto-kernel VGP on photoswitch data
    and evaluate it on 5 held-out molecules for the e_iso_pi task.

    :param path: str specifying path to dataset.
    :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
    """
    task = 'e_iso_pi'  # task always e_iso_pi with human performance comparison
    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # 5 test molecules
    test_smiles = [
        'BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
        'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
        'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2',
        'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC',
        'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC'
    ]

    # and their indices in the loaded data
    test_smiles_indices = [116, 131, 168, 221, 229]

    X_train = np.delete(X, np.array(test_smiles_indices), axis=0)
    y_train = np.delete(y, np.array(test_smiles_indices))
    X_test = X[[116, 131, 168, 221, 229]]

    # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was
    # under a different solvent
    y_test = y[[116, 131, 168, 221, 229]]
    y_test[2] = 407.

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # # We standardise the outputs but leave the inputs unchanged
    # # _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    # Auxiliary tasks used as extra outputs of the multi-output GP.
    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)
    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = len(X_train[0, :])
    tanimoto_active_dims = [i for i in range(feature_dim)
                            ]  # active dims for Tanimoto base kernel.

    # We define the Gaussian Process Regression Model using the Tanimoto kernel
    m = None

    # NOTE(review): this closure appears unused — the Scipy call below uses
    # m.training_loss directly; confirm before removing.
    def objective_closure():
        return -m.log_marginal_likelihood()

    # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
    X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1),
                             np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                             np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                             np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))

    X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
    X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

    # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
    Y_augmented = np.vstack(
        (np.hstack((y_train, np.zeros_like(y_train))),
         np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
         np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
         np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

    y_test = np.hstack((y_test, np.zeros_like(y_test)))

    # Base kernel
    k = Tanimoto(active_dims=tanimoto_active_dims)
    # set_trainable(k.variance, False)

    # Coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=output_dim,
                                    rank=rank,
                                    active_dims=[feature_dim])

    # Create product kernel
    kern = k * coreg

    # This likelihood switches between Gaussian noise with different variances for each f_i:
    lik = gpflow.likelihoods.SwitchedLikelihood([
        gpflow.likelihoods.Gaussian(),
        gpflow.likelihoods.Gaussian(),
        gpflow.likelihoods.Gaussian(),
        gpflow.likelihoods.Gaussian()
    ])

    # now build the GP model as normal
    m = gpflow.models.VGP((X_augmented, Y_augmented),
                          mean_function=Constant(np.mean(y_train[:, 0])),
                          kernel=kern,
                          likelihood=lik)

    # fit the covariance function parameters
    maxiter = ci_niter(1000)
    gpflow.optimizers.Scipy().minimize(
        m.training_loss,
        m.trainable_variables,
        options=dict(maxiter=maxiter),
        method="L-BFGS-B",
    )
    print_summary(m)

    # mean and variance GP prediction
    y_pred, y_var = m.predict_f(X_test)

    # Output Standardised RMSE and RMSE on Train Set
    # NOTE(review): outputs are NOT standardised here (the transform_data
    # call is commented out), so both train RMSE values below are identical.
    y_pred_train, _ = m.predict_f(X_train)
    train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
    print("Train RMSE: {:.3f}".format(train_rmse))

    r2 = r2_score(y_test[:, 0], y_pred)
    rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred))
    mae = mean_absolute_error(y_test[:, 0], y_pred)
    per_molecule = np.diag(abs(y_pred - y_test[:, 0]))

    print("\n Averaged test statistics are")
    print("\nR^2: {:.3f}".format(r2))
    print("RMSE: {:.3f}".format(rmse))
    print("MAE: {:.3f}".format(mae))
    print("\nAbsolute error per molecule is {} ".format(per_molecule))
3, invlink=invlink) # Multiclass likelihood Z = X[::5].copy() # inducing inputs m = gpflow.models.SVGP( kernel=kernel, likelihood=likelihood, inducing_variable=Z, num_latent_gps=C, whiten=True, q_diag=True, ) # Only train the variational parameters set_trainable(m.kernel.kernels[1].variance, False) set_trainable(m.inducing_variable, False) print_summary(m, fmt="notebook") # %% [markdown] # #### Running inference # %% opt = gpflow.optimizers.Scipy() opt_logs = opt.minimize(m.training_loss_closure(data), m.trainable_variables, options=dict(maxiter=ci_niter(1000))) print_summary(m, fmt="notebook") # %% plot_posterior_predictions(m, X, Y)