def __init__(self, variational_model: bool = True, do_monitor: bool = False):
    """Select the optimizers and, optionally, prepare the monitoring directory.

    Args:
        variational_model: When true, ``self.opt`` is an Adam optimizer and
            ``self.opt_var`` a ``NaturalGradient(gamma=0.1)`` optimizer.
            Otherwise ``self.opt`` is a ``Scipy`` optimizer and no
            ``opt_var`` attribute is created.
        do_monitor: When true, ``self.monitor_path`` is set to
            ``"train_log/fit"`` and any existing ``train_log`` directory in
            the current working directory is removed.
    """
    # Local import so this method does not depend on the file's import block.
    import shutil

    self.var = variational_model
    self.do_monitor = do_monitor
    if do_monitor:
        self.monitor_path = "train_log/fit"
        # Remove logs of a previous run without spawning a shell; a missing
        # directory is silently ignored, like ``rm -rf``.
        shutil.rmtree("train_log", ignore_errors=True)
    if variational_model:
        self.opt = tf.optimizers.Adam()
        self.opt_var = NaturalGradient(gamma=0.1)
    else:
        self.opt = Scipy()
# * `max_cg_iters`. The maximum number of CG iterations. # * `restart_cg_step`. The frequency with wich the CG resets the internal state to the initial position using current solution vector `v`. # * `v_grad_optimization`. CGLB introduces auxiliary parameter `v`, and by default optimal `v` is found with the CG. However you can include `v` into the list of trainable model parameters. # %% cglb = CGLB( data, kernel=SquaredExponential(), noise_variance=noise, inducing_variable=iv, cg_tolerance=1.0, max_cg_iters=50, restart_cg_iters=50, ) opt = Scipy() # %% [markdown] # We train the model as usual. Variables do not include the $ v $ auxiliary vector. # %% variables = cglb.trainable_variables _ = opt.minimize(cglb.training_loss_closure(compile=False), variables, compile=False, options=dict(maxiter=100)) # %% [markdown] # Below we compare prediction results for different CG tolerances. The `cg_tolerance=None` means that no CG is run to tune the $ v $ vector, and `cg_tolerance=0.01` is much lower value than the one used at the model optimization. # %% [markdown]
ax.legend() plt.show() # %% vgp = VGPWrapper(kernel=kernel_cls(), index_points=X_grid, observation_index_points=X_train, observations=Y_train, vgp_cls=VGPOpperArchambeau, jitter=jitter) # %% optimizer = Scipy() optimizer.minimize(vgp.variational_loss, variables=vgp._vgp.trainable_variables) # %% qf_loc = vgp.mean() qf_scale = vgp.stddev() # %% # m = tf.matmul(vgp.kernel.K(X_train), vgp.q_alpha) # %% fig, ax = plt.subplots() ax.plot(X_grid, r.logit(X_grid),
# shortcuts tfd = tfp.distributions # sensible defaults SUMMARY_DIR = "logs/" SEED = 8888 dataset_seed = 8888 num_features = 1 num_train = 100 num_test = 100 kernel_cls = Matern52 optimizer = Scipy() jitter = 1e-6 num_seeds = 10 # properties of the distribution props = { "mean": tfd.Distribution.mean, "mode": tfd.Distribution.mode, "median": lambda d: d.distribution.quantile(0.5), # "sample": tfd.Distribution.sample, # single sample } def poly(x):
class Trainer:
    """Fit a model with Adam plus natural gradients, or with the Scipy optimizer.

    With ``variational_model=True``, :meth:`run` loops over minibatches and
    each step applies an Adam update to ``model.trainable_variables`` followed
    by a ``NaturalGradient`` update of ``(model.q_mu, model.q_sqrt)``.
    Otherwise a single ``Scipy().minimize`` call is issued.
    """

    def __init__(self, variational_model: bool = True, do_monitor: bool = False):
        """Select the optimizers and, optionally, prepare the monitoring directory.

        Args:
            variational_model: When true, ``self.opt`` is an Adam optimizer and
                ``self.opt_var`` a ``NaturalGradient(gamma=0.1)`` optimizer.
                Otherwise ``self.opt`` is a ``Scipy`` optimizer.
            do_monitor: When true, ``self.monitor_path`` is set to
                ``"train_log/fit"`` and any existing ``train_log`` directory
                in the current working directory is removed.
        """
        # Local import so the class does not depend on the file's import block.
        import shutil

        self.var = variational_model
        self.do_monitor = do_monitor
        if do_monitor:
            self.monitor_path = "train_log/fit"
            # Remove logs of a previous run without spawning a shell; a
            # missing directory is silently ignored, like ``rm -rf``.
            shutil.rmtree("train_log", ignore_errors=True)
        if variational_model:
            self.opt = tf.optimizers.Adam()
            self.opt_var = NaturalGradient(gamma=0.1)
        else:
            self.opt = Scipy()

    def run(self, model, dataset, epoch: int = 10):
        """Optimize ``model`` on ``dataset``.

        Args:
            model: Object exposing ``inducing_variable``, ``q_mu``, ``q_sqrt``,
                ``trainable_variables`` and ``training_loss_closure``.
            dataset: Finite batched dataset supporting ``len()``, ``repeat()``
                and ``unbatch()`` (presumably a ``tf.data.Dataset``).
            epoch: Number of passes over ``dataset`` in the variational
                branch; unused in the Scipy branch.
        """
        num_iter = len(dataset) * epoch
        # Mark the inducing points and the variational parameters as
        # non-trainable so they are left out of ``model.trainable_variables``.
        set_trainable(model.inducing_variable, False)
        set_trainable(model.q_mu, False)
        set_trainable(model.q_sqrt, False)
        if self.do_monitor:
            self.create_monitor(model)
        if self.var:
            # ``num_iter`` steps are taken below, so the iterator must yield
            # ``epoch`` passes; a plain ``iter(dataset)`` runs dry after one.
            train_iter = iter(dataset.repeat(epoch))
            training_loss = model.training_loss_closure(train_iter, compile=True)
            for step in tf.range(num_iter):
                self.optimization_step(model, training_loss)
                # ``self.monitor`` only exists after ``create_monitor`` ran.
                if self.do_monitor:
                    self.monitor(step)
        else:
            # NOTE(review): the unbatched dataset object itself is passed as
            # the data argument of the loss closure - confirm this is the
            # form the model expects.
            data = dataset.unbatch()
            self.opt.minimize(
                model.training_loss_closure(data),
                variables=model.trainable_variables,
                options={"disp": True, "maxiter": 1000},
            )

    @tf.function
    def optimization_step(self, model, loss):
        """Run one Adam step followed by one natural-gradient step on ``loss``."""
        self.opt.minimize(loss, var_list=model.trainable_variables)
        # NaturalGradient expects a list of (q_mu, q_sqrt) tuples, not a flat
        # list of parameters.
        self.opt_var.minimize(loss, var_list=[(model.q_mu, model.q_sqrt)])

    def create_monitor(self, model):
        """Create ``self.monitor``, a TensorBoard task group run with period 5."""
        model_task = ModelToTensorBoard(self.monitor_path, model)
        self.monitor = Monitor(MonitorTaskGroup([model_task]), period=5)


# Commented-out reference snippets retained from the original file.
#
# data_minibatch = (
#     tf.data.Dataset.from_tensor_slices(data)
#     .prefetch(autotune)
#     .repeat()
#     .shuffle(N)
#     .batch(batch_size)
# )
#
# nat grad loop
# gamma_start = 1e-2  # deliberately chosen to be too large for this example
# gamma_max = 1e-1  # same max value as before
# gamma_step = 1e-2  # this is much more aggressive increase
# gamma = tf.Variable(gamma_start, dtype=tf.float64)
# gamma_incremented = tf.where(tf.less(gamma, gamma_max), gamma + gamma_step, gamma_max)
# op_ng = NatGradOptimizer(gamma).make_optimize_tensor(model, var_list=[[model.q_mu, model.q_sqrt]])
# op_adam = AdamOptimizer(0.001).make_optimize_tensor(model)
# op_increment_gamma = tf.assign(gamma, gamma_incremented)
# gamma_fallback = 1e-1  # we'll reduce by this factor if there's a cholesky failure
# op_fallback_gamma = tf.assign(gamma, gamma * gamma_fallback)
# sess.run(tf.variables_initializer([gamma]))
# for it in range(1000):
#     try:
#         sess.run(op_ng)
#         sess.run(op_increment_gamma)
#     except tf.errors.InvalidArgumentError:
#         g = sess.run(gamma)
#         print('gamma = {} on iteration {} is too big! Falling back to {}'.format(it, g, g * gamma_fallback))
#         sess.run(op_fallback_gamma)
#     sess.run(op_adam)
#     if it % 100 == 0:
#         print('{} gamma={:.4f} ELBO={:.4f}'.format(it, *sess.run([gamma, model.likelihood_tensor])))
from gpflow.utilities import print_summary

print_summary(model)

# %% [markdown]
# The objective function for MDN instances is the `log_marginal_likelihood`, which we use for optimization of the parameters. GPflow ensures that only the variables stored in `Parameter` objects are optimized. For the MDN, the only parameters are the weights and the biases of the neural net.
#
# We use the `Scipy` optimizer, which is a wrapper around SciPy's L-BFGS optimization algorithm. Note that GPflow supports other TensorFlow optimizers such as `Adam`, `Adagrad`, and `Adadelta` as well.

# %%
from gpflow.optimizers import Scipy
from gpflow.ci_utils import ci_niter

Scipy().minimize(
    tf.function(lambda: -model.log_marginal_likelihood(data)),
    variables=model.trainable_parameters,
    options=dict(maxiter=ci_niter(1500)),
)

print("Final Likelihood", model.log_marginal_likelihood(data).numpy())

# %% [markdown]
# To evaluate the validity of our model, we draw the posterior density. We also plot $\mu(x)$ of the optimized neural net. Remember that for every $x$ the neural net outputs $M$ means $\mu_m(x)$. These determine the location of the Gaussians. We plot all $M$ means and use their corresponding mixture weight $\pi_m(x)$ to determine their size. Larger dots will have more impact in the Gaussian ensemble.

# %%
try:
    from mdn_plotting import plot
except ImportError:
    # Only fall back when the module cannot be found on the path; any other
    # error raised while importing is a real failure and should surface.
    # VS CODE's root directory is GPflow's top-level directory
    from doc.source.notebooks.tailor.mdn_plotting import plot

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
from gpflow.utilities import print_summary

print_summary(model)

# %% [markdown]
# The objective function for MDN instances is the `maximum_log_likelihood_objective`, which we use for optimization of the parameters. GPflow ensures that only the variables stored in `Parameter` objects are optimized. For the MDN, the only parameters are the weights and the biases of the neural net.
#
# We use the `Scipy` optimizer, which is a wrapper around SciPy's L-BFGS optimization algorithm. Note that GPflow supports other TensorFlow optimizers such as `Adam`, `Adagrad`, and `Adadelta` as well.

# %%
from gpflow.optimizers import Scipy
from gpflow.ci_utils import ci_niter

Scipy().minimize(
    model.training_loss_closure(data, compile=True),
    model.trainable_variables,
    options=dict(maxiter=ci_niter(1500)),
)

print("Final Likelihood", model.maximum_log_likelihood_objective(data).numpy())

# %% [markdown]
# To evaluate the validity of our model, we draw the posterior density. We also plot $\mu(x)$ of the optimized neural net. Remember that for every $x$ the neural net outputs $M$ means $\mu_m(x)$. These determine the location of the Gaussians. We plot all $M$ means and use their corresponding mixture weight $\pi_m(x)$ to determine their size. Larger dots will have more impact in the Gaussian ensemble.

# %%
try:
    from mdn_plotting import plot
except ImportError:
    # Only fall back when the module cannot be found on the path; any other
    # error raised while importing is a real failure and should surface.
    # VS CODE's root directory is GPflow's top-level directory
    from doc.source.notebooks.tailor.mdn_plotting import plot