def test_periodicity_group(capsys):
    """Test that groups are called at different periods."""
    task_a = ExecuteCallback(lambda: print("a", end=" "))
    task_b = ExecuteCallback(lambda: print("b", end=" "))
    task_X = ExecuteCallback(lambda: print("X", end=" "))
    group_often = MonitorTaskGroup([task_a, task_b], period=1)
    group_seldom = MonitorTaskGroup([task_X], period=3)
    monitor = Monitor(group_often, group_seldom)

    for i in range(7):
        monitor(i)

    out, _ = capsys.readouterr()
    expected = "a b X a b a b a b X a b a b a b X "
    assert out == expected

    # AutoGraph mode
    compiled_monitor = tf.function(monitor)
    for i in tf.range(7):
        compiled_monitor(i)

    # When using TF's range and compiling the monitor, we only expect the Python
    # prints once, during tracing.
    out, _ = capsys.readouterr()
    assert out == "a b X "
def test_MonitorTaskGroup_and_Monitor(task_or_tasks):
    group = MonitorTaskGroup(task_or_tasks, period=2)

    # Check that `tasks` is actually a list (custom setter).
    assert isinstance(group.tasks, list)

    # Smoke test the __call__
    group(0)
    compiled_group = tf.function(group)
    compiled_group(0)

    # Smoke test the Monitor wrapper
    monitor = Monitor(group)
    monitor(0)
    compiled_monitor = tf.function(monitor)
    compiled_monitor(0)
def monitored_training_loop(
    model,
    training_loss,
    epochs: int = 1,
    num_batches_per_epoch: int = 1,
    fast_tasks: gpf.monitor.MonitorTaskGroup = None,
    slow_tasks: gpf.monitor.MonitorTaskGroup = None,
    logging_epoch_freq: int = 100,
    manager: tf.train.CheckpointManager = None,
):
    """Monitors (with images) an Adam optimizer on `model` with `training_loss`.

    Monitoring is not inside tf.function, so this method is slower than
    `monitored_training_tf_loop`.

    :param model: The model to be trained.
    :param training_loss: A function that returns the training objective.
    :param epochs: The number of full data passes (epochs).
    :param num_batches_per_epoch: The number of batches per epoch.
    :param fast_tasks: GPflow monitor fast tasks, e.g.
        MonitorTaskGroup([ScalarToTensorBoard(log_dir, training_loss, "elbo")]).
    :param slow_tasks: GPflow monitor slow tasks, e.g. plotting images.
    :param logging_epoch_freq: The epoch frequency at which the training loss is printed.
    :param manager: Optional checkpoint manager; if given, a checkpoint is saved every epoch.
    """
    optimizer = tf.optimizers.Adam()
    # checkpoint_path = "training_2/cp-{epoch:04d}.ckpt"
    # checkpoint_dir = os.path.dirname(checkpoint_path)

    @tf.function
    def tf_optimization_step():
        optimizer.minimize(training_loss, model.trainable_variables)

    # Only pass the task groups that were actually provided (they default to None).
    task_groups = [tasks for tasks in (fast_tasks, slow_tasks) if tasks is not None]
    monitor = Monitor(*task_groups)

    t = time.time()
    for epoch in range(epochs):
        for _ in range(num_batches_per_epoch):
            tf_optimization_step()
            # duration = time.time() - t
            # print("Iteration duration: ", duration)
            # t = time.time()

        monitor(epoch)
        epoch_id = epoch + 1
        if epoch_id % logging_epoch_freq == 0:
            tf.print(f"Epoch {epoch_id}: ELBO (train) {training_loss()}")

        if manager is not None:
            manager.save()
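# Hypothetical usage sketch for `monitored_training_loop` above: the toy data, the GPR
# model, and the "logs/monitored_run" directory are illustrative assumptions, not part
# of the original code.
import numpy as np
import tensorflow as tf
import gpflow as gpf
from gpflow.monitor import MonitorTaskGroup, ScalarToTensorBoard

X = np.random.rand(20, 1)
Y = np.sin(10 * X) + 0.1 * np.random.randn(20, 1)
model = gpf.models.GPR((X, Y), kernel=gpf.kernels.SquaredExponential())

# Log the training loss every epoch; more tasks could be added to the same group.
fast_tasks = MonitorTaskGroup(
    [ScalarToTensorBoard("logs/monitored_run", model.training_loss, "loss")], period=1
)

monitored_training_loop(
    model,
    model.training_loss,
    epochs=100,
    num_batches_per_epoch=1,
    fast_tasks=fast_tasks,
    logging_epoch_freq=10,
)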
def test_ExecuteCallback_arguments(capsys):
    def cb1(x=None, **_):
        assert x is not None
        print(x)

    def cb2(**_):
        print(2)

    def cb3(y=None, **_):
        assert y is not None
        print(y)

    group1 = MonitorTaskGroup([ExecuteCallback(cb1), ExecuteCallback(cb2)])
    group2 = MonitorTaskGroup(ExecuteCallback(cb3))
    monitor = Monitor(group1, group2)
    monitor(0, x=1, y=3)

    out, _ = capsys.readouterr()
    assert out == "1\n2\n3\n"
def create_monitor(self, model):
    model_task = ModelToTensorBoard(self.monitor_path, model)
    # The period belongs to the task group, not to the Monitor wrapper.
    self.monitor = Monitor(MonitorTaskGroup([model_task], period=5))

    # data_minibatch = (
    #     tf.data.Dataset.from_tensor_slices(data)
    #     .prefetch(autotune)
    #     .repeat()
    #     .shuffle(N)
    #     .batch(batch_size)
    # )

    # Legacy TF1 (session-based) natural-gradient training loop, kept commented out for reference:
    # gamma_start = 1e-2  # deliberately chosen to be too large for this example
    # gamma_max = 1e-1  # same max value as before
    # gamma_step = 1e-2  # this is a much more aggressive increase
    # gamma = tf.Variable(gamma_start, dtype=tf.float64)
    # gamma_incremented = tf.where(tf.less(gamma, gamma_max), gamma + gamma_step, gamma_max)
    # op_ng = NatGradOptimizer(gamma).make_optimize_tensor(model, var_list=[[model.q_mu, model.q_sqrt]])
    # op_adam = AdamOptimizer(0.001).make_optimize_tensor(model)
    # op_increment_gamma = tf.assign(gamma, gamma_incremented)
    # gamma_fallback = 1e-1  # we'll reduce by this factor if there's a cholesky failure
    # op_fallback_gamma = tf.assign(gamma, gamma * gamma_fallback)
    # sess.run(tf.variables_initializer([gamma]))
    # for it in range(1000):
    #     try:
    #         sess.run(op_ng)
    #         sess.run(op_increment_gamma)
    #     except tf.errors.InvalidArgumentError:
    #         g = sess.run(gamma)
    #         print('gamma = {} on iteration {} is too big! Falling back to {}'.format(it, g, g * gamma_fallback))
    #         sess.run(op_fallback_gamma)
    #     sess.run(op_adam)
    #     if it % 100 == 0:
    #         print('{} gamma={:.4f} ELBO={:.4f}'.format(it, *sess.run([gamma, model.likelihood_tensor])))
def monitor(model, tmp_path):
    tmp_path = str(tmp_path)

    def lml_callback():
        return model.log_marginal_likelihood()

    def print_callback():
        print("foo")

    return Monitor(
        MonitorTaskGroup(
            [
                ModelToTensorBoard(tmp_path, model),
                ScalarToTensorBoard(tmp_path, lml_callback, "lml"),
            ],
            period=2,
        ),
        MonitorTaskGroup(ExecuteCallback(print_callback), period=1),
    )
def monitored_training_tf_loop(
    model,
    training_loss,
    epochs: int = 1,
    num_batches_per_epoch: int = 1,
    fast_tasks: gpf.monitor.MonitorTaskGroup = None,
    logging_epoch_freq: int = 100,
    manager: tf.train.CheckpointManager = None,
):
    """Monitors an Adam optimizer on `model` with `training_loss`.

    Both training and monitoring are inside tf.function (no image monitoring).
    This method only monitors the fast tasks, as matplotlib code cannot be built
    into a TF graph.

    :param model: The model to be trained.
    :param training_loss: A function that returns the training objective.
    :param epochs: The number of full data passes (epochs).
    :param num_batches_per_epoch: The number of batches per epoch.
    :param fast_tasks: GPflow monitor fast tasks, e.g.
        MonitorTaskGroup([ScalarToTensorBoard(log_dir, training_loss, "elbo")]).
    :param logging_epoch_freq: The epoch frequency at which the training loss is printed.
    :param manager: Optional checkpoint manager; if given, a checkpoint is saved every epoch.
    """
    optimizer = tf.optimizers.Adam()
    monitor = Monitor(fast_tasks)

    @tf.function
    def monitored_tf_opt_step(epoch):
        optimizer.minimize(training_loss, model.trainable_variables)
        monitor(epoch)

    # t = time.time()
    epochs = tf.constant(epochs)  # needs to be a tf.constant
    for epoch in tf.range(epochs):
        for _ in range(num_batches_per_epoch):
            monitored_tf_opt_step(epoch)

        epoch_id = epoch + 1
        if epoch_id % logging_epoch_freq == 0:
            tf.print(f"Epoch {epoch_id}: ELBO (train) {training_loss()}")

        if manager is not None:
            manager.save()
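# Hypothetical usage sketch for `monitored_training_tf_loop` above, mirroring the
# sketch after `monitored_training_loop`: the toy data, the SVGP model, and the
# "logs/compiled_run" directory are illustrative assumptions.
import numpy as np
import tensorflow as tf
import gpflow as gpf
from gpflow.monitor import MonitorTaskGroup, ScalarToTensorBoard

X = np.random.rand(100, 1)
Y = np.sin(10 * X) + 0.1 * np.random.randn(100, 1)
model = gpf.models.SVGP(
    kernel=gpf.kernels.SquaredExponential(),
    likelihood=gpf.likelihoods.Gaussian(),
    inducing_variable=X[::10].copy(),
)
# Closure over the full dataset; only graph-compatible (fast) tasks can be monitored here.
training_loss = model.training_loss_closure((X, Y))

fast_tasks = MonitorTaskGroup(
    [ScalarToTensorBoard("logs/compiled_run", training_loss, "elbo")], period=1
)

monitored_training_tf_loop(
    model, training_loss, epochs=100, fast_tasks=fast_tasks, logging_epoch_freq=10
)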
def test_scipy_monitor_called(model):
    task = DummyTask()
    monitor = Monitor(MonitorTaskGroup(task, period=1))
    opt = gpflow.optimizers.Scipy()
    opt.minimize(model.training_loss, model.trainable_variables, step_callback=monitor)
    assert task.current_step > 1
output_logdir = enumerated_logdir()

model_task = ModelToTensorBoard(output_logdir, model)
elbo_task = ScalarToTensorBoard(output_logdir, elbo_cb, "elbo")
print_task = ExecuteCallback(callback=print_cb)

# We group these tasks and specify a period of `100` steps for them
fast_tasks = MonitorTaskGroup([model_task, elbo_task, print_task], period=100)

# We also want to see the model's fit during the optimisation
image_task = ImageToTensorBoard(output_logdir, plot_model, "samples_image")

# We typically don't want to plot too frequently during optimisation,
# which is why we specify a larger period for this task.
slow_tasks = MonitorTaskGroup(image_task, period=500)
monitor = Monitor(fast_tasks, slow_tasks)


def monitored_training_loop(epochs: int):
    tf_optimization_step = tf.function(optimization_step)

    batches = iter(train_dataset)
    for epoch in range(epochs):
        for _ in range(ci_niter(num_batches_per_epoch)):
            batch = next(batches)
            tf_optimization_step(model, batch)

        epoch_id = epoch + 1
        monitor(epoch, epoch_id=epoch_id, data=data)
# %% [markdown]
# We now group the tasks into a set of fast and slow tasks and pass them to the monitor.
# This allows us to execute the groups at different frequencies.

# %%
# Plotting tasks can be quite slow. We want to run them less frequently.
# We group them in a `MonitorTaskGroup` and set the period to 5.
slow_tasks = MonitorTaskGroup(image_task, period=5)

# The other tasks are fast. We run them at each iteration of the optimisation.
fast_tasks = MonitorTaskGroup([model_task, lml_task], period=1)

# Both groups are passed to the monitor.
# `slow_tasks` will be run five times less frequently than `fast_tasks`.
monitor = Monitor(fast_tasks, slow_tasks)

# %%
training_loss = model.training_loss_closure(
    compile=True
)  # compile=True (default): compiles using tf.function

opt = tf.optimizers.Adam()

for step in range(optimisation_steps):
    opt.minimize(training_loss, model.trainable_variables)
    monitor(step)  # <-- run the monitoring

# %% [markdown]
# TensorBoard is accessible through the browser, after launching the server by running
# `tensorboard --logdir ${logdir}`.
# See the [TensorFlow documentation on TensorBoard](https://www.tensorflow.org/tensorboard/get_started)
# for more information.

# %% [markdown]
def configure_tensorboard_monitor(self, scalar_period, imgs_period, nb_images=1,
                                  do_phase_space=None):
    if do_phase_space is None:
        do_phase_space = self.model.phase_space_dim == 2

    if self.experiment.tensorboard_dir is None or scalar_period < 1:
        return None

    def create_bloss_tasks(directory):
        bloss_names = ['-ly', '-lx', 'penalty_term', 'alpha_term', '-H', '+KL']
        bloss_tasks = []

        def create_lambda(i):
            return lambda train_bloss=None, **kwargs: train_bloss[i]

        for i, name in enumerate(bloss_names):
            bloss_tasks.append(
                ScalarToTensorBoard(directory, create_lambda(i), 'bloss/' + name))
        return bloss_tasks

    train_dir = os.path.join(self.experiment.tensorboard_dir, 'train')
    test_dir = os.path.join(self.experiment.tensorboard_dir, 'test')

    # diff_task = ModelToTensorBoard(train_dir, self.model.sde_model.diffusion)
    # drift_task = ModelToTensorBoard(train_dir, self.model.sde_model.drift_svgp)
    diff_task = []
    drift_task = []

    train_loss = ScalarToTensorBoard(
        train_dir, lambda train_loss=None, **kwargs: train_loss, 'loss')
    test_loss = ScalarToTensorBoard(
        test_dir,
        lambda epoch=None, kl_scheduler=None, **kwargs: self.test_loss(epoch, kl_scheduler),
        'loss')

    train_bloss_list = create_bloss_tasks(train_dir)
    # train_bloss_list = []  # TODO: remove or add

    generator = (self.experiment.test_dataset
                 if self.experiment.has_test else self.experiment.train_dataset)
    y_inputs = []
    y_targets = []
    for y in generator.take(1):
        for y_input, y_target in self.tbptt_chunks_generator(y):
            break
        # y_inputs.append(y_input)
        # y_targets.append(y_target)
    # y_input = tf.concat(y_inputs, axis=1)
    # y_target = tf.concat(y_targets, axis=1)

    def calc_drift_error(**kwargs):
        samples, entropies, encoded_dist, q0_stats, states = draw_fast_samples(
            self.model, None, y_input)
        fx, var_fx = self.model.sde_model.drift_svgp.predict_f(
            tf.reshape(samples, (-1, samples.shape[-1])))
        fx = tf.reshape(fx, samples.shape)
        return tf.reduce_mean(
            tf.square(samples[..., 1:, :] - samples[..., :-1, :] - fx[..., :-1, :]))

    drift_error = ScalarToTensorBoard(train_dir, calc_drift_error, 'drift_error')
    beta_alpha = ScalarToTensorBoard(
        train_dir,
        lambda **kwargs: tf.reduce_mean(
            self.model.sde_model.diffusion.expected_diffusion()),
        'beta_div_alpha')

    if imgs_period > 0:
        print('Creating image callbacks')
        images_dir = os.path.join(self.experiment.tensorboard_dir, 'images')

        nrows = 2 if self.model.phase_space_dim > 3 else 1
        encoded_samples = ImageToTensorBoard(
            images_dir,
            lambda f, a: plot_encoded_samples(f, a, self.model, y_input),
            'encoded_samples',
            fig_kw={'figsize': (12, 12)},
            subplots_kw={'nrows': nrows, 'ncols': np.ceil(5 / 2).astype(int)})

        def plot_synth(fig, axes):
            plot_synthetic_samples(fig, axes, self.model, y_input, y_target,
                                   simulation_steps=y.shape[-2])

        nrows = 2 if do_phase_space else 1
        synthetic_samples = ImageToTensorBoard(
            images_dir, plot_synth, 'synthetic_samples',
            fig_kw={'figsize': (12, 12)},
            subplots_kw={'nrows': nrows, 'ncols': nb_images})

        def plot_dec(fig, axes):
            plot_decoder(fig, axes, self.model, y_input, y_target)

        nrows = 2 if self.experiment.batch_size > 1 else 1
        dec_images = ImageToTensorBoard(
            images_dir, plot_dec, 'decoder',
            fig_kw={'figsize': (12, 12)},
            subplots_kw={'nrows': nrows,
                         'ncols': min(self.experiment.batch_size // nrows, 2)})

        drift_images = ImageToTensorBoard(
            images_dir,
            lambda fig, axes: plot_drift_predictions(fig, axes, self.model, y_input),
            'drift',
            fig_kw={'figsize': (12, 12)},
            subplots_kw={'nrows': nrows, 'ncols': self.model.sde_model.dimension})

        monitor = Monitor(
            MonitorTaskGroup([train_loss, test_loss] + train_bloss_list,
                             period=scalar_period),
            # MonitorTaskGroup([drift_error, beta_alpha], period=scalar_period),
            MonitorTaskGroup([synthetic_samples, dec_images, encoded_samples, drift_images],
                             period=imgs_period))
        print('done')
    else:
        monitor = Monitor(
            MonitorTaskGroup([train_loss, test_loss] + train_bloss_list,
                             period=scalar_period),
            MonitorTaskGroup([drift_error, beta_alpha], period=scalar_period),
        )

    return monitor