import json

import gym
import numpy as np
import tensorflow as tf  # TensorFlow 1.x API (tf.train, tf.summary, sessions)

# Project-local helpers such as check_between, check_greater_equal, check_less_equal,
# get_tf_session, get_batch, shuffle_data, encode_transition_data, and
# extract_play_data are defined elsewhere in this repository.


def get_learning_rate(initial_learning_rate, decay_steps, decay_rate):
    """Build an (optionally exponentially decayed) learning-rate tensor."""
    global_step = tf.Variable(0, trainable=False)
    if decay_rate == 1:
        learning_rate = tf.convert_to_tensor(initial_learning_rate)
    else:
        check_between("decay_rate", decay_rate, 0, 1)
        check_greater_equal("decay_steps", decay_steps, 1)
        learning_rate = tf.train.exponential_decay(
            initial_learning_rate,
            global_step,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
        )
    return learning_rate, global_step
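# The schedule built above is TensorFlow's standard exponential decay. For
# reference (and for sanity-checking decay parameters outside a session), here
# is a minimal pure-Python sketch of the same formula; the helper name is
# illustrative and not used elsewhere in this module.
def _exponential_decay_reference(initial_learning_rate, decay_steps, decay_rate, step):
    """Return the learning rate after `step` updates under exponential decay.

    Mirrors tf.train.exponential_decay without the staircase option:
        lr(step) = initial_learning_rate * decay_rate ** (step / decay_steps)
    """
    if decay_rate == 1:
        return initial_learning_rate
    return initial_learning_rate * decay_rate ** (step / decay_steps)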
def learn(
    self,
    n_rollouts=None,
    n_epochs=1,
    batch_size=16,
    print_evaluation=False,
    data=None,
    return_initial_loss=False,
    verbose=True,
):
    """Main training loop."""
    experience_replay = self.experience_replay
    check_greater_equal("n_epochs", n_epochs, 1)
    check_greater_equal("batch_size", batch_size, 1)
    if data is None and experience_replay is None:
        check_greater_equal("n_rollouts", n_rollouts, batch_size)
    # Exactly one data source may be used: explicit data, an experience replay
    # buffer, or freshly collected rollouts.
    assert data is None or experience_replay is None
    assert (not print_evaluation) or (data is not None)

    if data is not None:
        # Encode the raw transitions into the latent space and record the
        # feature range of the play data.
        if "states" in data and "actions" in data and "next_states" in data:
            data = (data["states"], data["actions"], data["next_states"])
            data = encode_transition_data(self.latent_space, data)
        else:
            data = extract_play_data(self.latent_space, data)
        states, actions, next_states = data
        min_val = min(np.min(states), np.min(next_states))
        max_val = max(np.max(states), np.max(next_states))
        self.play_data_range = (float(min_val), float(max_val))
        print("Playdata range of features: [{}, {}]".format(min_val, max_val))
    elif experience_replay is None:
        data = self._collect_data(n_rollouts)

    if data is not None:
        data = shuffle_data(data)
        n_samples = len(data[0])
    else:
        assert experience_replay is not None
        n_samples = len(experience_replay)

    check_less_equal("batch_size", batch_size, n_samples)
    n_batches = n_samples // batch_size

    if self.sess is None:
        self.sess = get_tf_session()
        self.sess.run(tf.global_variables_initializer())

    if self.tensorboard_log is not None:
        summaries_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(self.tensorboard_log,
                                               self.sess.graph)
    else:
        summaries_op = tf.no_op()

    first_epoch_losses = []
    last_epoch_losses = []

    for epoch in range(n_epochs):
        for batch in range(n_batches):
            if data is not None:
                batch_states, batch_actions, batch_next_states = get_batch(
                    data, batch, batch_size)
            else:
                (
                    batch_states,
                    batch_actions,
                    batch_next_states,
                ) = experience_replay.sample(batch_size, normalize=True)

            # A backward model is fed next states and trained to reconstruct
            # the preceding states, so inputs and targets are swapped.
            if self.backward:
                batch_in_states = batch_next_states
                batch_out_states = batch_states
            else:
                batch_in_states = batch_states
                batch_out_states = batch_next_states

            (
                batch_loss,
                _,
                batch_lr,
                summary,
                step,
                mixture_entropy,
            ) = self.sess.run(
                [
                    self.loss,
                    self.optimization_op,
                    self.learning_rate,
                    summaries_op,
                    self.global_step,
                    self.mixture_entropy,
                ],
                feed_dict={
                    self.in_state: batch_in_states,
                    self.in_action: batch_actions,
                    self.out_state: batch_out_states,
                },
            )

            if epoch == 0:
                first_epoch_losses.append(batch_loss)
            if epoch == n_epochs - 1:
                last_epoch_losses.append(batch_loss)

            if self.tensorboard_log is not None:
                summary_writer.add_summary(summary, step)

            if verbose:
                print(
                    "Epoch: {}/{}...".format(epoch + 1, n_epochs),
                    "Batch: {}/{}...".format(batch + 1, n_batches),
                    "Training loss: {:.4f} (ent {:.4f}) ".format(
                        batch_loss, mixture_entropy),
                    "(learning_rate = {:.6f})".format(batch_lr),
                )

    if self.checkpoint_folder is not None:
        params = {
            "hidden_layer_size": self._hidden_layer_size,
            "n_hidden_layers": self._n_hidden_layers,
            "learning_rate": float(self.learning_rate.eval(session=self.sess)),
            "n_out_states": self.n_out_states,
            "gauss_stdev": self.gauss_stdev,
            "play_data_range": self.play_data_range,
        }
        with open("_".join([self.checkpoint_folder, "params.json"]), "w") as f:
            json.dump(params, f)
        self.saver.save(self.sess, self.checkpoint_folder)

    if print_evaluation:
        self._print_evaluation(data)

    if return_initial_loss:
        return np.mean(first_epoch_losses), np.mean(last_epoch_losses)
    return np.mean(last_epoch_losses)
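# Hypothetical usage sketch for the transition-model training loop above. The
# class name `TransitionModel` and the constructor arguments are illustrative
# only; the real class and its helpers are defined elsewhere in this repository.
#
#   model = TransitionModel(latent_space, checkpoint_folder="checkpoints/transition")
#   final_loss = model.learn(n_rollouts=10000, n_epochs=5, batch_size=64)
#
# Exactly one data source is used per call: pre-recorded play data via `data`,
# an attached `experience_replay` buffer, or `n_rollouts` freshly collected
# rollouts when neither is given.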
def __init__(
    self,
    env,
    tensorboard_log=None,
    checkpoint_folder=None,
    observation_dist="gaussian",
    pixel_observations=False,
    hidden_layer_size=200,
    n_hidden_layers=2,
    rnn_state_size=30,
    learning_rate=3e-4,
    obs_stddev=0.01,
    likelihood_scale=1,
    mujoco_video_out_path="mujoco",
    timesteps_training=20,
    fixed_latent_stddev=None,
):
    assert isinstance(env.action_space, gym.spaces.Box)
    self.env = env
    self.action_space_shape = list(self.env.action_space.shape)
    self.pixel_observations = pixel_observations
    self.observation_dist = observation_dist
    if self.pixel_observations:
        self.data_shape = [64, 64, 3]
        self._obs_stddev = 1
    else:
        self.data_shape = list(self.env.observation_space.shape)
        self._obs_stddev = obs_stddev
    self._rnn_state_size = rnn_state_size
    self._num_layers = n_hidden_layers
    self._hidden_layer_size = hidden_layer_size
    self._min_stddev = 0.01
    # NOTE: hard-coded; the likelihood_scale constructor argument is not used
    # here (an alternative value of 1e-2 is kept for reference).
    self.likelihood_scale = 1  # 1e-2
    self._fixed_latent_stddev = fixed_latent_stddev
    self.state_size = 3 * self._rnn_state_size
    self.mujoco_video_out_path = mujoco_video_out_path
    self.timesteps = timesteps_training
    self.N_samples = 3
    self.N_samples_for_gradient = 1
    check_greater_equal("N_samples", self.N_samples,
                        self.N_samples_for_gradient)

    if self.pixel_observations and self.observation_dist != "gaussian":
        raise ValueError(
            'Pixel observations require the observation model to be "gaussian"'
        )

    self.time_axis = tf.convert_to_tensor([1])

    self._define_input_placeholders()
    self._define_model()
    self.loss = self._define_loss()

    self.learning_rate, self.global_step = get_learning_rate(
        learning_rate, None, 1)
    self.optimizer = tf.train.AdamOptimizer(self.learning_rate, epsilon=1e-4)
    self.gradients = self.optimizer.compute_gradients(loss=self.loss)
    self.gradients = [(tf.clip_by_norm(grad, 1000), var)
                      for grad, var in self.gradients]
    self.optimization_op = self.optimizer.apply_gradients(
        self.gradients, global_step=self.global_step)

    self.tensorboard_log = tensorboard_log
    if self.tensorboard_log is not None:
        self._define_tensorboard_metrics()
    self.checkpoint_folder = checkpoint_folder
    if self.checkpoint_folder is not None:
        self.saver = tf.train.Saver()

    self.policy = None
    self.inverse_policy = None
    self.sess = None
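# Hypothetical construction sketch for the model defined by the __init__ above.
# The class name `LatentSpaceModel` and the environment id are illustrative;
# substitute the actual class and any gym environment with a Box action space.
#
#   env = gym.make("Pendulum-v0")
#   model = LatentSpaceModel(
#       env,
#       tensorboard_log="logs/latent",
#       checkpoint_folder="checkpoints/latent",
#       pixel_observations=False,
#       timesteps_training=20,
#   )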
def learn(
    self,
    n_rollouts=10000,
    n_epochs=15,
    batch_size=1,
    print_evaluation=False,
    play_data=None,
    return_initial_loss=False,
):
    """Main training loop."""
    check_greater_equal("n_epochs", n_epochs, 1)
    check_greater_equal("batch_size", batch_size, 1)
    check_greater_equal("n_rollouts", n_rollouts, batch_size)

    use_play_data = play_data is not None
    if use_play_data:
        data = extract_play_data(play_data, self.timesteps)
    else:
        data = self._collect_data(n_rollouts)

    n_samples = len(data[0])
    assert n_samples == len(data[1])
    check_less_equal("batch_size", batch_size, n_samples)
    n_batches = n_samples // batch_size

    if self.sess is None:
        self.sess = get_tf_session()
        self.sess.run(tf.global_variables_initializer())

    if self.tensorboard_log is not None:
        summaries_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(self.tensorboard_log,
                                               self.sess.graph)
    else:
        summaries_op = tf.no_op()

    first_epoch_losses = []
    last_epoch_losses = []

    for epoch in range(n_epochs):
        for batch in range(n_batches):
            batch_observations, batch_actions = get_batch(
                data, batch, batch_size, self.timesteps)

            batch_loss, batch_elbo, _, batch_lr, summary, step = self.sess.run(
                [
                    self.loss,
                    self.elbo,
                    self.optimization_op,
                    self.learning_rate,
                    summaries_op,
                    self.global_step,
                ],
                feed_dict={
                    self.in_obs_seq: batch_observations,
                    self.in_actions: batch_actions,
                    self.horizon: self.timesteps + 1,
                },
            )

            if epoch == 0:
                first_epoch_losses.append(batch_loss)
            if epoch == n_epochs - 1:
                last_epoch_losses.append(batch_loss)

            if self.tensorboard_log is not None:
                # Periodically render MuJoCo videos when training on play data.
                if (step - 1) % 110 == 0:
                    if use_play_data:
                        self._create_mujoco_vids(self.env, batch_observations,
                                                 batch_actions, step)
                summary_writer.add_summary(summary, step)

            print(
                "Epoch: {}/{}...".format(epoch + 1, n_epochs),
                "Batch: {}/{}...".format(batch + 1, n_batches),
                "Training loss: {:.4f} (learning_rate = {:.6f})".format(
                    batch_loss, batch_lr),
                flush=True,
            )

    if self.checkpoint_folder is not None:
        params = {
            "observation_dist": self.observation_dist,
            "pixel_observations": self.pixel_observations,
            "hidden_layer_size": int(self._hidden_layer_size),
            "n_hidden_layers": int(self._num_layers),
            "rnn_state_size": int(self._rnn_state_size),
            "learning_rate": float(self.learning_rate.eval(session=self.sess)),
            "obs_stddev": float(self._obs_stddev),
            "likelihood_scale": float(self.likelihood_scale),
            "mujoco_video_out_path": self.mujoco_video_out_path,
            "timesteps_training": int(self.timesteps),
            "fixed_latent_stddev": float(self._fixed_latent_stddev)
            if self._fixed_latent_stddev is not None else None,
        }
        with open("_".join([self.checkpoint_folder, "params.json"]), "w") as f:
            json.dump(params, f)
        self.saver.save(self.sess, self.checkpoint_folder)

    if print_evaluation:
        if use_play_data:
            print("Warning: print_evaluation is only supported for gridworlds.")
        else:
            self._print_evaluation()

    if return_initial_loss:
        return np.mean(first_epoch_losses), np.mean(last_epoch_losses)
    return np.mean(last_epoch_losses)
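# Hypothetical usage sketch for the recurrent-model training loop above (class
# name `LatentSpaceModel` is illustrative). The checkpoint together with the
# "<checkpoint_folder>_params.json" file written at the end of learn()
# describes the trained model.
#
#   model = LatentSpaceModel(env, checkpoint_folder="checkpoints/latent")
#   first_loss, final_loss = model.learn(
#       n_rollouts=10000, n_epochs=15, batch_size=8, return_initial_loss=True)
#
# Passing `play_data` skips rollout collection and trains on the recorded
# trajectories instead; in that case MuJoCo videos are written to
# `mujoco_video_out_path` every 110 optimization steps when TensorBoard
# logging is enabled.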