Code Example #1
File: train.py    Project: HumanCompatibleAI/deep-rlsp
def get_learning_rate(initial_learning_rate, decay_steps, decay_rate):
    """Return a learning-rate tensor and the global_step variable that drives it.

    With decay_rate == 1 the rate is a constant tensor; otherwise it decays
    exponentially by a factor of decay_rate every decay_steps optimizer steps.
    """
    global_step = tf.Variable(0, trainable=False)
    if decay_rate == 1:
        learning_rate = tf.convert_to_tensor(initial_learning_rate)
    else:
        check_between("decay_rate", decay_rate, 0, 1)
        check_greater_equal("decay_steps", decay_steps, 1)
        learning_rate = tf.train.exponential_decay(
            initial_learning_rate,
            global_step,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
        )
    return learning_rate, global_step
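
A minimal usage sketch for get_learning_rate (TensorFlow 1.x), assuming a scalar loss tensor is already defined; check_between and check_greater_equal are validation helpers from the project. The returned global_step has to be handed to the optimizer so the exponential-decay schedule actually advances with each update:

import tensorflow as tf

# toy loss, only to make the sketch self-contained
weights = tf.Variable([1.0, 2.0])
loss = tf.reduce_mean(tf.square(weights))

# multiply the rate by 0.96 every 1000 optimizer steps
learning_rate, global_step = get_learning_rate(
    initial_learning_rate=3e-4, decay_steps=1000, decay_rate=0.96)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.minimize(loss, global_step=global_step)
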
Code Example #2
    def learn(
        self,
        n_rollouts=None,
        n_epochs=1,
        batch_size=16,
        print_evaluation=False,
        data=None,
        return_initial_loss=False,
        verbose=True,
    ):
        """
        Main training loop
        """
        experience_replay = self.experience_replay

        check_greater_equal("n_epochs", n_epochs, 1)
        check_greater_equal("batch_size", batch_size, 1)
        if data is None and experience_replay is None:
            check_greater_equal("n_rollouts", n_rollouts, batch_size)

        assert data is None or experience_replay is None
        assert (not print_evaluation) or (data is not None)

        if data is not None:
            if "states" in data and "actions" in data and "next_states" in data:
                data = (data["states"], data["actions"], data["next_states"])
                data = encode_transition_data(self.latent_space, data)
            else:
                data = extract_play_data(self.latent_space, data)
                states, actions, next_states = data
                min_val = min(np.min(states), np.min(next_states))
                max_val = max(np.max(states), np.max(next_states))
                self.play_data_range = (float(min_val), float(max_val))
                print("Playdata range of features: [{}, {}]".format(
                    min_val, max_val))
        elif experience_replay is None:
            data = self._collect_data(n_rollouts)

        if data is not None:
            data = shuffle_data(data)
            n_samples = len(data[0])
        else:
            assert experience_replay is not None
            n_samples = len(experience_replay)

        check_less_equal("batch_size", batch_size, n_samples)
        n_batches = n_samples // batch_size

        if self.sess is None:
            self.sess = get_tf_session()
            self.sess.run(tf.global_variables_initializer())

        if self.tensorboard_log is not None:
            summaries_op = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(self.tensorboard_log,
                                                   self.sess.graph)
        else:
            summaries_op = tf.no_op()

        first_epoch_losses = []
        last_epoch_losses = []

        for epoch in range(n_epochs):
            for batch in range(n_batches):
                if data is not None:
                    batch_states, batch_actions, batch_next_states = get_batch(
                        data, batch, batch_size)
                else:
                    (
                        batch_states,
                        batch_actions,
                        batch_next_states,
                    ) = experience_replay.sample(batch_size, normalize=True)

                if self.backward:
                    batch_in_states = batch_next_states
                    batch_out_states = batch_states
                else:
                    batch_in_states = batch_states
                    batch_out_states = batch_next_states

                (
                    batch_loss,
                    _,
                    batch_lr,
                    summary,
                    step,
                    mixture_entropy,
                ) = self.sess.run(
                    [
                        self.loss,
                        self.optimization_op,
                        self.learning_rate,
                        summaries_op,
                        self.global_step,
                        self.mixture_entropy,
                    ],
                    feed_dict={
                        self.in_state: batch_in_states,
                        self.in_action: batch_actions,
                        self.out_state: batch_out_states,
                    },
                )

                if epoch == 0:
                    first_epoch_losses.append(batch_loss)
                if epoch == n_epochs - 1:
                    last_epoch_losses.append(batch_loss)

                if self.tensorboard_log is not None:
                    summary_writer.add_summary(summary, step)

                if verbose:
                    print(
                        "Epoch: {}/{}...".format(epoch + 1, n_epochs),
                        "Batch: {}/{}...".format(batch + 1, n_batches),
                        "Training loss: {:.4f}   (ent {:.4f})  ".format(
                            batch_loss, mixture_entropy),
                        "(learning_rate = {:.6f})".format(batch_lr),
                    )

            if self.checkpoint_folder is not None:
                params = {
                    "hidden_layer_size": self._hidden_layer_size,
                    "n_hidden_layers": self._n_hidden_layers,
                    "learning_rate": float(
                        self.learning_rate.eval(session=self.sess)),
                    "n_out_states": self.n_out_states,
                    "gauss_stdev": self.gauss_stdev,
                    "play_data_range": self.play_data_range,
                }
                with open("_".join([self.checkpoint_folder, "params.json"]),
                          "w") as f:
                    json.dump(params, f)

                self.saver.save(self.sess, self.checkpoint_folder)

        if print_evaluation:
            self._print_evaluation(data)

        if return_initial_loss:
            return np.mean(first_epoch_losses), np.mean(last_epoch_losses)
        return np.mean(last_epoch_losses)
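
A hypothetical call-site sketch for the learn() method above; model stands in for whatever object defines the method, and the array shapes are illustrative assumptions. When a data dict is passed, the method expects the keys "states", "actions" and "next_states":

import numpy as np

# dummy transition data; shapes are placeholders
data = {
    "states": np.random.randn(256, 4),
    "actions": np.random.randn(256, 1),
    "next_states": np.random.randn(256, 4),
}
first_loss, last_loss = model.learn(
    n_epochs=10, batch_size=32, data=data, return_initial_loss=True)
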
Code Example #3
    def __init__(
        self,
        env,
        tensorboard_log=None,
        checkpoint_folder=None,
        observation_dist="gaussian",
        pixel_observations=False,
        hidden_layer_size=200,
        n_hidden_layers=2,
        rnn_state_size=30,
        learning_rate=3e-4,
        obs_stddev=0.01,
        likelihood_scale=1,
        mujoco_video_out_path="mujoco",
        timesteps_training=20,
        fixed_latent_stddev=None,
    ):
        assert isinstance(env.action_space, gym.spaces.Box)
        self.env = env
        self.action_space_shape = list(self.env.action_space.shape)

        self.pixel_observations = pixel_observations
        self.observation_dist = observation_dist

        if self.pixel_observations:
            self.data_shape = [64, 64, 3]
        else:
            self.data_shape = list(self.env.observation_space.shape)

        # the observation stddev always comes from the constructor argument
        self._obs_stddev = obs_stddev
        self._rnn_state_size = rnn_state_size
        self._num_layers = n_hidden_layers
        self._hidden_layer_size = hidden_layer_size
        self._min_stddev = 0.01
        self.likelihood_scale = likelihood_scale
        self._fixed_latent_stddev = fixed_latent_stddev

        self.state_size = 3 * self._rnn_state_size

        self.mujoco_video_out_path = mujoco_video_out_path
        self.timesteps = timesteps_training
        self.N_samples = 3
        self.N_samples_for_gradient = 1
        check_greater_equal("N_samples", self.N_samples,
                            self.N_samples_for_gradient)

        if self.pixel_observations and self.observation_dist != "gaussian":
            raise ValueError(
                'Pixel observations require the observation model to be "gaussian"'
            )

        self.time_axis = tf.convert_to_tensor([1])

        self._define_input_placeholders()
        self._define_model()

        self.loss = self._define_loss()
        self.learning_rate, self.global_step = get_learning_rate(
            learning_rate, None, 1)

        self.optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                                epsilon=1e-4)
        self.gradients = self.optimizer.compute_gradients(loss=self.loss)
        self.gradients = [(tf.clip_by_norm(grad, 1000), var)
                          for grad, var in self.gradients]
        self.optimization_op = self.optimizer.apply_gradients(
            self.gradients, global_step=self.global_step)

        self.tensorboard_log = tensorboard_log
        if self.tensorboard_log is not None:
            self._define_tensorboard_metrics()

        self.checkpoint_folder = checkpoint_folder
        if self.checkpoint_folder is not None:
            self.saver = tf.train.Saver()

        self.policy = None
        self.inverse_policy = None
        self.sess = None
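
Note the learning-rate wiring: get_learning_rate(learning_rate, None, 1) takes the decay_rate == 1 branch of Code Example #1, so the rate is a constant tensor and the decay_steps=None argument is never read. A hypothetical construction sketch follows; the class name LatentSpaceModel and the environment id are assumptions, since the snippet shows only __init__:

import gym

env = gym.make("Pendulum-v0")  # any environment with a Box action space
model = LatentSpaceModel(
    env,
    tensorboard_log="logs",
    checkpoint_folder="checkpoints/latent_model",
    learning_rate=3e-4,
    timesteps_training=20,
)
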
Code Example #4
    def learn(
        self,
        n_rollouts=10000,
        n_epochs=15,
        batch_size=1,
        print_evaluation=False,
        play_data=None,
        return_initial_loss=False,
    ):
        """
        Main training loop
        """
        check_greater_equal("n_epochs", n_epochs, 1)
        check_greater_equal("batch_size", batch_size, 1)
        check_greater_equal("n_rollouts", n_rollouts, batch_size)

        use_play_data = play_data is not None
        if use_play_data:
            data = extract_play_data(play_data, self.timesteps)
        else:
            data = self._collect_data(n_rollouts)
        n_samples = len(data[0])
        assert n_samples == len(data[1])

        check_less_equal("batch_size", batch_size, n_samples)
        n_batches = n_samples // batch_size

        if self.sess is None:
            self.sess = get_tf_session()
            self.sess.run(tf.global_variables_initializer())

        if self.tensorboard_log is not None:
            summaries_op = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(self.tensorboard_log,
                                                   self.sess.graph)
        else:
            summaries_op = tf.no_op()

        first_epoch_losses = []
        last_epoch_losses = []

        for epoch in range(n_epochs):
            for batch in range(n_batches):
                batch_observations, batch_actions = get_batch(
                    data, batch, batch_size, self.timesteps)

                batch_loss, batch_elbo, _, batch_lr, summary, step = self.sess.run(
                    [
                        self.loss,
                        self.elbo,
                        self.optimization_op,
                        self.learning_rate,
                        summaries_op,
                        self.global_step,
                    ],
                    feed_dict={
                        self.in_obs_seq: batch_observations,
                        self.in_actions: batch_actions,
                        self.horizon: self.timesteps + 1,
                    },
                )

                if epoch == 0:
                    first_epoch_losses.append(batch_loss)
                if epoch == n_epochs - 1:
                    last_epoch_losses.append(batch_loss)

                if self.tensorboard_log is not None:
                    if (step - 1) % 110 == 0:
                        if use_play_data:
                            self._create_mujoco_vids(self.env,
                                                     batch_observations,
                                                     batch_actions, step)
                    summary_writer.add_summary(summary, step)

                print(
                    "Epoch: {}/{}...".format(epoch + 1, n_epochs),
                    "Batch: {}/{}...".format(batch + 1, n_batches),
                    "Training loss: {:.4f}   (learning_rate = {:.6f})".format(
                        batch_loss, batch_lr),
                    flush=True,
                )

            if self.checkpoint_folder is not None:
                params = {
                    "observation_dist": self.observation_dist,
                    "pixel_observations": self.pixel_observations,
                    "hidden_layer_size": int(self._hidden_layer_size),
                    "n_hidden_layers": int(self._num_layers),
                    "rnn_state_size": int(self._rnn_state_size),
                    "learning_rate": float(
                        self.learning_rate.eval(session=self.sess)),
                    "obs_stddev": float(self._obs_stddev),
                    "likelihood_scale": float(self.likelihood_scale),
                    "mujoco_video_out_path": self.mujoco_video_out_path,
                    "timesteps_training": int(self.timesteps),
                    "fixed_latent_stddev": (
                        float(self._fixed_latent_stddev)
                        if self._fixed_latent_stddev is not None else None),
                }
                with open("_".join([self.checkpoint_folder, "params.json"]),
                          "w") as f:
                    json.dump(params, f)
                self.saver.save(self.sess, self.checkpoint_folder)

        if print_evaluation:
            if use_play_data:
                print(
                    "Warning: print_evaluation is only supported for gridworlds."
                )
            else:
                self._print_evaluation()

        if return_initial_loss:
            return np.mean(first_epoch_losses), np.mean(last_epoch_losses)
        return np.mean(last_epoch_losses)
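
Continuing the construction sketch after Code Example #3 (the model object remains an assumption), training reduces to a single learn() call; with return_initial_loss=True the method returns the mean loss of the first and the last epoch, otherwise only the last:

first_loss, last_loss = model.learn(
    n_rollouts=1000, n_epochs=5, batch_size=8, return_initial_loss=True)
print("first epoch loss: {:.4f}, last epoch loss: {:.4f}".format(
    first_loss, last_loss))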