Example #1
    def train_bc(self, expert_dataset_iter):
        """Performs a single training step of behavior clonning.

    The method optimizes MLE on the expert dataset.

    Args:
      expert_dataset_iter: An tensorflow graph iteratable object.
    """

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(self.actor.variables)
            states, actions, _ = next(expert_dataset_iter)
            log_probs = self.actor.get_log_prob(states, actions)
            actor_loss = tf.reduce_mean(
                -log_probs) + keras_utils.orthogonal_regularization(
                    self.actor.trunk)

        actor_grads = tape.gradient(actor_loss, self.actor.variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grads, self.actor.variables))

        self.avg_actor_loss(actor_loss)

        if tf.equal(self.actor_optimizer.iterations % self.log_interval, 0):
            tf.summary.scalar('train bc/actor_loss',
                              self.avg_actor_loss.result(),
                              step=self.actor_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_actor_loss)
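
A minimal usage sketch (not part of the original example): it assumes `import tensorflow as tf`, a learner object exposing `train_bc` as above, and expert transitions held in arrays; `learner`, `expert_states`, `expert_actions`, `expert_next_states`, and `num_bc_steps` are placeholder names.

    # Hypothetical setup: wrap expert transitions in a repeating tf.data
    # iterator so that next(...) yields (states, actions, next_states) batches.
    expert_ds = tf.data.Dataset.from_tensor_slices(
        (expert_states, expert_actions, expert_next_states))
    expert_dataset_iter = iter(
        expert_ds.shuffle(10000).repeat().batch(256))

    for _ in range(num_bc_steps):
        learner.train_bc(expert_dataset_iter)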
Example #2
    def fit_actor(self, states, target_entropy):
        """Updates actor parameters.

    Args:
      states: A batch of states.
      target_entropy: Target entropy value for alpha.

    Returns:
      Actor and alpha losses.
    """
        is_non_absorbing_mask = tf.cast(tf.equal(states[:, -1:], 0.0),
                                        tf.float32)

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(self.actor.variables)
            _, actions, log_probs = self.actor(states)
            q1, q2 = self.critic(states, actions)
            q = tf.minimum(q1, q2)
            actor_loss = tf.reduce_sum(
                is_non_absorbing_mask * (self.alpha * log_probs - q)) / (
                    tf.reduce_sum(is_non_absorbing_mask) + EPS)

            actor_loss += keras_utils.orthogonal_regularization(
                self.actor.trunk)

        actor_grads = tape.gradient(actor_loss, self.actor.variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grads, self.actor.variables))

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch([self.log_alpha])
            alpha_loss = tf.reduce_sum(
                is_non_absorbing_mask * self.alpha *
                (-log_probs - target_entropy)) / (
                    tf.reduce_sum(is_non_absorbing_mask) + EPS)

        if self.learn_alpha:
            alpha_grads = tape.gradient(alpha_loss, [self.log_alpha])
            self.alpha_optimizer.apply_gradients(
                zip(alpha_grads, [self.log_alpha]))

        return actor_loss, alpha_loss, -log_probs
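
    # A minimal sketch (an assumption, not from the original snippet):
    # `self.alpha` used above is taken to be the exponential of the trainable
    # `self.log_alpha`, the standard SAC temperature parameterization.
    @property
    def alpha(self):
        return tf.exp(self.log_alpha)
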
    def update(self,
               expert_dataset_iter,
               policy_dataset_iter,
               discount,
               replay_regularization=0.05,
               nu_reg=10.0):
        """A function that updates nu network.

    When replay regularization is non-zero, it learns
    (d_pi * (1 - replay_regularization) + d_rb * replay_regulazation) /
    (d_expert * (1 - replay_regularization) + d_rb * replay_regulazation)
    instead.

    Args:
      expert_dataset_iter: An tensorflow graph iteratable over expert data.
      policy_dataset_iter: An tensorflow graph iteratable over training policy
        data, used for regularization.
      discount: An MDP discount.
      replay_regularization: A fraction of samples to add from a replay buffer.
      nu_reg: A grad penalty regularization coefficient.
    """

        (expert_states, expert_actions,
         expert_next_states) = expert_dataset_iter.get_next()

        expert_initial_states = expert_states

        rb_states, rb_actions, rb_next_states, _, _ = (
            policy_dataset_iter.get_next()[0])

        with tf.GradientTape(watch_accessed_variables=False,
                             persistent=True) as tape:
            tape.watch(self.actor.variables)
            tape.watch(self.nu_net.variables)

            _, policy_next_actions, _ = self.actor(expert_next_states)
            _, rb_next_actions, rb_log_prob = self.actor(rb_next_states)

            _, policy_initial_actions, _ = self.actor(expert_initial_states)

            # Inputs for the linear part of DualDICE loss.
            expert_init_inputs = tf.concat(
                [expert_initial_states, policy_initial_actions], 1)

            expert_inputs = tf.concat([expert_states, expert_actions], 1)
            expert_next_inputs = tf.concat(
                [expert_next_states, policy_next_actions], 1)

            rb_inputs = tf.concat([rb_states, rb_actions], 1)
            rb_next_inputs = tf.concat([rb_next_states, rb_next_actions], 1)

            expert_nu_0 = self.nu_net(expert_init_inputs)
            expert_nu = self.nu_net(expert_inputs)
            expert_nu_next = self.nu_net(expert_next_inputs)

            rb_nu = self.nu_net(rb_inputs)
            rb_nu_next = self.nu_net(rb_next_inputs)

            expert_diff = expert_nu - discount * expert_nu_next
            rb_diff = rb_nu - discount * rb_nu_next

            linear_loss_expert = tf.reduce_mean(expert_nu_0 * (1 - discount))

            linear_loss_rb = tf.reduce_mean(rb_diff)

            rb_expert_diff = tf.concat([expert_diff, rb_diff], 0)
            rb_expert_weights = tf.concat([
                tf.ones(expert_diff.shape) * (1 - replay_regularization),
                tf.ones(rb_diff.shape) * replay_regularization
            ], 0)

            rb_expert_weights /= tf.reduce_sum(rb_expert_weights)
            non_linear_loss = tf.reduce_sum(
                tf.stop_gradient(
                    weighted_softmax(rb_expert_diff, rb_expert_weights,
                                     axis=0)) * rb_expert_diff)

            linear_loss = (linear_loss_expert * (1 - replay_regularization) +
                           linear_loss_rb * replay_regularization)

            loss = (non_linear_loss - linear_loss)

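            # Gradient penalty for the nu network, evaluated on random
            # interpolations between expert and replay-buffer inputs
            # (a WGAN-GP style regularizer).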
            alpha = tf.random.uniform(shape=(expert_inputs.shape[0], 1))

            nu_inter = alpha * expert_inputs + (1 - alpha) * rb_inputs
            nu_next_inter = alpha * expert_next_inputs + (
                1 - alpha) * rb_next_inputs

            nu_inter = tf.concat([nu_inter, nu_next_inter], 0)

            with tf.GradientTape(watch_accessed_variables=False) as tape2:
                tape2.watch(nu_inter)
                nu_output = self.nu_net(nu_inter)
            nu_grad = tape2.gradient(nu_output, [nu_inter])[0] + EPS
            nu_grad_penalty = tf.reduce_mean(
                tf.square(tf.norm(nu_grad, axis=-1, keepdims=True) - 1))

            nu_loss = loss + nu_grad_penalty * nu_reg
            pi_loss = -loss + keras_utils.orthogonal_regularization(
                self.actor.trunk)

        nu_grads = tape.gradient(nu_loss, self.nu_net.variables)
        pi_grads = tape.gradient(pi_loss, self.actor.variables)

        self.nu_optimizer.apply_gradients(zip(nu_grads, self.nu_net.variables))
        self.actor_optimizer.apply_gradients(
            zip(pi_grads, self.actor.variables))

        del tape

        self.avg_nu_expert(expert_nu)
        self.avg_nu_rb(rb_nu)

        self.nu_reg_metric(nu_grad_penalty)
        self.avg_loss(loss)

        self.avg_actor_loss(pi_loss)
        self.avg_actor_entropy(-rb_log_prob)

        if tf.equal(self.nu_optimizer.iterations % self.log_interval, 0):
            tf.summary.scalar('train dual dice/loss',
                              self.avg_loss.result(),
                              step=self.nu_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_loss)

            tf.summary.scalar('train dual dice/nu expert',
                              self.avg_nu_expert.result(),
                              step=self.nu_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_nu_expert)

            tf.summary.scalar('train dual dice/nu rb',
                              self.avg_nu_rb.result(),
                              step=self.nu_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_nu_rb)

            tf.summary.scalar('train dual dice/nu reg',
                              self.nu_reg_metric.result(),
                              step=self.nu_optimizer.iterations)
            keras_utils.my_reset_states(self.nu_reg_metric)

        if tf.equal(self.actor_optimizer.iterations % self.log_interval, 0):
            tf.summary.scalar('train sac/actor_loss',
                              self.avg_actor_loss.result(),
                              step=self.actor_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_actor_loss)

            tf.summary.scalar('train sac/actor entropy',
                              self.avg_actor_entropy.result(),
                              step=self.actor_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_actor_entropy)
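
The snippets above rely on two helpers that are not shown: `weighted_softmax` and `keras_utils.orthogonal_regularization`. The sketches below are plausible implementations under stated assumptions, not the original definitions: `weighted_softmax` is assumed to compute a softmax whose exponentials are reweighted by `weights`, and the orthogonal regularizer is assumed to penalize off-diagonal entries of W^T W for each Dense layer in the given model.

    import tensorflow as tf

    def weighted_softmax(x, weights, axis=0):
        # Subtract the max along `axis` for numerical stability, then form a
        # softmax whose terms are scaled by `weights`.
        x = x - tf.reduce_max(x, axis=axis)
        return weights * tf.exp(x) / tf.reduce_sum(
            weights * tf.exp(x), axis=axis, keepdims=True)

    def orthogonal_regularization(model, reg_coef=1e-4):
        # Penalizes off-diagonal entries of W^T W for every Dense layer,
        # encouraging approximately orthogonal weight columns.
        reg = 0.0
        for layer in model.layers:
            if isinstance(layer, tf.keras.layers.Dense):
                prod = tf.matmul(tf.transpose(layer.kernel), layer.kernel)
                reg += tf.reduce_sum(
                    tf.square(prod * (1 - tf.eye(prod.shape[0]))))
        return reg * reg_coef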