Example #1
    def _perform_update_steps(self, observ, action, old_policy_params, reward,
                              length):
        """Perform multiple update steps of value function and policy.

        The advantage is computed once at the beginning and shared across
        iterations. Only one iteration's summary is returned, so we choose the
        one after half of the iterations.

        Args:
          observ: Sequences of observations.
          action: Sequences of actions.
          old_policy_params: Parameters of the behavioral policy.
          reward: Sequences of rewards.
          length: Batch of sequence lengths.

        Returns:
          Summary tensor.
        """
        # NOTE: OFFENSE and DEFENSE receive opposite rewards, so the sign is
        # flipped when optimizing the defense.
        reward = tf.where(self._is_optimizing_offense, reward, -reward)
        return_ = utility.discounted_return(reward, length,
                                            self._config.discount)
        value = self._network(observ, length).value
        value = tf.where(self._is_optimizing_offense, value[TEAM['OFFENSE']],
                         value[TEAM['DEFENSE']])
        if self._config.gae_lambda:  # Use generalized advantage estimation.
            advantage = utility.lambda_advantage(reward, value, length,
                                                 self._config.discount,
                                                 self._config.gae_lambda)
        else:
            advantage = return_ - value
        mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
        advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
        advantage = tf.Print(advantage,
                             [tf.reduce_mean(return_),
                              tf.reduce_mean(value)], 'return and value: ')
        advantage = tf.Print(advantage, [tf.reduce_mean(advantage)],
                             'normalized advantage: ')
        episodes = (observ, action, old_policy_params[ACT['DECISION']],
                    old_policy_params[ACT['OFF_DASH']],
                    old_policy_params[ACT['DEF_DASH']], reward, advantage)
        value_loss, policy_loss, summary = parts.iterate_sequences(
            self._update_step, [0., 0., ''],
            episodes,
            length,
            self._config.chunk_length,
            self._config.batch_size,
            self._config.update_epochs,
            padding_value=1)
        print_losses = tf.group(
            tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
            tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
        with tf.control_dependencies([value_loss, policy_loss, print_losses]):
            return summary[self._config.update_epochs // 2]
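
When gae_lambda is set, the advantage comes from utility.lambda_advantage rather than from return minus value. As a rough illustration of the underlying generalized advantage estimate, here is a minimal NumPy sketch for a single unpadded sequence; the function name, the backward accumulation, and the zero bootstrap value at the end of the episode are assumptions for illustration, not taken from the library itself:

# Minimal sketch of a lambda-return (GAE) advantage for one unpadded
# sequence; padded steps and batching are intentionally omitted.
import numpy as np

def lambda_advantage_sketch(reward, value, discount, gae_lambda):
    advantage = np.zeros(len(reward), dtype=np.float64)
    accumulated = 0.0
    for t in reversed(range(len(reward))):
        # Bootstrap with 0 after the last step of the episode (assumption).
        next_value = value[t + 1] if t + 1 < len(reward) else 0.0
        delta = reward[t] + discount * next_value - value[t]
        accumulated = delta + discount * gae_lambda * accumulated
        advantage[t] = accumulated
    return advantage

# Example call: lambda_advantage_sketch([1.0, 1.0, 1.0], [0.5, 0.5, 0.5], 0.99, 0.95)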
Example #2
    def _perform_update_steps(self, observ, action, old_policy_params, reward,
                              length, expert_s, expert_a):
        """Perform multiple update steps of value function and policy.

        The advantage is computed once at the beginning and shared across
        iterations. Only one iteration's summary is returned, so we choose the
        one after half of the iterations.

        Args:
          observ: Sequences of observations.
          action: Sequences of actions.
          old_policy_params: Parameters of the behavioral policy.
          reward: Sequences of rewards.
          length: Batch of sequence lengths.
          expert_s: Sequences of expert observations.
          expert_a: Sequences of expert actions.

        Returns:
          Summary tensor.
        """
        value = self._network(observ, length).value
        return_ = reward
        advantage = return_ - value
        mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
        advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
        advantage = tf.Print(advantage,
                             [tf.reduce_mean(return_),
                              tf.reduce_mean(value)], 'return and value: ')
        advantage = tf.Print(advantage, [tf.reduce_mean(advantage)],
                             'normalized advantage: ')
        episodes = (observ, action, old_policy_params[ACT['DEF_DASH']], reward,
                    advantage, expert_s, expert_a)
        value_loss, policy_loss, summary = parts.iterate_sequences(
            self._update_step, [0., 0., ''],
            episodes,
            length,
            self._config.chunk_length,
            self._config.batch_size,
            self._config.update_epochs,
            padding_value=1)
        print_losses = tf.group(
            tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
            tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
        with tf.control_dependencies([value_loss, policy_loss, print_losses]):
            return tf.summary.merge([
                summary[self._config.update_epochs // 2],
                tf.summary.scalar('normed_accumulated_reward',
                                  tf.reduce_mean(return_))
            ])
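
Example #2 skips the discounted return and GAE entirely: the raw reward tensor is used as the return, the advantage is return minus value, and the result is standardized over the batch and time axes before being passed to the update step. A minimal NumPy sketch of that standardization, assuming a [batch, time] advantage array (the 1e-8 term is the same numerical guard used above):

# Minimal sketch of the advantage standardization done with tf.nn.moments.
import numpy as np

def standardize_sketch(advantage):
    mean = advantage.mean(axis=(0, 1), keepdims=True)
    std = advantage.std(axis=(0, 1), keepdims=True)
    # Zero mean, unit variance; the epsilon avoids division by zero.
    return (advantage - mean) / (std + 1e-8)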
Example #3
  def _perform_update_steps(
      self, observ, action, old_policy_params, reward, length):
    """Perform multiple update steps of value function and policy.

    The advantage is computed once at the beginning and shared across
    iterations. Only one iteration's summary is returned, so we choose the
    one after half of the iterations.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_policy_params: Parameters of the behavioral policy.
      reward: Sequences of rewards.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
    return_ = utility.discounted_return(
        reward, length, self._config.discount)
    value = self._network(observ, length).value
    if self._config.gae_lambda:
      advantage = utility.lambda_advantage(
          reward, value, length, self._config.discount,
          self._config.gae_lambda)
    else:
      advantage = return_ - value
    mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
    advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
    advantage = tf.Print(
        advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
        'return and value: ')
    advantage = tf.Print(
        advantage, [tf.reduce_mean(advantage)],
        'normalized advantage: ')
    episodes = (observ, action, old_policy_params, reward, advantage)
    value_loss, policy_loss, summary = parts.iterate_sequences(
        self._update_step, [0., 0., ''], episodes, length,
        self._config.chunk_length,
        self._config.batch_size,
        self._config.update_epochs,
        padding_value=1)
    print_losses = tf.group(
        tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
        tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
    with tf.control_dependencies([value_loss, policy_loss, print_losses]):
      return summary[self._config.update_epochs // 2]
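
For reference, the baseline advantage in the else branch is the discounted return minus the predicted value. Here is a minimal NumPy sketch of a backward-accumulated discounted return for one unpadded sequence; the per-sequence masking that utility.discounted_return presumably applies using length is omitted and is an assumption about the library's behavior:

# Minimal sketch of a Monte Carlo discounted return for one sequence.
import numpy as np

def discounted_return_sketch(reward, discount):
    return_ = np.zeros(len(reward), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(reward))):
        running = reward[t] + discount * running
        return_[t] = running
    return return_

# Example call: discounted_return_sketch([0.0, 0.0, 1.0], 0.99)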