def _value_loss(self, observ, reward, length):
  """Compute the loss function for the value baseline.

  The value loss is the difference between empirical and approximated returns
  over the collected episodes. Returns the loss tensor and a summary tensor.

  Args:
    observ: Sequences of observations.
    reward: Sequences of rewards.
    length: Batch of sequence lengths.

  Returns:
    Tuple of loss tensor and summary tensor.
  """
  with tf.name_scope('value_loss'):
    value = self._network(observ, length).value
    return_ = utility.discounted_return(
        reward, length, self._config.discount)
    advantage = return_ - value
    value_loss = 0.5 * self._mask(advantage ** 2, length)
    summary = tf.summary.merge([
        tf.summary.histogram('value_loss', value_loss),
        tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
    value_loss = tf.reduce_mean(value_loss)
    return tf.check_numerics(value_loss, 'value_loss'), summary
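# `utility.discounted_return` is not defined in this excerpt. Below is a
# minimal sketch of what it plausibly computes, assuming TensorFlow 1.x is
# imported as `tf` and tensors have shape [batch, time]: a reverse discounted
# cumulative sum of rewards, masked to each sequence's valid length. The name
# `_discounted_return_sketch` is hypothetical; the repository's helper may
# differ in details.
def _discounted_return_sketch(reward, length, discount):
  """Monte-Carlo return R_t = sum_k discount^k * r_{t+k} for every step."""
  timestep = tf.range(reward.shape[1].value)
  # Zero out rewards past each episode's length so padding is ignored.
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  # Scan backwards over time: agg = r_t + discount * agg.
  return_ = tf.reverse(tf.transpose(tf.scan(
      lambda agg, cur: cur + discount * agg,
      tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
      tf.zeros_like(reward[:, -1])), [1, 0]), [1])
  # The returns are regression targets, so no gradient should flow through.
  return tf.stop_gradient(return_)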
def _value_loss(self, observ, reward, length):
  """Compute the loss function for the value baseline.

  The value loss is the difference between empirical and approximated returns
  over the collected episodes. Returns the loss tensor and a summary tensor.

  Args:
    observ: Sequences of observations.
    reward: Sequences of rewards.
    length: Batch of sequence lengths.

  Returns:
    Tuple of loss tensor and summary tensor.
  """
  with tf.name_scope('value_loss'):
    value = self._network(observ, length).value
    # TODO: Calculating the loss for either offense or defense works, because
    # the defensive turn always receives zero reward; all rewards come from
    # the offensive turn.
    value = tf.where(self._is_optimizing_offense,
                     value[TEAM['OFFENSE']], value[TEAM['DEFENSE']])
    return_ = utility.discounted_return(
        reward, length, self._config.discount)
    advantage = return_ - value
    value_loss = 0.5 * self._mask(advantage ** 2, length)
    summary = tf.summary.merge([
        tf.summary.histogram('value_loss', value_loss),
        tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
    value_loss = tf.reduce_mean(value_loss)
    return tf.check_numerics(value_loss, 'value_loss'), summary
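# `self._mask` is shared by the value-loss variants above. A minimal sketch,
# assuming it zeros out time steps beyond each sequence's length so that
# padding does not contribute to the loss; `_mask_sketch` is a hypothetical
# stand-in, not necessarily the actual method.
def _mask_sketch(self, tensor, length):
  """Set padded time steps of a [batch, time] tensor to zero."""
  with tf.name_scope('mask'):
    range_ = tf.range(tensor.shape[1].value)
    mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
    return tensor * mask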
def _perform_update_steps(
    self, observ, action, old_policy_params, reward, length):
  """Perform multiple update steps of value function and policy.

  The advantage is computed once at the beginning and shared across
  iterations. Only the summary of one iteration can be returned, so we choose
  the one after half of the iterations.

  Args:
    observ: Sequences of observations.
    action: Sequences of actions.
    old_policy_params: Parameters of the behavioral policy.
    reward: Sequences of rewards.
    length: Batch of sequence lengths.

  Returns:
    Summary tensor.
  """
  # NOTE: The rewards of OFFENSE and DEFENSE are opposite in sign, so negate
  # the reward when optimizing the defense.
  reward = tf.where(self._is_optimizing_offense, reward, -reward)
  return_ = utility.discounted_return(
      reward, length, self._config.discount)
  value = self._network(observ, length).value
  value = tf.where(self._is_optimizing_offense,
                   value[TEAM['OFFENSE']], value[TEAM['DEFENSE']])
  if self._config.gae_lambda:
    advantage = utility.lambda_advantage(
        reward, value, length, self._config.discount,
        self._config.gae_lambda)
  else:
    advantage = return_ - value
  mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
  advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
  advantage = tf.Print(
      advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
      'return and value: ')
  advantage = tf.Print(
      advantage, [tf.reduce_mean(advantage)], 'normalized advantage: ')
  episodes = (
      observ, action,
      old_policy_params[ACT['DECISION']],
      old_policy_params[ACT['OFF_DASH']],
      old_policy_params[ACT['DEF_DASH']],
      reward, advantage)
  value_loss, policy_loss, summary = parts.iterate_sequences(
      self._update_step, [0., 0., ''], episodes, length,
      self._config.chunk_length, self._config.batch_size,
      self._config.update_epochs, padding_value=1)
  print_losses = tf.group(
      tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
      tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
  with tf.control_dependencies([value_loss, policy_loss, print_losses]):
    return summary[self._config.update_epochs // 2]
def _perform_update_steps(
    self, observ, action, old_policy_params, reward, length):
  """Perform multiple update steps of value function and policy.

  The advantage is computed once at the beginning and shared across
  iterations. Only the summary of one iteration can be returned, so we choose
  the one after half of the iterations.

  Args:
    observ: Sequences of observations.
    action: Sequences of actions.
    old_policy_params: Parameters of the behavioral policy.
    reward: Sequences of rewards.
    length: Batch of sequence lengths.

  Returns:
    Summary tensor.
  """
  return_ = utility.discounted_return(
      reward, length, self._config.discount)
  value = self._network(observ, length).value
  if self._config.gae_lambda:
    advantage = utility.lambda_advantage(
        reward, value, length, self._config.discount,
        self._config.gae_lambda)
  else:
    advantage = return_ - value
  mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
  advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
  advantage = tf.Print(
      advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
      'return and value: ')
  advantage = tf.Print(
      advantage, [tf.reduce_mean(advantage)], 'normalized advantage: ')
  episodes = (observ, action, old_policy_params, reward, advantage)
  value_loss, policy_loss, summary = parts.iterate_sequences(
      self._update_step, [0., 0., ''], episodes, length,
      self._config.chunk_length, self._config.batch_size,
      self._config.update_epochs, padding_value=1)
  print_losses = tf.group(
      tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
      tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
  with tf.control_dependencies([value_loss, policy_loss, print_losses]):
    return summary[self._config.update_epochs // 2]
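# The two variants above call `utility.lambda_advantage` for generalized
# advantage estimation (GAE). A minimal sketch under the same [batch, time]
# layout: compute the TD errors delta_t = r_t + discount * V(s_{t+1}) - V(s_t)
# and accumulate them backwards with factor discount * lambda. Illustrative
# only; the repository's helper may differ.
def _lambda_advantage_sketch(reward, value, length, discount, gae_lambda):
  """GAE: A_t = sum_k (discount * gae_lambda)^k * delta_{t+k}."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  # Bootstrap from the next state's value; zero after the final step.
  next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
  delta = reward + discount * next_value - value
  # Reverse scan over time accumulates the discounted TD errors.
  advantage = tf.reverse(tf.transpose(tf.scan(
      lambda agg, cur: cur + discount * gae_lambda * agg,
      tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
      tf.zeros_like(delta[:, -1])), [1, 0]), [1])
  return tf.stop_gradient(advantage)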
def _perform_update_steps(
    self, observ, action, old_policy, reward, length):
  """Perform multiple update steps of value function and policy.

  The advantage is computed once at the beginning and shared across
  iterations. Only the summary of one iteration can be returned, so we choose
  the one after half of the iterations.

  Args:
    observ: Sequences of observations.
    action: Sequences of actions.
    old_policy: Action distribution of the behavioral policy.
    reward: Sequences of rewards.
    length: Batch of sequence lengths.

  Returns:
    Summary tensor.
  """
  return_ = utility.discounted_return(
      reward, length, self._config.discount)
  value = self._network(observ, length).value
  if self._config.gae_lambda:
    advantage = utility.lambda_return(
        reward, value, length, self._config.discount,
        self._config.gae_lambda)
  else:
    advantage = return_ - value
  mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
  advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
  advantage = tf.Print(
      advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
      'return and value: ')
  advantage = tf.Print(
      advantage, [tf.reduce_mean(advantage)], 'normalized advantage: ')
  # pylint: disable=g-long-lambda
  value_loss, policy_loss, summary = tf.scan(
      lambda _1, _2: self._update_step(
          observ, action, old_policy, reward, advantage, length),
      tf.range(self._config.update_epochs),
      [0., 0., ''], parallel_iterations=1)
  print_losses = tf.group(
      tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
      tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
  with tf.control_dependencies([value_loss, policy_loss, print_losses]):
    return summary[self._config.update_epochs // 2]
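# All three `_perform_update_steps` variants standardize the advantage inline.
# Factored out here as a hypothetical helper for clarity: normalizing to zero
# mean and unit variance across both batch and time keeps the scale of the
# policy gradient stable across training runs.
def _normalize_advantage_sketch(advantage):
  """Standardize advantages over the batch and time dimensions."""
  mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
  # The epsilon guards against division by zero for constant advantages.
  return (advantage - mean) / (tf.sqrt(variance) + 1e-8)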