Code Example #1
    def _adjust_penalty(self, observ, old_mean, old_logstd, length):
        """Adjust the KL policy between the behavioral and current policy.

    Compute how much the policy actually changed during the multiple
    update steps. Adjust the penalty strength for the next training phase if we
    overshot or undershot the target divergence too much.

    Args:
      observ: Sequences of observations.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('adjust_penalty'):
            network = self._network(observ, length)
            assert_change = tf.assert_equal(tf.reduce_all(
                tf.equal(network.mean, old_mean)),
                                            False,
                                            message='policy should change')
            print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
            with tf.control_dependencies([assert_change, print_penalty]):
                kl_change = tf.reduce_mean(
                    self._mask(
                        utility.diag_normal_kl(old_mean, old_logstd,
                                               network.mean, network.logstd),
                        length))
                kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
                maybe_increase = tf.cond(
                    kl_change > 1.3 * self._config.kl_target,
                    # pylint: disable=g-long-lambda
                    lambda: tf.Print(self._penalty.assign(self._penalty * 1.5),
                                     [0], 'increase penalty '),
                    float)
                maybe_decrease = tf.cond(
                    kl_change < 0.7 * self._config.kl_target,
                    # pylint: disable=g-long-lambda
                    lambda: tf.Print(self._penalty.assign(self._penalty / 1.5),
                                     [0], 'decrease penalty '),
                    float)
            with tf.control_dependencies([maybe_increase, maybe_decrease]):
                return tf.summary.merge([
                    tf.summary.scalar('kl_change', kl_change),
                    tf.summary.scalar('penalty', self._penalty)
                ])
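
The adjustment rule above is the adaptive-KL heuristic from PPO: if the measured KL overshoots the target by more than 30%, the penalty grows; if it undershoots by more than 30%, the penalty shrinks. A plain-Python sketch of the same rule outside the graph (the standalone function below is illustrative, not part of the code above):

def adjust_penalty(penalty, kl_change, kl_target):
    # Grow the penalty when the policy moved too far past the target KL,
    # shrink it when it barely moved, otherwise leave it unchanged.
    if kl_change > 1.3 * kl_target:
        penalty *= 1.5
    elif kl_change < 0.7 * kl_target:
        penalty /= 1.5
    return penalty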
Code Example #2
    def perform(self, agent_indices, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            if self._last_state is None:
                state = None
            else:
                state = tf.contrib.framework.nest.map_structure(
                    lambda x: tf.gather(x, agent_indices), self._last_state)
            output = self._network(observ[:, None], tf.ones(observ.shape[0]),
                                   state)
            action = tf.cond(self._is_training, output.policy.sample,
                             lambda: output.mean)
            logprob = output.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', output.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            with tf.control_dependencies([
                    assign_state,
                    tf.scatter_update(self._last_action, agent_indices,
                                      action[:, 0]),
                    tf.scatter_update(self._last_mean, agent_indices,
                                      output.mean[:, 0]),
                    tf.scatter_update(self._last_logstd, agent_indices,
                                      output.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
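
The tf.cond in perform switches between a stochastic action while collecting experience and the deterministic mean at evaluation time. A minimal sketch of that switch in isolation, using the same mean/logstd parameterization (the constants and placeholder below are made up for illustration):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

is_training = tf.placeholder(tf.bool, [])
mean = tf.constant([0.0, 0.5])
logstd = tf.constant([-1.0, -1.0])
# Sample from the diagonal Gaussian during training, act on the mean otherwise.
sample = lambda: mean + tf.exp(logstd) * tf.random_normal(tf.shape(mean))
action = tf.cond(is_training, sample, lambda: mean)

with tf.Session() as sess:
    print(sess.run(action, {is_training: True}))   # noisy sample
    print(sess.run(action, {is_training: False}))  # exact mean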
Code Example #3
    def _update_policy(self, observ, action, old_mean, old_logstd, reward,
                       length):
        """Perform multiple update steps of the policy.

    The advantage is computed once at the beginning and shared across
    iterations. Only one iteration's summary can be returned, so we pick the
    one from the middle iteration.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      reward: Sequences of rewards.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('update_policy'):
            return_ = utility.discounted_return(reward, length,
                                                self._config.discount)
            value = self._network(observ, length).value
            if self._config.gae_lambda:
                advantage = utility.lambda_return(reward, value, length,
                                                  self._config.discount,
                                                  self._config.gae_lambda)
            else:
                advantage = return_ - value
            mean, variance = tf.nn.moments(advantage,
                                           axes=[0, 1],
                                           keep_dims=True)
            advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
            advantage = tf.Print(
                advantage, [tf.reduce_mean(return_),
                            tf.reduce_mean(value)], 'return and value: ')
            advantage = tf.Print(advantage, [tf.reduce_mean(advantage)],
                                 'normalized advantage: ')
            # pylint: disable=g-long-lambda
            loss, summary = tf.scan(
                lambda _1, _2: self._update_policy_step(
                    observ, action, old_mean, old_logstd, advantage, length),
                tf.range(self._config.update_epochs_policy), [0., ''],
                parallel_iterations=1)
            print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'policy loss: ')
            with tf.control_dependencies([loss, print_loss]):
                return summary[self._config.update_epochs_policy // 2]
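
_update_policy computes either a lambda-return advantage or a plain return-minus-baseline advantage and then standardizes it before the update steps. A rough NumPy sketch of the discounted return and the standardization for a single unpadded sequence (the helper name and numbers are illustrative):

import numpy as np

def discounted_return(reward, discount):
    # Backward pass accumulating the discounted sum of future rewards.
    return_ = np.zeros_like(reward)
    acc = 0.0
    for t in reversed(range(len(reward))):
        acc = reward[t] + discount * acc
        return_[t] = acc
    return return_

reward = np.array([1.0, 1.0, 1.0, 0.0])
value = np.array([2.0, 1.5, 1.0, 0.5])
advantage = discounted_return(reward, 0.99) - value
# Standardize as above; the small epsilon guards against a zero variance.
advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
print(advantage)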
Code Example #4
    def _mask(self, tensor, length):
        """Set padding elements of a batch of sequences to zero.

    Useful to then safely sum along the time dimension.

    Args:
      tensor: Tensor of sequences.
      length: Batch of sequence lengths.

    Returns:
      Masked sequences.
    """
        with tf.name_scope('mask'):
            range_ = tf.range(tensor.shape[1].value)
            mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
            masked = tensor * mask
            return tf.check_numerics(masked, 'masked')
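
The mask is built by broadcasting a row of time-step indices against a column of sequence lengths. The same comparison in NumPy, with illustrative shapes:

import numpy as np

length = np.array([2, 4])                  # valid steps per sequence
range_ = np.arange(5)                      # time-step indices 0..4
mask = (range_[None, :] < length[:, None]).astype(np.float32)
# mask == [[1, 1, 0, 0, 0],
#          [1, 1, 1, 1, 0]]
tensor = np.ones((2, 5), np.float32)
print(tensor * mask)                       # padding steps are zeroed out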
Code Example #5
    def end_episode(self, agent_indices):
        """Add episodes to the memory and perform update steps if memory is full.

    During training, add the collected episodes of the batch indices whose
    episode just finished to the memory. If the memory is full, train on it
    and then clear it. A summary string is returned if requested at this step.

    Args:
      agent_indices: Tensor containing current batch indices.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('end_episode/'):
            return tf.cond(self._is_training,
                           lambda: self._define_end_episode(agent_indices),
                           str)
Code Example #6
    def begin_episode(self, agent_indices):
        """Reset the recurrent states and stored episode.

    Args:
      agent_indices: Tensor containing current batch indices.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('begin_episode/'):
            if self._last_state is None:
                reset_state = tf.no_op()
            else:
                reset_state = utility.reinit_nested_vars(
                    self._last_state, agent_indices)
            reset_buffer = self._episodes.clear(agent_indices)
            with tf.control_dependencies([reset_state, reset_buffer]):
                return tf.constant('')
Code Example #7
    def experience(self, observ, action, reward, unused_done, unused_nextob):
        """Process the transition tuple of the current step.

    When training, add the current transition tuple to the memory and update
    the streaming statistics for observations and rewards. A summary string is
    returned if requested at this step.

    Args:
      observ: Batch tensor of observations.
      action: Batch tensor of actions.
      reward: Batch tensor of rewards.
      unused_done: Batch tensor of done flags.
      unused_nextob: Batch tensor of successor observations.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('experience/'):
            return tf.cond(
                self._is_training,
                lambda: self._define_experience(observ, action, reward), str)
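
experience, end_episode, and perform all use the pattern tf.cond(flag, build_summary, str): the Python built-in str() returns an empty string, which gives the untaken branch the same string dtype as the merged summary. A minimal demonstration of the idiom (the placeholder and scalar below are made up):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

should_log = tf.placeholder(tf.bool, [])
value = tf.constant(3.0)
# The false branch yields '' so both branches are scalar strings.
summary = tf.cond(should_log,
                  lambda: tf.summary.scalar('value', value), str)

with tf.Session() as sess:
    print(repr(sess.run(summary, {should_log: False})))    # b''
    print(len(sess.run(summary, {should_log: True})) > 0)  # serialized summary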
Code Example #8
    def _update_value(self, observ, reward, length):
        """Perform multiple update steps of the value baseline.

    Only one iteration's summary can be returned, so we pick the one from the
    middle iteration.

    Args:
      observ: Sequences of observations.
      reward: Sequences of rewards.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('update_value'):
            loss, summary = tf.scan(
                lambda _1, _2: self._update_value_step(observ, reward, length),
                tf.range(self._config.update_epochs_value), [0., ''],
                parallel_iterations=1)
            print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ')
            with tf.control_dependencies([loss, print_loss]):
                return summary[self._config.update_epochs_value // 2]
Code Example #9
    def _training(self):
        """Perform multiple training iterations of both policy and value baseline.

    Trains on the episodes collected in the memory and resets the memory
    afterwards. Always returns a summary string.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('training'):
            assert_full = tf.assert_equal(self._memory_index,
                                          self._config.update_every)
            with tf.control_dependencies([assert_full]):
                data = self._memory.data()
            (observ, action, old_mean, old_logstd, reward), length = data
            with tf.control_dependencies([tf.assert_greater(length, 0)]):
                length = tf.identity(length)
            observ = self._observ_filter.transform(observ)
            reward = self._reward_filter.transform(reward)
            policy_summary = self._update_policy(observ, action, old_mean,
                                                 old_logstd, reward, length)
            with tf.control_dependencies([policy_summary]):
                value_summary = self._update_value(observ, reward, length)
            with tf.control_dependencies([value_summary]):
                penalty_summary = self._adjust_penalty(observ, old_mean,
                                                       old_logstd, length)
            with tf.control_dependencies([penalty_summary]):
                clear_memory = tf.group(self._memory.clear(),
                                        self._memory_index.assign(0))
            with tf.control_dependencies([clear_memory]):
                weight_summary = utility.variable_summaries(
                    tf.trainable_variables(), self._config.weight_summaries)
                return tf.summary.merge([
                    policy_summary, value_summary, penalty_summary,
                    weight_summary
                ])
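
_training chains tf.control_dependencies blocks so the policy update, value update, penalty adjustment, and memory reset run in that order, even though graph execution is otherwise unordered. A stripped-down illustration of the idiom (the counter variable stands in for the real update ops):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

counter = tf.Variable(0, name='counter')
first = counter.assign_add(1)
with tf.control_dependencies([first]):
    # Guaranteed to run only after `first` has executed.
    second = counter.assign_add(10)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(second))  # 11: both increments applied, in order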
Code Example #10
File: simulate.py Project: Gs-001/quad
def simulate(batch_env, algo, log=True, reset=False):
    """Simulation step of a vecrotized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the environments
  into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """
    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        reset_ops = [
            batch_env.reset(agent_indices),
            tf.scatter_update(score, agent_indices, zero_scores),
            tf.scatter_update(length, agent_indices, zero_durations)
        ]
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices)

    def _define_step():
        """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        action, step_summary = algo.perform(prevob)
        action.set_shape(batch_env.action.shape)
        with tf.control_dependencies([batch_env.simulate(action)]):
            add_score = score.assign_add(batch_env.reward)
            inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
        with tf.control_dependencies([add_score, inc_length]):
            experience_summary = algo.experience(prevob, batch_env.action,
                                                 batch_env.reward,
                                                 batch_env.done,
                                                 batch_env.observ)
        return tf.summary.merge([step_summary, experience_summary])

    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)

    def _define_summaries():
        """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
        score_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
            lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
        length_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
            lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
        return tf.summary.merge([score_summary, length_summary])

    with tf.name_scope('simulate'):
        log = tf.convert_to_tensor(log)
        reset = tf.convert_to_tensor(reset)
        with tf.variable_scope('simulate_temporary'):
            score = tf.Variable(tf.zeros(len(batch_env), dtype=tf.float32),
                                False,
                                name='score')
            length = tf.Variable(tf.zeros(len(batch_env), dtype=tf.int32),
                                 False,
                                 name='length')
        mean_score = streaming_mean.StreamingMean((), tf.float32)
        mean_length = streaming_mean.StreamingMean((), tf.float32)
        agent_indices = tf.cond(
            reset, lambda: tf.range(len(batch_env)),
            lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
        begin_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                lambda: _define_begin_episode(agent_indices),
                                str)
        with tf.control_dependencies([begin_episode]):
            step = _define_step()
        with tf.control_dependencies([step]):
            agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
            end_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                  lambda: _define_end_episode(agent_indices),
                                  str)
        with tf.control_dependencies([end_episode]):
            summary = tf.summary.merge(
                [_define_summaries(), begin_episode, step, end_episode])
        with tf.control_dependencies([summary]):
            done, score = tf.identity(batch_env.done), tf.identity(score)
        return done, score, summary
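
simulate picks which environments begin or end an episode with tf.where(batch_env.done)[:, 0], i.e. the indices of the True entries in the batch of done flags. The same selection in isolation (the flags below are illustrative):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

done = tf.constant([False, True, False, True])
# Indices of the environments whose episode just ended.
agent_indices = tf.cast(tf.where(done)[:, 0], tf.int32)

with tf.Session() as sess:
    print(sess.run(agent_indices))  # [1 3]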
Code Example #11
    def _policy_loss(self, mean, logstd, old_mean, old_logstd, action,
                     advantage, length):
        """Compute the policy loss composed of multiple components.

    1. The policy gradient loss is importance sampled from the data-collecting
       policy at the beginning of training.
    2. The second term is a KL penalty between the policy at the beginning of
       training and the current policy.
    3. Additionally, if this KL already changed more than twice the target
       amount, we activate a strong penalty discouraging further divergence.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
        with tf.name_scope('policy_loss'):
            entropy = utility.diag_normal_entropy(mean, logstd)
            kl = tf.reduce_mean(
                self._mask(
                    utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
                    length), 1)
            policy_gradient = tf.exp(
                utility.diag_normal_logpdf(mean, logstd, action) -
                utility.diag_normal_logpdf(old_mean, old_logstd, action))
            surrogate_loss = -tf.reduce_mean(
                self._mask(policy_gradient * tf.stop_gradient(advantage),
                           length), 1)
            kl_penalty = self._penalty * kl
            cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
            cutoff_count = tf.reduce_sum(
                tf.cast(kl > cutoff_threshold, tf.int32))
            with tf.control_dependencies([
                    tf.cond(cutoff_count > 0,
                            lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '),
                            int)
            ]):
                kl_cutoff = (self._config.kl_cutoff_coef *
                             tf.cast(kl > cutoff_threshold, tf.float32) *
                             (kl - cutoff_threshold)**2)
            policy_loss = surrogate_loss + kl_penalty + kl_cutoff
            summary = tf.summary.merge([
                tf.summary.histogram('entropy', entropy),
                tf.summary.histogram('kl', kl),
                tf.summary.histogram('surrogate_loss', surrogate_loss),
                tf.summary.histogram('kl_penalty', kl_penalty),
                tf.summary.histogram('kl_cutoff', kl_cutoff),
                tf.summary.histogram('kl_penalty_combined',
                                     kl_penalty + kl_cutoff),
                tf.summary.histogram('policy_loss', policy_loss),
                tf.summary.scalar('avg_surr_loss',
                                  tf.reduce_mean(surrogate_loss)),
                tf.summary.scalar('avg_kl_penalty',
                                  tf.reduce_mean(kl_penalty)),
                tf.summary.scalar('avg_policy_loss',
                                  tf.reduce_mean(policy_loss))
            ])
            policy_loss = tf.reduce_mean(policy_loss, 0)
            return tf.check_numerics(policy_loss, 'policy_loss'), summary
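
Written out per sequence, the loss is the importance-sampled surrogate plus an adaptive KL penalty plus a quadratic cutoff that only activates once the KL exceeds the cutoff threshold. A NumPy sketch with scalar placeholders (all values and default coefficients below are illustrative, not the project's configuration):

import numpy as np

def policy_loss(ratio, advantage, kl, penalty, kl_target,
                cutoff_factor=2.0, cutoff_coef=1000.0):
    # Importance-sampled policy gradient surrogate (minimized, hence the sign).
    surrogate = -np.mean(ratio * advantage)
    # Soft penalty proportional to the KL from the behavioral policy.
    kl_penalty = penalty * kl
    # Hard quadratic penalty once the KL passes the cutoff threshold.
    threshold = kl_target * cutoff_factor
    kl_cutoff = cutoff_coef * max(kl - threshold, 0.0) ** 2
    return surrogate + kl_penalty + kl_cutoff

print(policy_loss(ratio=np.array([1.1, 0.9]),
                  advantage=np.array([0.5, -0.2]),
                  kl=0.02, penalty=1.0, kl_target=0.01))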