Example 1
    def perform(self, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            network = self._network(observ[:, None], tf.ones(observ.shape[0]),
                                    self._last_state)
            action = tf.cond(self._is_training, network.policy.sample,
                             lambda: network.mean)
            logprob = network.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', network.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            with tf.control_dependencies([
                    utility.assign_nested_vars(self._last_state,
                                               network.state),
                    self._last_action.assign(action[:, 0]),
                    self._last_mean.assign(network.mean[:, 0]),
                    self._last_logstd.assign(network.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
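A note on the `tf.cond(pred, summary_fn, str)` idiom that recurs throughout
these snippets: `str` is used as the false branch because calling `str()`
returns the empty string, which `tf.cond` converts into a string tensor
matching the serialized summary returned by the true branch. A minimal
standalone sketch of the idiom, assuming TensorFlow 1.x graph mode:

import tensorflow as tf

should_log = tf.placeholder(tf.bool, [])
value = tf.random_normal([32])
# When should_log is False, str() yields '', so downstream summary
# writers receive an empty string and simply write nothing.
summary = tf.cond(should_log,
                  lambda: tf.summary.histogram('value', value),
                  str)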
  def _define_step(self, done, score, summary):
    """Combine operations of a phase.

    Keeps track of the mean score and when to report it.

    Args:
      done: Tensor indicating whether the current score can be used.
      score: Tensor holding the current, possibly intermediate, score.
      summary: Tensor holding a summary string to write if it is not empty.

    Returns:
      Tuple of summary tensor, mean score, new global step, and the number of
      steps made. The mean score is zero for non-reporting steps.
    """
    if done.shape.ndims == 0:
      done = done[None]
    if score.shape.ndims == 0:
      score = score[None]
    score_mean = streaming_mean.StreamingMean((), tf.float32)
    with tf.control_dependencies([done, score, summary]):
      done_score = tf.gather(score, tf.where(done)[:, 0])
      submit_score = tf.cond(
          tf.reduce_any(done),
          lambda: score_mean.submit(done_score), tf.no_op)
    with tf.control_dependencies([submit_score]):
      mean_score = tf.cond(self._report, score_mean.clear, float)
      steps_made = tf.shape(score)[0]
      next_step = self._step.assign_add(steps_made)
    with tf.control_dependencies([mean_score, next_step]):
      return tf.identity(summary), mean_score, next_step, steps_made
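To make the done-score gather above concrete, here is a NumPy equivalent of
`tf.gather(score, tf.where(done)[:, 0])` with made-up values:

import numpy as np

done = np.array([True, False, True, False])
score = np.array([10.0, 3.0, 7.0, 1.0])
# tf.where(done)[:, 0] yields the indices of finished episodes; the
# gather selects only the scores eligible for submission.
done_score = score[np.where(done)[0]]  # array([10., 7.])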
Example 3
    def _define_summaries():
        """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
        score_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
            lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
        length_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
            lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
        return tf.summary.merge([score_summary, length_summary])
Example 4
    def summary(self):
        """Summary string of mean and standard deviation.

    Returns:
      Summary tensor.
    """
        with tf.name_scope(self._name + '/summary'):
            mean_summary = tf.cond(self._count > 0,
                                   lambda: self._summary('mean', self._mean),
                                   str)
            std_summary = tf.cond(self._count > 1,
                                  lambda: self._summary('stddev', self._std()),
                                  str)
            return tf.summary.merge([mean_summary, std_summary])
Example 5
    def update(self, value):
        """Update the mean and variance estimates.

    Args:
      value: Batch or single value tensor.

    Returns:
      Summary tensor.
    """
        with tf.name_scope(self._name + '/update'):
            if value.shape.ndims == self._mean.shape.ndims:
                # Add a batch dimension if necessary.
                value = value[None, ...]
            count = tf.shape(value)[0]
            with tf.control_dependencies([self._count.assign_add(count)]):
                step = tf.cast(self._count, tf.float32)
                mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
                new_mean = self._mean + mean_delta / step
                new_mean = tf.cond(self._count > 1, lambda: new_mean,
                                   lambda: value[0])
                var_delta = (value - self._mean[None, ...]) * (
                    value - new_mean[None, ...])
                new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
            with tf.control_dependencies([new_mean, new_var_sum]):
                update = (self._mean.assign(new_mean),
                          self._var_sum.assign(new_var_sum))
            with tf.control_dependencies(update):
                if value.shape.ndims == 1:
                    value = tf.reduce_mean(value)
                return self._summary('value', tf.reduce_mean(value))
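The assignments above are the batched form of Welford's online algorithm for
streaming moments. In the code's notation, when a batch x_1, ..., x_k arrives
after n previous samples, with S denoting `_var_sum`:

    \mu' = \mu + \frac{1}{n + k} \sum_{i=1}^{k} (x_i - \mu), \qquad
    S' = S + \sum_{i=1}^{k} (x_i - \mu)(x_i - \mu')

The unbiased variance estimate later read out by `_std` is S' / (n + k - 1).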
Example 6
    def transform(self, value):
        """Normalize a single or batch tensor.

    Applies the transformations activated in the constructor, using the
    current estimates of mean and variance.

    Args:
      value: Batch or single value tensor.

    Returns:
      Normalized batch or single value tensor.
    """
        with tf.name_scope(self._name + '/transform'):
            no_batch_dim = value.shape.ndims == self._mean.shape.ndims
            if no_batch_dim:
                # Add a batch dimension if necessary.
                value = value[None, ...]
            if self._center:
                value -= self._mean[None, ...]
            if self._scale:
                # We cannot scale before seeing at least two samples.
                value /= tf.cond(self._count > 1, lambda: self._std() + 1e-8,
                                 lambda: tf.ones_like(self._var_sum))[None]
            if self._clip:
                value = tf.clip_by_value(value, -self._clip, self._clip)
            # Remove batch dimension if necessary.
            if no_batch_dim:
                value = value[0]
            return tf.check_numerics(value, 'value')
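For reference, a hypothetical NumPy equivalent of `transform` with center,
scale, and clip all enabled (the function name `normalize` and the clip
value are made up for illustration):

import numpy as np

def normalize(value, mean, std, clip=5.0):
    # Center by the running mean, scale by the running stddev
    # (the epsilon guards against division by zero), then clip outliers.
    value = (value - mean) / (std + 1e-8)
    return np.clip(value, -clip, clip)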
    def _define_experience(self, agent_indices, observ, action, reward):
        """Implement the branch of experience() entered during training."""
        update_filters = tf.summary.merge([
            self._observ_filter.update(observ),
            self._reward_filter.update(reward)
        ])
        with tf.control_dependencies([update_filters]):
            if self._config.train_on_agent_action:
                # NOTE: Doesn't seem to change much.
                action = self._last_action
            batch = (observ, action, tf.gather(self._last_mean, agent_indices),
                     tf.gather(self._last_logstd, agent_indices), reward)
            append = self._episodes.append(batch, agent_indices)
        with tf.control_dependencies([append]):
            norm_observ = self._observ_filter.transform(observ)
            norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    update_filters,
                    self._observ_filter.summary(),
                    self._reward_filter.summary(),
                    tf.summary.scalar('memory_size', self._memory_index),
                    tf.summary.histogram('normalized_observ', norm_observ),
                    tf.summary.histogram('action', self._last_action),
                    tf.summary.scalar('normalized_reward', norm_reward)
                ]), str)
            return summary
    def experience(self, agent_indices, observ, action, reward, unused_done,
                   unused_nextob):
        """Process the transition tuple of the current step.

    When training, add the current transition tuple to the memory and update
    the streaming statistics for observations and rewards. A summary string is
    returned if requested at this step.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Batch tensor of observations.
      action: Batch tensor of actions.
      reward: Batch tensor of rewards.
      unused_done: Batch tensor of done flags.
      unused_nextob: Batch tensor of successor observations.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('experience/'):
            return tf.cond(
                self._is_training,
                # pylint: disable=g-long-lambda
                lambda: self._define_experience(agent_indices, observ, action,
                                                reward),
                str)
    def _adjust_penalty(self, observ, old_mean, old_logstd, length):
        """Adjust the KL policy between the behavioral and current policy.

    Compute how much the policy actually changed during the multiple
    update steps. Adjust the penalty strength for the next training phase if we
    overshot or undershot the target divergence too much.

    Args:
      observ: Sequences of observations.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('adjust_penalty'):
            network = self._network(observ, length)
            assert_change = tf.assert_equal(
                tf.reduce_all(tf.equal(network.mean, old_mean)), False,
                message='policy should change')
            print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
            with tf.control_dependencies([assert_change, print_penalty]):
                kl_change = tf.reduce_mean(
                    self._mask(
                        utility.diag_normal_kl(old_mean, old_logstd,
                                               network.mean, network.logstd),
                        length))
                kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
                maybe_increase = tf.cond(
                    kl_change > 1.3 * self._config.kl_target,
                    # pylint: disable=g-long-lambda
                    lambda: tf.Print(self._penalty.assign(self._penalty * 1.5),
                                     [0], 'increase penalty '),
                    float)
                maybe_decrease = tf.cond(
                    kl_change < 0.7 * self._config.kl_target,
                    # pylint: disable=g-long-lambda
                    lambda: tf.Print(self._penalty.assign(self._penalty / 1.5),
                                     [0], 'decrease penalty '),
                    float)
            with tf.control_dependencies([maybe_increase, maybe_decrease]):
                return tf.summary.merge([
                    tf.summary.scalar('kl_change', kl_change),
                    tf.summary.scalar('penalty', self._penalty)
                ])
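Stripped of the graph plumbing, the adaptation rule implemented above is a
simple multiplicative controller (plain-Python sketch using the same
constants):

def adjust_penalty(penalty, kl_change, kl_target):
    # Overshooting the target divergence by more than 30% strengthens
    # the penalty; undershooting by more than 30% weakens it.
    if kl_change > 1.3 * kl_target:
        penalty *= 1.5
    elif kl_change < 0.7 * kl_target:
        penalty /= 1.5
    return penalty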
    def perform(self, agent_indices, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            if self._last_state is None:
                state = None
            else:
                state = tf.contrib.framework.nest.map_structure(
                    lambda x: tf.gather(x, agent_indices), self._last_state)
            output = self._network(observ[:, None], tf.ones(observ.shape[0]),
                                   state)
            action = tf.cond(self._is_training, output.policy.sample,
                             lambda: output.mean)
            logprob = output.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', output.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            with tf.control_dependencies([
                    assign_state,
                    tf.scatter_update(self._last_action, agent_indices,
                                      action[:, 0]),
                    tf.scatter_update(self._last_mean, agent_indices,
                                      output.mean[:, 0]),
                    tf.scatter_update(self._last_logstd, agent_indices,
                                      output.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
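The `tf.scatter_update` calls above write the new policy outputs back only
at the rows of the agents that acted this step; in NumPy terms, with made-up
values:

import numpy as np

last_action = np.zeros(4)
agent_indices = np.array([0, 2])
actions = np.array([0.5, -1.0])
# Equivalent of tf.scatter_update(last_action, agent_indices, actions):
last_action[agent_indices] = actions  # -> [0.5, 0., -1., 0.]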
    def _define_end_episode(self, agent_indices):
        """Implement the branch of end_episode() entered during training."""
        episodes, length = self._episodes.data(agent_indices)
        space_left = self._config.update_every - self._memory_index
        use_episodes = tf.range(
            tf.minimum(tf.shape(agent_indices)[0], space_left))
        episodes = [tf.gather(elem, use_episodes) for elem in episodes]
        append = self._memory.replace(episodes,
                                      tf.gather(length, use_episodes),
                                      use_episodes + self._memory_index)
        with tf.control_dependencies([append]):
            inc_index = self._memory_index.assign_add(
                tf.shape(use_episodes)[0])
        with tf.control_dependencies([inc_index]):
            memory_full = self._memory_index >= self._config.update_every
            return tf.cond(memory_full, self._training, str)
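The bookkeeping above stores only as many finished episodes as still fit
into the memory before training triggers; a plain-Python sketch of the
counting logic (the function name is hypothetical):

def num_episodes_to_store(num_finished, memory_index, update_every):
    # Episodes beyond the remaining capacity are dropped for this phase;
    # training runs once memory_index reaches update_every.
    space_left = update_every - memory_index
    return min(num_finished, space_left)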
Example 12
    def _std(self):
        """Computes the current estimate of the standard deviation.

    Note that the standard deviation is not defined until at least two samples
    have been seen.

    Returns:
      Tensor of the current standard deviation estimate.
    """
        variance = tf.cond(
            self._count > 1,
            lambda: self._var_sum / tf.cast(self._count - 1, tf.float32),
            lambda: tf.ones_like(self._var_sum) * float('nan'))
        # The epsilon corrects for small negative variance values caused by
        # the algorithm. It was empirically chosen to work with all environments
        # tested.
        return tf.sqrt(variance + 1e-4)
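In equation form, with S the accumulated `_var_sum` and n the sample count,
the returned estimate is

    \hat{\sigma} = \sqrt{\frac{S}{n - 1} + 10^{-4}} \quad (n > 1)

and NaN-valued otherwise, since no spread can be estimated from fewer than
two samples.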
    def end_episode(self, agent_indices):
        """Add episodes to the memory and perform update steps if memory is full.

    During training, add the collected episodes of the batch indices that
    finished their episode to the memory. If the memory is full, train on it,
    and then clear the memory. A summary string is returned if requested at
    this step.

    Args:
      agent_indices: Tensor containing current batch indices.

    Returns:
       Summary tensor.
    """
        with tf.name_scope('end_episode/'):
            return tf.cond(self._is_training,
                           lambda: self._define_end_episode(agent_indices),
                           str)
Example 14
def simulate(batch_env, algo, log=True, reset=False):
    """Simulation step of a vecrotized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the environments
  into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """
    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        reset_ops = [
            batch_env.reset(agent_indices),
            tf.scatter_update(score, agent_indices, zero_scores),
            tf.scatter_update(length, agent_indices, zero_durations)
        ]
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices)

    def _define_step():
        """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        action, step_summary = algo.perform(prevob)
        action.set_shape(batch_env.action.shape)
        with tf.control_dependencies([batch_env.simulate(action)]):
            add_score = score.assign_add(batch_env.reward)
            inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
        with tf.control_dependencies([add_score, inc_length]):
            experience_summary = algo.experience(prevob, batch_env.action,
                                                 batch_env.reward,
                                                 batch_env.done,
                                                 batch_env.observ)
        return tf.summary.merge([step_summary, experience_summary])

    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)

    def _define_summaries():
        """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
        score_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
            lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
        length_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
            lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
        return tf.summary.merge([score_summary, length_summary])

    with tf.name_scope('simulate'):
        log = tf.convert_to_tensor(log)
        reset = tf.convert_to_tensor(reset)
        with tf.variable_scope('simulate_temporary'):
            score = tf.Variable(tf.zeros(len(batch_env), dtype=tf.float32),
                                False,
                                name='score')
            length = tf.Variable(tf.zeros(len(batch_env), dtype=tf.int32),
                                 False,
                                 name='length')
        mean_score = streaming_mean.StreamingMean((), tf.float32)
        mean_length = streaming_mean.StreamingMean((), tf.float32)
        agent_indices = tf.cond(
            reset, lambda: tf.range(len(batch_env)),
            lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
        begin_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                lambda: _define_begin_episode(agent_indices),
                                str)
        with tf.control_dependencies([begin_episode]):
            step = _define_step()
        with tf.control_dependencies([step]):
            agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
            end_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                  lambda: _define_end_episode(agent_indices),
                                  str)
        with tf.control_dependencies([end_episode]):
            summary = tf.summary.merge(
                [_define_summaries(), begin_episode, step, end_episode])
        with tf.control_dependencies([summary]):
            done, score = tf.identity(batch_env.done), tf.identity(score)
        return done, score, summary
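A minimal driver sketch for `simulate`; the construction of `batch_env` and
`algo` is assumed to happen elsewhere, and this is not the repository's
actual training loop:

import tensorflow as tf

done, score, summary = simulate(batch_env, algo, log=True, reset=False)
writer = tf.summary.FileWriter('/tmp/logdir')
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1000):
        done_value, score_value, summary_value = sess.run(
            [done, score, summary])
        if summary_value:  # An empty string means nothing was logged.
            writer.add_summary(summary_value, step)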
    def _policy_loss(self, mean, logstd, old_mean, old_logstd, action,
                     advantage, length):
        """Compute the policy loss composed of multiple components.

    1. The policy gradient loss is importance sampled from the data-collecting
       policy at the beginning of training.
    2. The second term is a KL penalty between the policy at the beginning of
       training and the current policy.
    3. Additionally, if this KL already exceeds the configured cutoff factor
       times the target amount, we activate a strong penalty discouraging
       further divergence.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
        with tf.name_scope('policy_loss'):
            entropy = utility.diag_normal_entropy(mean, logstd)
            kl = tf.reduce_mean(
                self._mask(
                    utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
                    length), 1)
            policy_gradient = tf.exp(
                utility.diag_normal_logpdf(mean, logstd, action) -
                utility.diag_normal_logpdf(old_mean, old_logstd, action))
            surrogate_loss = -tf.reduce_mean(
                self._mask(policy_gradient * tf.stop_gradient(advantage),
                           length), 1)
            kl_penalty = self._penalty * kl
            cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
            cutoff_count = tf.reduce_sum(
                tf.cast(kl > cutoff_threshold, tf.int32))
            with tf.control_dependencies([
                    tf.cond(cutoff_count > 0,
                            lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '),
                            int)
            ]):
                kl_cutoff = (self._config.kl_cutoff_coef *
                             tf.cast(kl > cutoff_threshold, tf.float32) *
                             (kl - cutoff_threshold)**2)
            policy_loss = surrogate_loss + kl_penalty + kl_cutoff
            summary = tf.summary.merge([
                tf.summary.histogram('entropy', entropy),
                tf.summary.histogram('kl', kl),
                tf.summary.histogram('surrogate_loss', surrogate_loss),
                tf.summary.histogram('kl_penalty', kl_penalty),
                tf.summary.histogram('kl_cutoff', kl_cutoff),
                tf.summary.histogram('kl_penalty_combined',
                                     kl_penalty + kl_cutoff),
                tf.summary.histogram('policy_loss', policy_loss),
                tf.summary.scalar('avg_surr_loss',
                                  tf.reduce_mean(surrogate_loss)),
                tf.summary.scalar('avg_kl_penalty',
                                  tf.reduce_mean(kl_penalty)),
                tf.summary.scalar('avg_policy_loss',
                                  tf.reduce_mean(policy_loss))
            ])
            policy_loss = tf.reduce_mean(policy_loss, 0)
            return tf.check_numerics(policy_loss, 'policy_loss'), summary
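In equation form, the per-sequence loss assembled above, before the final
mean over the batch, is

    L(\theta) = -\mathbb{E}\Big[\frac{\pi_\theta(a \mid s)}{\pi_{\text{old}}(a \mid s)}\, A\Big]
        + \beta \, \mathrm{KL}\big[\pi_{\text{old}} \,\|\, \pi_\theta\big]
        + c \, \mathbb{1}[\mathrm{KL} > d] \, (\mathrm{KL} - d)^2

where beta is the adaptive `_penalty`, c is `kl_cutoff_coef`, and
d = `kl_cutoff_factor` * `kl_target`.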