Example #1
0
    def _update_policy_step(self, observ, action, old_mean, old_logstd,
                            advantage, length):
        """Build the policy loss and a single optimizer step on it.

        Args:
          observ: Sequences of observations.
          action: Sequences of actions.
          old_mean: Sequences of action means of the behavioral policy.
          old_logstd: Sequences of action log stddevs of the behavioral policy.
          advantage: Sequences of advantages.
          length: Batch of sequence lengths.

        Returns:
          List of loss tensor and summary tensor. Both are wrapped so that they
          are only produced after the gradient update has been applied.
        """
        policy_net = self._network(observ, length)
        loss, loss_summary = self._policy_loss(
            policy_net.mean, policy_net.logstd, old_mean, old_logstd, action,
            advantage, length)
        grads_and_vars = self._policy_optimizer.compute_gradients(loss)
        # Split the pairs into parallel tuples so the gradients alone can be
        # passed to the global norm below.
        gradients, variables = zip(*grads_and_vars)
        train_op = self._policy_optimizer.apply_gradients(
            zip(gradients, variables))
        norm_summary = tf.summary.scalar(
            'gradient_norm', tf.global_norm(gradients))
        per_variable_summary = utility.gradient_summaries(
            zip(gradients, variables), dict(policy=r'.*'))
        merged = tf.summary.merge(
            [loss_summary, norm_summary, per_variable_summary])
        # Tie the outputs to the update so that fetching them runs the step.
        with tf.control_dependencies([train_op]):
            loss_out = tf.identity(loss)
            summary_out = tf.identity(merged)
        return [loss_out, summary_out]
    def _training(self):
        """Run the update steps on the collected episodes and clear the memory.

        The operations are chained with control dependencies in this order:
        check that the memory index equals `update_every`, read the memory,
        perform the update steps, adjust the penalty, clear the memory and reset
        its index, and finally build the weight summaries.

        Returns:
          Summary tensor merging the update, penalty, and weight summaries.
        """
        with tf.name_scope('training'):
            memory_is_full = tf.assert_equal(
                self._memory_index, self._config.update_every)
            with tf.control_dependencies([memory_is_full]):
                batch = self._memory.data()
            sequences, length = batch
            observ, action, old_mean, old_logstd, reward = sequences
            # Fail at run time if any sequence in the batch is empty.
            length_is_positive = tf.assert_greater(length, 0)
            with tf.control_dependencies([length_is_positive]):
                length = tf.identity(length)
            observ = self._observ_filter.transform(observ)
            reward = self._reward_filter.transform(reward)
            update_summary = self._perform_update_steps(
                observ, action, old_mean, old_logstd, reward, length)
            with tf.control_dependencies([update_summary]):
                penalty_summary = self._adjust_penalty(
                    observ, old_mean, old_logstd, length)
            # Only discard the episodes once the penalty has been adjusted.
            with tf.control_dependencies([penalty_summary]):
                clear_episodes = self._memory.clear()
                reset_index = self._memory_index.assign(0)
                clear_memory = tf.group(clear_episodes, reset_index)
            with tf.control_dependencies([clear_memory]):
                weight_summary = utility.variable_summaries(
                    tf.trainable_variables(), self._config.weight_summaries)
                merged = tf.summary.merge(
                    [update_summary, penalty_summary, weight_summary])
            return merged
Example #3
0
    def perform(self, observ):
        """Compute a batch of actions and a summary for a batch of observations.

        Besides computing the action, the network state, action, mean, and log
        stddev are stored in the `_last_*` variables before the outputs are
        produced.

        Args:
          observ: Tensor of a batch of observations for all algorithms.

        Returns:
          Tuple of action batch tensor and summary tensor.
        """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            # A length-one axis is inserted after the batch axis for the network
            # call and removed again from its outputs with `[:, 0]`.
            network = self._network(
                observ[:, None], tf.ones(observ.shape[0]), self._last_state)
            # Sample while training, otherwise act with the policy mean.
            action = tf.cond(
                self._is_training, network.policy.sample, lambda: network.mean)
            logprob = network.policy.log_prob(action)[:, 0]

            def _histograms():
                """Build histogram summaries of the current policy output."""
                return tf.summary.merge([
                    tf.summary.histogram('mean', network.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ])

            # `str` yields the empty string when nothing should be logged.
            summary = tf.cond(self._should_log, _histograms, str)
            # Remember current policy to append to memory in the experience
            # callback.
            remember = [
                utility.assign_nested_vars(self._last_state, network.state),
                self._last_action.assign(action[:, 0]),
                self._last_mean.assign(network.mean[:, 0]),
                self._last_logstd.assign(network.logstd[:, 0])
            ]
            with tf.control_dependencies(remember):
                checked_action = tf.check_numerics(action[:, 0], 'action')
                summary_out = tf.identity(summary)
            return checked_action, summary_out
  def _define_step(self, done, score, summary):
    """Combine operations of a phase.

    Keeps track of the mean score and when to report it, and advances the
    global step by the number of scores in the batch.

    Args:
      done: Tensor indicating whether current score can be used.
      score: Tensor holding the current, possibly intermediate, score.
      summary: Tensor holding summary string to write if not an empty string.

    Returns:
      Tuple of summary tensor, mean score, new global step, and the number of
      steps made. The mean score is zero for non reporting steps.
    """
    # Promote scalar inputs to batches of one so both cases share the code
    # below.
    done = done[None] if done.shape.ndims == 0 else done
    score = score[None] if score.shape.ndims == 0 else score
    score_mean = streaming_mean.StreamingMean((), tf.float32)
    with tf.control_dependencies([done, score, summary]):
      done_indices = tf.where(done)[:, 0]
      done_score = tf.gather(score, done_indices)
      any_done = tf.reduce_any(done)
      # Only submit to the streaming mean if at least one score is final.
      submit_score = tf.cond(
          any_done, lambda: score_mean.submit(done_score), tf.no_op)
    with tf.control_dependencies([submit_score]):
      # `float` yields 0.0 on steps that do not report.
      mean_score = tf.cond(self._report, score_mean.clear, float)
      steps_made = tf.shape(score)[0]
      next_step = self._step.assign_add(steps_made)
    with tf.control_dependencies([mean_score, next_step]):
      summary_out = tf.identity(summary)
    return summary_out, mean_score, next_step, steps_made
Example #5
0
 def clear(self):
     """Return the mean estimate and reset the streaming statistics.

     The mean is computed before the accumulators are reset, and the returned
     tensor is only produced after both resets ran. No guard is applied for a
     count of zero.

     Returns:
       Tensor holding the sum divided by the count accumulated so far.
     """
     count = tf.cast(self._count, self._dtype)
     mean = self._sum / count
     # Read the statistics before overwriting them.
     with tf.control_dependencies([mean]):
         zeros = tf.zeros_like(self._sum)
         zero_sum = self._sum.assign(zeros)
         zero_count = self._count.assign(0)
     with tf.control_dependencies([zero_sum, zero_count]):
         result = tf.identity(mean)
     return result
Example #6
0
    def _update_value_step(self, observ, reward, length):
        """Build the value loss and a single optimizer step on it.

        Args:
          observ: Sequences of observations.
          reward: Sequences of reward.
          length: Batch of sequence lengths.

        Returns:
          List of loss tensor and summary tensor. Both are wrapped so that they
          are only produced after the gradient update has been applied.
        """
        loss, loss_summary = self._value_loss(observ, reward, length)
        grads_and_vars = self._value_optimizer.compute_gradients(loss)
        # Split the pairs into parallel tuples so the gradients alone can be
        # passed to the global norm below.
        gradients, variables = zip(*grads_and_vars)
        train_op = self._value_optimizer.apply_gradients(
            zip(gradients, variables))
        norm_summary = tf.summary.scalar(
            'gradient_norm', tf.global_norm(gradients))
        per_variable_summary = utility.gradient_summaries(
            zip(gradients, variables), dict(value=r'.*'))
        merged = tf.summary.merge(
            [loss_summary, norm_summary, per_variable_summary])
        # Tie the outputs to the update so that fetching them runs the step.
        with tf.control_dependencies([train_op]):
            loss_out = tf.identity(loss)
            summary_out = tf.identity(merged)
        return [loss_out, summary_out]
Example #7
0
    def reset(self):
        """Reset the environment and the cached observation, reward, and done.

        Returns:
          Tensor of the current observation.
        """
        dtype = self._parse_dtype(self._env.observation_space)
        # The environment's reset is wrapped as a graph operation.
        raw_observ = tf.py_func(self._env.reset, [], dtype, name='reset')
        observ = tf.check_numerics(raw_observ, 'observ')
        store_ops = [
            self._observ.assign(observ),
            self._reward.assign(0),
            self._done.assign(False)
        ]
        # Produce the observation only after the variables were updated.
        with tf.control_dependencies(store_ops):
            result = tf.identity(observ)
        return result
    def perform(self, agent_indices, observ):
        """Compute a batch of actions and a summary for a batch of observations.

        Besides computing the action, the rows of the `_last_*` variables
        selected by `agent_indices` are updated before the outputs are produced.

        Args:
          agent_indices: Tensor containing current batch indices.
          observ: Tensor of a batch of observations for all agents.

        Returns:
          Tuple of action batch tensor and summary tensor.
        """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            # Pass only the stored state entries selected by `agent_indices`;
            # without a stored state the network receives None.
            state = None
            if self._last_state is not None:
                state = tf.contrib.framework.nest.map_structure(
                    lambda x: tf.gather(x, agent_indices), self._last_state)
            # A length-one axis is inserted after the batch axis for the network
            # call and removed again from its outputs with `[:, 0]`.
            output = self._network(
                observ[:, None], tf.ones(observ.shape[0]), state)
            # Sample while training, otherwise act with the policy mean.
            action = tf.cond(
                self._is_training, output.policy.sample, lambda: output.mean)
            logprob = output.policy.log_prob(action)[:, 0]

            def _histograms():
                """Build histogram summaries of the current policy output."""
                return tf.summary.merge([
                    tf.summary.histogram('mean', output.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ])

            # `str` yields the empty string when nothing should be logged.
            summary = tf.cond(self._should_log, _histograms, str)
            # Remember current policy to append to memory in the experience
            # callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            remember = [
                assign_state,
                tf.scatter_update(
                    self._last_action, agent_indices, action[:, 0]),
                tf.scatter_update(
                    self._last_mean, agent_indices, output.mean[:, 0]),
                tf.scatter_update(
                    self._last_logstd, agent_indices, output.logstd[:, 0])
            ]
            with tf.control_dependencies(remember):
                checked_action = tf.check_numerics(action[:, 0], 'action')
                summary_out = tf.identity(summary)
            return checked_action, summary_out
    def _update_step(self, observ, action, old_mean, old_logstd, reward,
                     advantage, length):
        """Build the value and policy losses and one joint optimizer step.

        Gradients are computed separately for each loss with the shared
        optimizer and then applied together in a single operation.

        Args:
          observ: Sequences of observations.
          action: Sequences of actions.
          old_mean: Sequences of action means of the behavioral policy.
          old_logstd: Sequences of action log stddevs of the behavioral policy.
          reward: Sequences of reward.
          advantage: Sequences of advantages.
          length: Batch of sequence lengths.

        Returns:
          List of value loss, policy loss, and summary tensor. All three are
          wrapped so that they are only produced after the update was applied.
        """
        value_loss, value_summary = self._value_loss(observ, reward, length)
        network = self._network(observ, length)
        policy_loss, policy_summary = self._policy_loss(
            network.mean, network.logstd, old_mean, old_logstd, action,
            advantage, length)
        value_pairs = self._optimizer.compute_gradients(value_loss)
        value_gradients, value_variables = zip(*value_pairs)
        policy_pairs = self._optimizer.compute_gradients(policy_loss)
        policy_gradients, policy_variables = zip(*policy_pairs)
        # Concatenate the tuples so one apply operation covers both losses.
        train_op = self._optimizer.apply_gradients(zip(
            value_gradients + policy_gradients,
            value_variables + policy_variables))
        summaries = [
            value_summary,
            policy_summary,
            tf.summary.scalar(
                'value_gradient_norm', tf.global_norm(value_gradients)),
            tf.summary.scalar(
                'policy_gradient_norm', tf.global_norm(policy_gradients)),
            utility.gradient_summaries(
                zip(value_gradients, value_variables), dict(value=r'.*')),
            utility.gradient_summaries(
                zip(policy_gradients, policy_variables), dict(policy=r'.*'))
        ]
        merged = tf.summary.merge(summaries)
        # Tie the outputs to the update so that fetching them runs the step.
        with tf.control_dependencies([train_op]):
            value_out = tf.identity(value_loss)
            policy_out = tf.identity(policy_loss)
            summary_out = tf.identity(merged)
        return [value_out, policy_out, summary_out]
Example #10
0
    def reset(self, indices=None):
        """Reset the selected environments of the batch.

        The cached observation, reward, and done variables are updated at the
        given indices; rewards are set to zero and done flags to False.

        Args:
          indices: The batch indices of the environments to reset; defaults to
            all.

        Returns:
          Batch tensor of the new observations.
        """
        if indices is None:
            indices = tf.range(len(self._batch_env))
        dtype = self._parse_dtype(self._batch_env.observation_space)
        # The batch environment's reset is wrapped as a graph operation.
        raw_observ = tf.py_func(
            self._batch_env.reset, [indices], dtype, name='reset')
        observ = tf.check_numerics(raw_observ, 'observ')
        zero_reward = tf.zeros_like(indices, tf.float32)
        not_done = tf.zeros_like(indices, tf.bool)
        store_ops = [
            tf.scatter_update(self._observ, indices, observ),
            tf.scatter_update(self._reward, indices, zero_reward),
            tf.scatter_update(self._done, indices, not_done)
        ]
        # Produce the observations only after the variables were updated.
        with tf.control_dependencies(store_ops):
            result = tf.identity(observ)
        return result
Example #11
0
def simulate(batch_env, algo, log=True, reset=False):
    """Simulation step of a vectorized algorithm with in-graph environments.

    Integrates the operations implemented by the algorithm and the environments
    into a combined operation. The parts are chained with control dependencies
    in this order: begin episodes, step, end episodes, summaries.

    Args:
      batch_env: In-graph batch environment.
      algo: Algorithm instance implementing required operations.
      log: Tensor indicating whether to compute and return summaries. Converted
        with `tf.convert_to_tensor`, so a Python bool is accepted as well.
      reset: Tensor causing all environments to reset. Converted with
        `tf.convert_to_tensor`, so a Python bool is accepted as well.

    Returns:
      Tuple of tensors containing done flags for the current episodes, possibly
      intermediate scores for the episodes, and a summary tensor.
    """
    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

        Uses the `score` and `length` variables of the enclosing function.

        Args:
          agent_indices: Tensor containing batch indices starting an episode.

        Returns:
          Summary tensor.
        """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        reset_ops = [
            batch_env.reset(agent_indices),
            tf.scatter_update(score, agent_indices, zero_scores),
            tf.scatter_update(length, agent_indices, zero_durations)
        ]
        # Notify the algorithm only after environments and counters were reset.
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices)

    def _define_step():
        """Request actions from the algorithm and apply them to the environments.

        Increments the lengths of all episodes and increases their scores by the
        current reward. After stepping the environments, provides the full
        transition tuple to the algorithm.

        Returns:
          Summary tensor.
        """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        action, step_summary = algo.perform(prevob)
        action.set_shape(batch_env.action.shape)
        # Update the counters only after the environments have been stepped.
        with tf.control_dependencies([batch_env.simulate(action)]):
            add_score = score.assign_add(batch_env.reward)
            inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
        # Hand the transition to the algorithm after the counters were updated.
        with tf.control_dependencies([add_score, inc_length]):
            experience_summary = algo.experience(prevob, batch_env.action,
                                                 batch_env.reward,
                                                 batch_env.done,
                                                 batch_env.observ)
        return tf.summary.merge([step_summary, experience_summary])

    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

        Also updates the mean score and length counters used for summaries.

        Args:
          agent_indices: Tensor holding batch indices that end their episodes.

        Returns:
          Summary tensor.
        """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        # Lengths are stored as int32 but the streaming mean uses float32.
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)

    def _define_summaries():
        """Reset the average score and duration, and return them as summary.

        A statistic is only cleared and reported if logging is requested and
        its count is non-zero; otherwise the empty string from `str` is used.

        Returns:
          Summary string.
        """
        score_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
            lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
        length_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
            lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
        return tf.summary.merge([score_summary, length_summary])

    with tf.name_scope('simulate'):
        log = tf.convert_to_tensor(log)
        reset = tf.convert_to_tensor(reset)
        # Per-environment counters of the running episodes. The second
        # positional argument of tf.Variable is `trainable`, so these are
        # created as non-trainable variables.
        with tf.variable_scope('simulate_temporary'):
            score = tf.Variable(tf.zeros(len(batch_env), dtype=tf.float32),
                                False,
                                name='score')
            length = tf.Variable(tf.zeros(len(batch_env), dtype=tf.int32),
                                 False,
                                 name='length')
        mean_score = streaming_mean.StreamingMean((), tf.float32)
        mean_length = streaming_mean.StreamingMean((), tf.float32)
        # Begin episodes for all environments on reset, otherwise only for
        # those whose done flag is set.
        agent_indices = tf.cond(
            reset, lambda: tf.range(len(batch_env)),
            lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
        # Skip the begin-episode operations if no index was selected; `str`
        # provides the empty summary string for that branch.
        begin_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                lambda: _define_begin_episode(agent_indices),
                                str)
        with tf.control_dependencies([begin_episode]):
            step = _define_step()
        with tf.control_dependencies([step]):
            # Re-read the done flags after the step to find ending episodes.
            agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
            end_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                  lambda: _define_end_episode(agent_indices),
                                  str)
        with tf.control_dependencies([end_episode]):
            summary = tf.summary.merge(
                [_define_summaries(), begin_episode, step, end_episode])
        # Read done flags and scores only after the whole step has completed.
        with tf.control_dependencies([summary]):
            done, score = tf.identity(batch_env.done), tf.identity(score)
        return done, score, summary