def _define_experience(self, agent_indices, observ, action, reward):
    """Implement the branch of experience() entered during training."""
    update_filters = tf.summary.merge([
        self._observ_filter.update(observ),
        self._reward_filter.update(reward)
    ])
    with tf.control_dependencies([update_filters]):
        if self._config.train_on_agent_action:
            # NOTE: Doesn't seem to change much.
            action = self._last_action
        batch = (observ, action, tf.gather(self._last_mean, agent_indices),
                 tf.gather(self._last_logstd, agent_indices), reward)
        append = self._episodes.append(batch, agent_indices)
    with tf.control_dependencies([append]):
        norm_observ = self._observ_filter.transform(observ)
        norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
        # pylint: disable=g-long-lambda
        summary = tf.cond(
            self._should_log, lambda: tf.summary.merge([
                update_filters,
                self._observ_filter.summary(),
                self._reward_filter.summary(),
                tf.summary.scalar('memory_size', self._memory_index),
                tf.summary.histogram('normalized_observ', norm_observ),
                tf.summary.histogram('action', self._last_action),
                tf.summary.scalar('normalized_reward', norm_reward)
            ]), str)
        return summary
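
A minimal sketch of the control-dependency gating used in _define_experience above: the filter statistics must be updated before the transition is appended, and the append must complete before the summary is evaluated. TF 1.x graph mode is assumed; the variables and shapes below are illustrative stand-ins, not part of the original class.

import tensorflow as tf

# Illustrative stand-ins for the filter statistics and the episode buffer.
observ = tf.placeholder(tf.float32, [8, 4])
running_mean = tf.Variable(tf.zeros([4]), trainable=False)
buffer_ = tf.Variable(tf.zeros([8, 4]), trainable=False)

# Update the normalization statistics first.
update_filter = running_mean.assign(
    0.99 * running_mean + 0.01 * tf.reduce_mean(observ, 0))
with tf.control_dependencies([update_filter]):
    # The append only runs after the statistics update has executed.
    append = tf.scatter_update(buffer_, [0], observ[:1])
with tf.control_dependencies([append]):
    # Summaries are only built once the transition is safely stored.
    summary = tf.summary.scalar('running_mean', tf.reduce_mean(running_mean))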
    def append(self, transitions, rows=None):
        """Append a batch of transitions to rows of the memory.

    Args:
      transitions: Tuple of transition quantities with batch dimension.
      rows: Episodes to append to, defaults to all.

    Returns:
      Operation.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        assert_capacity = tf.assert_less(rows,
                                         self._capacity,
                                         message='capacity exceeded')
        with tf.control_dependencies([assert_capacity]):
            assert_max_length = tf.assert_less(tf.gather(self._length, rows),
                                               self._max_length,
                                               message='max length exceeded')
        append_ops = []
        with tf.control_dependencies([assert_max_length]):
            for buffer_, elements in zip(self._buffers, transitions):
                timestep = tf.gather(self._length, rows)
                indices = tf.stack([rows, timestep], 1)
                append_ops.append(
                    tf.scatter_nd_update(buffer_, indices, elements))
        with tf.control_dependencies(append_ops):
            episode_mask = tf.reduce_sum(
                tf.one_hot(rows, self._capacity, dtype=tf.int32), 0)
            return self._length.assign_add(episode_mask)
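
The write pattern inside append() can be exercised outside the class as well: each transition is written at position (row, current_length[row]) of a [capacity, max_length, ...] buffer, and the length counters of the touched rows are incremented via a one-hot mask. The sketch below assumes TF 1.x graph mode; capacity, max_length, and the observation size are illustrative values, not taken from the original configuration.

import tensorflow as tf

capacity, max_length, observ_size = 3, 5, 2
# Stand-ins for one of the memory's buffers and its per-row length counter.
buffer_ = tf.Variable(tf.zeros([capacity, max_length, observ_size]),
                      trainable=False)
length = tf.Variable(tf.zeros([capacity], tf.int32), trainable=False)

rows = tf.constant([0, 2])                 # episodes to extend
elements = tf.ones([2, observ_size])       # one new transition per row
timestep = tf.gather(length, rows)         # next free slot in each episode
indices = tf.stack([rows, timestep], 1)    # (row, step) write positions
write = tf.scatter_nd_update(buffer_, indices, elements)
with tf.control_dependencies([write]):
    episode_mask = tf.reduce_sum(tf.one_hot(rows, capacity, dtype=tf.int32), 0)
    append = length.assign_add(episode_mask)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(append)                        # lengths become [1, 0, 1]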
def _define_end_episode(self, agent_indices):
    """Implement the branch of end_episode() entered during training."""
    episodes, length = self._episodes.data(agent_indices)
    space_left = self._config.update_every - self._memory_index
    use_episodes = tf.range(
        tf.minimum(tf.shape(agent_indices)[0], space_left))
    episodes = [tf.gather(elem, use_episodes) for elem in episodes]
    append = self._memory.replace(episodes,
                                  tf.gather(length, use_episodes),
                                  use_episodes + self._memory_index)
    with tf.control_dependencies([append]):
        inc_index = self._memory_index.assign_add(
            tf.shape(use_episodes)[0])
    with tf.control_dependencies([inc_index]):
        memory_full = self._memory_index >= self._config.update_every
        return tf.cond(memory_full, self._training, str)
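
The end-of-episode branch above fills a fixed-size update memory and only triggers training once it holds update_every episodes. Below is a self-contained sketch of that trigger-on-full pattern, with an illustrative update_every and a summary string standing in for the real self._training() call; TF 1.x graph mode is assumed.

import tensorflow as tf

update_every = 4                           # illustrative batch size
memory_index = tf.Variable(0, dtype=tf.int32, trainable=False)
num_finished = tf.placeholder(tf.int32, [])

inc_index = memory_index.assign_add(num_finished)
with tf.control_dependencies([inc_index]):
    memory_full = memory_index >= update_every
    # The real code runs self._training() here; a constant string stands in.
    maybe_train = tf.cond(memory_full, lambda: tf.constant('train'), str)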
  def _define_step(self, done, score, summary):
    """Combine operations of a phase.

    Keeps track of the mean score and when to report it.

    Args:
      done: Tensor indicating whether current score can be used.
      score: Tensor holding the current, possibly intermediate, score.
      summary: Tensor holding summary string to write if not an empty string.

    Returns:
      Tuple of summary tensor, mean score, new global step, and number of steps
      made. The mean score is zero for non-reporting steps.
    """
    if done.shape.ndims == 0:
      done = done[None]
    if score.shape.ndims == 0:
      score = score[None]
    score_mean = streaming_mean.StreamingMean((), tf.float32)
    with tf.control_dependencies([done, score, summary]):
      done_score = tf.gather(score, tf.where(done)[:, 0])
      submit_score = tf.cond(
          tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
    with tf.control_dependencies([submit_score]):
      mean_score = tf.cond(self._report, score_mean.clear, float)
      steps_made = tf.shape(score)[0]
      next_step = self._step.assign_add(steps_made)
    with tf.control_dependencies([mean_score, next_step]):
      return tf.identity(summary), mean_score, next_step, steps_made
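
The score handling in _define_step can be read as: fold the scores of finished episodes into a running mean, and only read that mean out on reporting steps (zero otherwise). The sketch below replaces the project's StreamingMean helper with two plain variables to keep it self-contained; the real helper additionally resets its accumulators when cleared. TF 1.x graph mode is assumed.

import tensorflow as tf

score = tf.placeholder(tf.float32, [None])   # per-agent scores this step
done = tf.placeholder(tf.bool, [None])       # which agents finished an episode
report = tf.placeholder(tf.bool, [])         # whether this is a reporting step

# Two-variable stand-in for streaming_mean.StreamingMean.
total = tf.Variable(0.0, trainable=False)
count = tf.Variable(0.0, trainable=False)

done_score = tf.gather(score, tf.where(done)[:, 0])

def submit():
    # Accumulate finished scores; only runs when at least one agent is done.
    with tf.control_dependencies([
            total.assign_add(tf.reduce_sum(done_score)),
            count.assign_add(tf.cast(tf.shape(done_score)[0], tf.float32))]):
        return tf.constant(True)

submit_score = tf.cond(tf.reduce_any(done), submit, lambda: tf.constant(False))
with tf.control_dependencies([submit_score]):
    # Zero on non-reporting steps, mirroring the `float` default branch above.
    mean_score = tf.cond(report, lambda: total / tf.maximum(count, 1.0), float)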
    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)
    def data(self, rows=None):
        """Access a batch of episodes from the memory.

    Padding elements after the length of each episode are unspecified and might
    contain old data.

    Args:
      rows: Episodes to select, defaults to all.

    Returns:
      Tuple containing a tuple of transition quantiries with batch and time
      dimensions, and a batch of sequence lengths.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers]
        length = tf.gather(self._length, rows)
        return episode, length
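
Because data() leaves everything past each episode's length unspecified, callers typically mask the padded timesteps before using the batch. Below is a small sketch of that read-and-mask step, with illustrative buffers shaped like the ones append() writes to; TF 1.x graph mode is assumed.

import tensorflow as tf

capacity, max_length = 3, 5
# Illustrative stand-ins for one transition buffer and the length counter.
observ_buffer = tf.Variable(tf.zeros([capacity, max_length, 2]),
                            trainable=False)
length = tf.Variable([2, 0, 4], dtype=tf.int32, trainable=False)

rows = tf.constant([0, 2])
episode = tf.gather(observ_buffer, rows)   # [2, max_length, 2], padding included
episode_length = tf.gather(length, rows)   # [2]

# Zero out timesteps beyond each episode's length before further processing.
mask = tf.sequence_mask(episode_length, max_length, dtype=tf.float32)
valid = episode * mask[:, :, None]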
    def length(self, rows=None):
        """Tensor holding the current length of episodes.

    Args:
      rows: Episodes to select length from, defaults to all.

    Returns:
      Batch tensor of sequence lengths.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        return tf.gather(self._length, rows)
    def perform(self, agent_indices, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            if self._last_state is None:
                state = None
            else:
                state = tf.contrib.framework.nest.map_structure(
                    lambda x: tf.gather(x, agent_indices), self._last_state)
            output = self._network(observ[:, None], tf.ones(observ.shape[0]),
                                   state)
            action = tf.cond(self._is_training, output.policy.sample,
                             lambda: output.mean)
            logprob = output.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', output.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            with tf.control_dependencies([
                    assign_state,
                    tf.scatter_update(self._last_action, agent_indices,
                                      action[:, 0]),
                    tf.scatter_update(self._last_mean, agent_indices,
                                      output.mean[:, 0]),
                    tf.scatter_update(self._last_logstd, agent_indices,
                                      output.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
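
At its core, perform() samples from the policy during training and falls back to the distribution mean during evaluation. The self-contained sketch below shows that act-time switch with a plain diagonal Gaussian standing in for the network output; tf.distributions.Normal, TF 1.x graph mode, and the shapes used here are assumptions for illustration only.

import tensorflow as tf

is_training = tf.placeholder(tf.bool, [])
# Stand-ins for the policy head produced by the network.
mean = tf.constant([[0.1, -0.2]])
logstd = tf.constant([[-1.0, -1.0]])
policy = tf.distributions.Normal(mean, tf.exp(logstd))

# Sample while training, act deterministically (use the mean) otherwise.
action = tf.cond(is_training, lambda: policy.sample(), lambda: mean)
logprob = tf.reduce_sum(policy.log_prob(action), -1)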