예제 #1
  def perform(self, observ):
    """Compute batch of actions and a summary for a batch of observation.

    The observations are passed through `self._observ_filter`, fed to the
    network together with `self._last_state`, and the resulting action, mean
    and log standard deviation are stored on the instance before the action
    is returned.

    Args:
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
    with tf.name_scope('perform/'):
      observ = self._observ_filter.transform(observ)
      # observ[:, None] inserts a length-one axis after the batch axis; the
      # matching [:, 0] indexing below removes it again. The vector of ones is
      # presumably the per-row sequence length expected by the network.
      network = self._network(
          observ[:, None], tf.ones(observ.shape[0]), self._last_state)
      # Sample from the policy while training, otherwise act on the mean.
      action = tf.cond(
          self._is_training, network.policy.sample, lambda: network.mean)
      logprob = network.policy.log_prob(action)[:, 0]
      # `str` is the false branch: called without arguments it returns the
      # empty string, i.e. no summary is produced when logging is disabled.
      # pylint: disable=g-long-lambda
      summary = tf.cond(self._should_log, lambda: tf.summary.merge([
          tf.summary.histogram('mean', network.mean[:, 0]),
          tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
          tf.summary.histogram('action', action[:, 0]),
          tf.summary.histogram('logprob', logprob)]), str)
      # Remember current policy to append to memory in the experience callback.
      # The control dependencies force these assignments to run whenever the
      # returned tensors are evaluated.
      with tf.control_dependencies([
          utility.assign_nested_vars(self._last_state, network.state),
          self._last_action.assign(action[:, 0]),
          self._last_mean.assign(network.mean[:, 0]),
          self._last_logstd.assign(network.logstd[:, 0])]):
        return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
예제 #2
    def perform(self, agent_indices, observ):
        """Compute batch of actions and a summary for a batch of observation.

        Only the rows selected by `agent_indices` are read from and written
        back to the per-agent variables (`self._last_state`,
        `self._last_action`, `self._last_mean`, `self._last_logstd`).

        Args:
          agent_indices: Tensor containing current batch indices.
          observ: Tensor of a batch of observations for all agents.

        Returns:
          Tuple of action batch tensor and summary tensor.
        """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            # Without stored state there is nothing to gather; otherwise pick
            # the state rows that belong to the agents in this batch.
            if self._last_state is None:
                state = None
            else:
                state = tf.contrib.framework.nest.map_structure(
                    lambda x: tf.gather(x, agent_indices), self._last_state)
            use_gpu = self._config.use_gpu and utility.available_gpus()
            with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
                # observ[:, None] inserts a length-one axis after the batch
                # axis; the [:, 0] indexing below removes it again. The vector
                # of ones is presumably the per-row sequence length.
                output = self._network(observ[:, None],
                                       tf.ones(observ.shape[0]), state)
            # Sample from the policy while training, otherwise act on the mean.
            action = tf.cond(self._is_training, output.policy.sample,
                             lambda: output.mean)
            logprob = output.policy.log_prob(action)[:, 0]
            # `str` is the false branch: called without arguments it returns
            # the empty string, i.e. no summary when logging is disabled.
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', output.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            # The control dependencies force the state and policy updates to
            # run whenever the returned tensors are evaluated.
            with tf.control_dependencies([
                    assign_state,
                    tf.scatter_update(self._last_action, agent_indices,
                                      action[:, 0]),
                    tf.scatter_update(self._last_mean, agent_indices,
                                      output.mean[:, 0]),
                    tf.scatter_update(self._last_logstd, agent_indices,
                                      output.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
예제 #3
    def perform(self, observ):
        """Compute batch of actions and a summary for a batch of observation.

        The observations are passed through `self._observ_filter`, fed to the
        network, and the resulting action, mean and log standard deviation are
        stored on the instance before the action is returned.

        Args:
          observ: Tensor of a batch of observations for all agents.

        Returns:
          Tuple of action batch tensor and summary tensor.
        """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            # observ[:, None] turns a batch of observations [o] into a batch
            # of length-one sequences [[o]], i.e. shape
            # (num_environments, 1, ...observation dims); this is the layout
            # the underlying RNN expects. tf.ones(observ.shape[0]) states that
            # each of those sequences has length 1.
            # NOTE(review): the third argument was missing from this call
            # (dangling comma); self._last_state is assumed -- confirm against
            # the network's signature.
            network = self._network(observ[:, None], tf.ones(observ.shape[0]),
                                    self._last_state)

            # In training mode, sample from the policy so that the agent
            # explores; otherwise act on the mean.
            action = tf.cond(self._is_training, network.policy.sample,
                             lambda: network.mean)
            logprob = network.policy.log_prob(action)[:, 0]
            # `str` is the false branch: called without arguments it returns
            # the empty string, i.e. no summary when logging is disabled.
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', network.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            # NOTE(review): the network output's recurrent state, if any, is
            # not written back to self._last_state in this method -- confirm
            # whether that happens elsewhere.
            with tf.control_dependencies([
                    self._last_action.assign(action[:, 0]),
                    self._last_mean.assign(network.mean[:, 0]),
                    self._last_logstd.assign(network.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)