Example #1
    def perform(self, agent_indices, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            if self._last_state is None:
                state = None
            else:
                state = tools.nested.map(lambda x: tf.gather(x, agent_indices),
                                         self._last_state)
            with tf.device('/gpu:0' if self._use_gpu else '/cpu:0'):
                output = self._network(observ[:, None],
                                       tf.ones(observ.shape[0]), state)
            action = tf.cond(self._is_training, output.policy.sample,
                             output.policy.mode)
            logprob = output.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mode', output.policy.mode()[:, 0]),
                    tf.summary.histogram('std', output.policy.stddev()[:, 0]),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            remember_last_action = tf.scatter_update(
                self._last_action, agent_indices, action[:, 0])
            policy_params = tools.nested.filter(
                lambda x: isinstance(x, tf.Tensor), output.policy.parameters)
            assert policy_params, 'Policy has no parameters to store.'
            remember_last_policy = tools.nested.map(
                lambda var, val: tf.scatter_update(var, agent_indices, val[:, 0]),
                self._last_policy, policy_params, flatten=True)
            with tf.control_dependencies((assign_state, remember_last_action) +
                                         remember_last_policy):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
Example #2
  def perform(self, agent_indices, observ):
    """Compute batch of actions and a summary for a batch of observation.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
    with tf.name_scope('perform/'):
      observ = self._observ_filter.transform(observ)
      if self._last_state is None:
        state = None
      else:
        state = tools.nested.map(
            lambda x: tf.gather(x, agent_indices), self._last_state)
      with tf.device('/gpu:0' if self._use_gpu else '/cpu:0'):
        output = self._network(
            observ[:, None], tf.ones(observ.shape[0]), state)
      action = tf.cond(
          self._is_training, output.policy.sample, output.policy.mode)
      logprob = output.policy.log_prob(action)[:, 0]
      # pylint: disable=g-long-lambda
      summary = tf.cond(self._should_log, lambda: tf.summary.merge([
          tf.summary.histogram('mode', output.policy.mode()[:, 0]),
          tf.summary.histogram('action', action[:, 0]),
          tf.summary.histogram('logprob', logprob)]), str)
      # Remember current policy to append to memory in the experience callback.
      if self._last_state is None:
        assign_state = tf.no_op()
      else:
        assign_state = utility.assign_nested_vars(
            self._last_state, output.state, agent_indices)
      remember_last_action = tf.scatter_update(
          self._last_action, agent_indices, action[:, 0])
      policy_params = tools.nested.filter(
          lambda x: isinstance(x, tf.Tensor), output.policy.parameters)
      assert policy_params, 'Policy has no parameters to store.'
      remember_last_policy = tools.nested.map(
          lambda var, val: tf.scatter_update(var, agent_indices, val[:, 0]),
          self._last_policy, policy_params, flatten=True)
      with tf.control_dependencies((
          assign_state, remember_last_action) + remember_last_policy):
        return action[:, 0], tf.identity(summary)
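
The remember_last_action update relies on tf.scatter_update writing only the rows selected by agent_indices, leaving the slots of agents outside the current batch untouched. A small sketch of that per-agent bookkeeping, with assumed shapes and dummy values:

import tensorflow as tf

# Sketch only: remember the latest action per agent. Only the rows listed in
# agent_indices are overwritten; the other agents keep their previous values.
num_agents, action_size = 4, 2
last_action = tf.get_variable(
    'last_action', [num_agents, action_size],
    initializer=tf.zeros_initializer(), trainable=False)
agent_indices = tf.constant([0, 2])          # agents present in this batch
action = tf.constant([[1., 1.], [2., 2.]])   # their newly computed actions
remember_last_action = tf.scatter_update(last_action, agent_indices, action)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(remember_last_action))  # rows 0 and 2 updated, 1 and 3 unchanged
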
Example #3
    def perform(self, agent_indices, observ):
        """Compute batch of actions and a summary for a batch of observation.

        Args:
          agent_indices: Tensor containing current batch indices.
          observ: Tensor of a batch of observations for all agents.

        Returns:
          Tuple of action batch tensor and summary tensor.
        """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            if self._last_state is None:
                state = None
            else:
                state = tools.nested.map(lambda x: tf.gather(x, agent_indices),
                                         self._last_state)
            with tf.device('/gpu:0' if self._use_gpu else '/cpu:0'):
                output = self._network(observ[:, None],
                                       tf.ones(observ.shape[0]), state)

            # policy
            sample_ = tf.concat([
                tf.zeros(shape=[observ.shape[0], 1, 13], dtype=tf.float32),
                output.policy[ACT['DEF_DASH']].sample()
            ], axis=2)
            mode_ = tf.concat([
                tf.zeros(shape=[observ.shape[0], 1, 13], dtype=tf.float32),
                output.policy[ACT['DEF_DASH']].mode()
            ], axis=2)
            action = tf.where(self._is_training, sample_, mode_)
            logprob = output.policy[ACT['DEF_DASH']].log_prob(
                action[:, :, 13:23])[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mode', mode_[:, 0, 13:23]),
                    tf.summary.histogram('DEF_DASH', action[:, 0, 13:23]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)

            # Remember current policy to append to memory in the experience callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            remember_last_action = tf.scatter_update(
                self._last_action, agent_indices, action[:, 0])

            def is_tensor(x):
                return isinstance(x, tf.Tensor)

            policy_params = []
            remember_last_policy = tuple()
            # Only a single policy head is stored here (range(1) loops once).
            for i in range(1):
                policy_params.append(
                    tools.nested.filter(is_tensor,
                                        output.policy[i].parameters))
                remember_last_policy += tools.nested.map(
                    lambda var, val: tf.scatter_update(var, agent_indices, val[:, 0]),
                    self._last_policy[i], policy_params[i], flatten=True)
            assert policy_params, 'Policy has no parameters to store.'
            with tf.control_dependencies((assign_state, remember_last_action) +
                                         remember_last_policy):
                return action[:, 0], tf.identity(summary)
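
For recurrent networks, self._last_state keeps one state slot per agent: each variant gathers the rows for the current batch before the forward pass and writes the new state back through utility.assign_nested_vars under the control dependencies. A rough sketch of that gather-then-assign round trip, using plain tuples in place of the tools.nested containers and a dummy update standing in for the network step:

import tensorflow as tf

# Sketch only: per-agent recurrent state kept in non-trainable variables.
num_agents, state_size = 4, 3
last_c = tf.get_variable('last_c', [num_agents, state_size], trainable=False,
                         initializer=tf.zeros_initializer())
last_h = tf.get_variable('last_h', [num_agents, state_size], trainable=False,
                         initializer=tf.zeros_initializer())
agent_indices = tf.constant([1, 3])

# Gather the state slices belonging to the agents in this batch.
state = tuple(tf.gather(var, agent_indices) for var in (last_c, last_h))
new_state = tuple(part + 1.0 for part in state)  # stand-in for the network step

# Write the updated slices back; assign_nested_vars does the same over the
# real nested state structure.
assign_state = tf.group(*[
    tf.scatter_update(var, agent_indices, val)
    for var, val in zip((last_c, last_h), new_state)])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_state)  # rows 1 and 3 of both variables now hold the new state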