def perform(self, observ):
  """Compute a batch of actions and a summary for a batch of observations.

  Args:
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    network = self._network(
        observ[:, None], tf.ones(observ.shape[0]), self._last_state)
    action = tf.cond(
        self._is_training, network.policy.sample, lambda: network.mean)
    logprob = network.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mean', network.mean[:, 0]),
        tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)]), str)
    # Remember current policy to append to memory in the experience callback.
    with tf.control_dependencies([
        utility.assign_nested_vars(self._last_state, network.state),
        self._last_action.assign(action[:, 0]),
        self._last_mean.assign(network.mean[:, 0]),
        self._last_logstd.assign(network.logstd[:, 0])]):
      return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
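A subtle detail above is passing the builtin `str` as the false branch of `tf.cond`: calling `str()` returns `''`, which matches the string dtype of the merged summary protobuf, so an empty summary is produced whenever logging is disabled. Below is a minimal TF1-style sketch of that pattern; the placeholder and the `values` tensor are hypothetical stand-ins, not part of the agent.

import tensorflow as tf

should_log = tf.placeholder(tf.bool, shape=[])
values = tf.random_normal([8])  # hypothetical stand-in for network outputs
summary = tf.cond(
    should_log,
    lambda: tf.summary.merge([tf.summary.histogram('values', values)]),
    str)  # str() == '', an empty string tensor when logging is off

with tf.Session() as sess:
  print(sess.run(summary, feed_dict={should_log: False}))  # b''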
def perform(self, agent_indices, observ):
  """Compute a batch of actions and a summary for a batch of observations.

  Args:
    agent_indices: Tensor containing current batch indices.
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    if self._last_state is None:
      state = None
    else:
      state = tf.contrib.framework.nest.map_structure(
          lambda x: tf.gather(x, agent_indices), self._last_state)
    use_gpu = self._config.use_gpu and utility.available_gpus()
    with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
      output = self._network(
          observ[:, None], tf.ones(observ.shape[0]), state)
    action = tf.cond(
        self._is_training, output.policy.sample, lambda: output.mean)
    logprob = output.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mean', output.mean[:, 0]),
        tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)]), str)
    # Remember current policy to append to memory in the experience callback.
    if self._last_state is None:
      assign_state = tf.no_op()
    else:
      assign_state = utility.assign_nested_vars(
          self._last_state, output.state, agent_indices)
    with tf.control_dependencies([
        assign_state,
        tf.scatter_update(self._last_action, agent_indices, action[:, 0]),
        tf.scatter_update(self._last_mean, agent_indices, output.mean[:, 0]),
        tf.scatter_update(
            self._last_logstd, agent_indices, output.logstd[:, 0])]):
      return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
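The distinguishing feature of this variant is that only a subset of agents may step at once, so per-agent bookkeeping is read with tf.gather and written back with tf.scatter_update, leaving the other agents' slots untouched. A minimal sketch of that read-modify-write pattern follows; the variable shape and the `+ 1.0` update are hypothetical stand-ins for the real network output.

import tensorflow as tf

last_action = tf.Variable(tf.zeros([4, 2]), trainable=False)  # 4 agents, 2-dim actions (assumed sizes)
agent_indices = tf.constant([0, 2])              # only agents 0 and 2 stepped this call
current = tf.gather(last_action, agent_indices)  # read rows of the active agents, shape (2, 2)
new_actions = current + 1.0                      # hypothetical stand-in for new network actions
update = tf.scatter_update(last_action, agent_indices, new_actions)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(update))  # rows 0 and 2 updated; rows 1 and 3 still zero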
def perform(self, observ):
  """Compute a batch of actions and a summary for a batch of observations.

  Args:
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    # observ[:, None] inserts a singleton time axis, turning the batch of
    # observations [o] into a batch of length-one sequences [[o]]. The
    # result has shape (num_environments, 1, observation_dimension), which
    # is the layout the underlying RNN expects. The second argument,
    # tf.ones(observ.shape[0]), gives the sequence length for each
    # environment, which is always 1 here.
    network = self._network(
        observ[:, None], tf.ones(observ.shape[0]), self._last_state)
    # In training mode, sample from the policy to ensure exploration;
    # otherwise act deterministically using the policy mean.
    action = tf.cond(
        self._is_training, network.policy.sample, lambda: network.mean)
    logprob = network.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mean', network.mean[:, 0]),
        tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)]), str)
    # Remember current policy to append to memory in the experience callback.
    with tf.control_dependencies([
        utility.assign_nested_vars(self._last_state, network.state),
        self._last_action.assign(action[:, 0]),
        self._last_mean.assign(network.mean[:, 0]),
        self._last_logstd.assign(network.logstd[:, 0])]):
      return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
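A minimal sketch of the reshaping the comments above describe, with hypothetical sizes (3 environments, 5-dimensional observations) in place of the agent's real tensors:

import tensorflow as tf

observ = tf.zeros([3, 5])           # 3 environments, 5-dim observations (assumed sizes)
sequences = observ[:, None]         # shape (3, 1, 5): a length-one sequence per environment
lengths = tf.ones(observ.shape[0])  # sequence length 1 for every environment

print(sequences.shape)  # (3, 1, 5)
print(lengths.shape)    # (3,)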