def perform(self, agent_indices, observ):
  """Compute a batch of actions and a summary for a batch of observations.

  Args:
    agent_indices: Tensor containing current batch indices.
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    if self._last_state is None:
      state = None
    else:
      state = tools.nested.map(
          lambda x: tf.gather(x, agent_indices), self._last_state)
    with tf.device('/gpu:0' if self._use_gpu else '/cpu:0'):
      output = self._network(
          observ[:, None], tf.ones(observ.shape[0]), state)
    # Sample actions during training; act deterministically (mode) otherwise.
    action = tf.cond(
        self._is_training, output.policy.sample, output.policy.mode)
    logprob = output.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mode', output.policy.mode()[:, 0]),
        tf.summary.histogram('std', output.policy.stddev()[:, 0]),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)]), str)
    # Remember current policy to append to memory in the experience callback.
    if self._last_state is None:
      assign_state = tf.no_op()
    else:
      assign_state = utility.assign_nested_vars(
          self._last_state, output.state, agent_indices)
    remember_last_action = tf.scatter_update(
        self._last_action, agent_indices, action[:, 0])
    policy_params = tools.nested.filter(
        lambda x: isinstance(x, tf.Tensor), output.policy.parameters)
    assert policy_params, 'Policy has no parameters to store.'
    remember_last_policy = tools.nested.map(
        lambda var, val: tf.scatter_update(var, agent_indices, val[:, 0]),
        self._last_policy, policy_params, flatten=True)
    with tf.control_dependencies((
        assign_state, remember_last_action) + remember_last_policy):
      # Check the actions for NaNs/Infs before returning them.
      return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
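# A minimal, self-contained sketch of the sample-vs-mode switch used above,
# assuming TensorFlow 1.x and tensorflow_probability; the toy Normal policy
# and the `is_training` placeholder are illustrative stand-ins, not part of
# this module. `tf.cond` takes callables, so the bound methods
# `policy.sample` and `policy.mode` can be passed directly and only the
# selected branch is executed at run time.
import tensorflow as tf
import tensorflow_probability as tfp

def _demo_sample_or_mode():
  is_training = tf.placeholder(tf.bool, name='is_training')
  policy = tfp.distributions.Normal(loc=[0.0, 1.0], scale=[1.0, 0.5])
  # Stochastic actions for exploration, deterministic mode for evaluation.
  action = tf.cond(is_training, policy.sample, policy.mode)
  with tf.Session() as sess:
    print(sess.run(action, {is_training: True}))   # random draw
    print(sess.run(action, {is_training: False}))  # always [0.0, 1.0]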
def perform(self, agent_indices, observ):
  """Compute a batch of actions and a summary for a batch of observations.

  Args:
    agent_indices: Tensor containing current batch indices.
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    if self._last_state is None:
      state = None
    else:
      state = tools.nested.map(
          lambda x: tf.gather(x, agent_indices), self._last_state)
    with tf.device('/gpu:0' if self._use_gpu else '/cpu:0'):
      output = self._network(
          observ[:, None], tf.ones(observ.shape[0]), state)
    # Sample actions during training; act deterministically (mode) otherwise.
    action = tf.cond(
        self._is_training, output.policy.sample, output.policy.mode)
    logprob = output.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mode', output.policy.mode()[:, 0]),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)]), str)
    # Remember current policy to append to memory in the experience callback.
    if self._last_state is None:
      assign_state = tf.no_op()
    else:
      assign_state = utility.assign_nested_vars(
          self._last_state, output.state, agent_indices)
    remember_last_action = tf.scatter_update(
        self._last_action, agent_indices, action[:, 0])
    policy_params = tools.nested.filter(
        lambda x: isinstance(x, tf.Tensor), output.policy.parameters)
    assert policy_params, 'Policy has no parameters to store.'
    remember_last_policy = tools.nested.map(
        lambda var, val: tf.scatter_update(var, agent_indices, val[:, 0]),
        self._last_policy, policy_params, flatten=True)
    with tf.control_dependencies((
        assign_state, remember_last_action) + remember_last_policy):
      return action[:, 0], tf.identity(summary)
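# A hedged sketch of wiring `perform` into a TF1 graph. `algo` stands for an
# instance of this agent class, and the batch of 3 agents with 24-dim
# observations is an illustrative assumption. The batch size must be static,
# because `perform` builds `tf.ones(observ.shape[0])` from the tensor's
# static shape.
agent_indices = tf.constant([0, 1, 2], dtype=tf.int32)
observ = tf.placeholder(tf.float32, [3, 24], name='observ')
action_op, summary_op = algo.perform(agent_indices, observ)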
def perform(self, agent_indices, observ):
  """Compute a batch of actions and a summary for a batch of observations.

  Args:
    agent_indices: Tensor containing current batch indices.
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    if self._last_state is None:
      state = None
    else:
      state = tools.nested.map(
          lambda x: tf.gather(x, agent_indices), self._last_state)
    with tf.device('/gpu:0' if self._use_gpu else '/cpu:0'):
      output = self._network(
          observ[:, None], tf.ones(observ.shape[0]), state)
    # Only the DEF_DASH sub-policy acts here: the first 13 action dimensions
    # are zero-padded and its output fills dimensions 13:23.
    sample_ = tf.concat([
        tf.zeros(shape=[observ.shape[0], 1, 13], dtype=tf.float32),
        output.policy[ACT['DEF_DASH']].sample()], axis=2)
    mode_ = tf.concat([
        tf.zeros(shape=[observ.shape[0], 1, 13], dtype=tf.float32),
        output.policy[ACT['DEF_DASH']].mode()], axis=2)
    action = tf.where(self._is_training, sample_, mode_)
    logprob = output.policy[ACT['DEF_DASH']].log_prob(
        action[:, :, 13:23])[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mode', mode_[:, 0, 13:23]),
        tf.summary.histogram('DEF_DASH', action[:, 0, 13:23]),
        tf.summary.histogram('logprob', logprob)]), str)
    # Remember current policy to append to memory in the experience callback.
    if self._last_state is None:
      assign_state = tf.no_op()
    else:
      assign_state = utility.assign_nested_vars(
          self._last_state, output.state, agent_indices)
    remember_last_action = tf.scatter_update(
        self._last_action, agent_indices, action[:, 0])

    def is_tensor(x):
      return isinstance(x, tf.Tensor)

    # Store the parameters of each sub-policy (currently only DEF_DASH).
    policy_params = []
    remember_last_policy = tuple()
    for i in range(1):
      policy_params.append(
          tools.nested.filter(is_tensor, output.policy[i].parameters))
      remember_last_policy += tools.nested.map(
          lambda var, val: tf.scatter_update(var, agent_indices, val[:, 0]),
          self._last_policy[i], policy_params[i], flatten=True)
    assert policy_params, 'Policy has no parameters to store.'
    with tf.control_dependencies(
        (assign_state, remember_last_action) + remember_last_policy):
      return action[:, 0], tf.identity(summary)
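# The variant above assumes a composite 23-dim action vector: the first 13
# entries are reserved for other controllers (zero-padded here) and entries
# 13:23 hold the 10-dim DEF_DASH action. The mapping below is a hypothetical
# reconstruction consistent with the indices used above, not taken from the
# original source.
ACT = {'DEF_DASH': 0}   # index of the sub-policy inside `output.policy`
# action[:, :, 0:13]  -> other sub-policies, zeroed in this variant
# action[:, :, 13:23] -> DEF_DASH action sampled from output.policy[0]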