"""TF metrics for adversarial and multiagent environments."""

from typing import Tuple

import tensorflow as tf

from tf_agents.metrics import tf_metric
from tf_agents.metrics.tf_metrics import TFDeque
from tf_agents.policies import tf_policy
from tf_agents.trajectories.time_step import TimeStep
from tf_agents.trajectories.trajectory import Trajectory
from tf_agents.utils import common


class AdversarialEnvironmentScalar(tf_metric.TFStepMetric):
  """Metric to compute the average of simple scalars like number of obstacles."""

  def __init__(self,
               name,
               prefix='Metrics',
               dtype=tf.float32,
               batch_size=1,  # Unused; kept for interface parity with the other metrics.
               buffer_size=10):
    super(AdversarialEnvironmentScalar, self).__init__(name=name, prefix=prefix)
    self._buffer = TFDeque(buffer_size, dtype)
    self._dtype = dtype

  @common.function(autograph=True)
  def call(self, new_scalar_vals):
    # Buffer every incoming scalar; `result` reports their running mean.
    for v in new_scalar_vals:
      self._buffer.add(v)
    return new_scalar_vals

  def result(self):
    return self._buffer.mean()

  @common.function
  def reset(self):
    self._buffer.clear()
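# --- Illustrative usage sketch (assumption: not part of the original module).
# AdversarialEnvironmentScalar can be fed scalar batches directly, e.g. the
# number of obstacles observed after each environment reset:
def example_scalar_metric_usage():
  metric = AdversarialEnvironmentScalar(name='num_obstacles')
  metric(tf.constant([4.0, 6.0]))  # Buffers each value.
  return metric.result()  # Mean of the buffered values: 5.0.


# `zero_out_new_episodes` is called by AverageReturnMetric below but is not
# defined in this excerpt. A minimal sketch, assuming it mirrors the inline
# tf.where pattern used in ActionProbabilityMetric.call further down:
def zero_out_new_episodes(trajectory, return_accumulator):
  """Zeros accumulator entries at batch indices starting a new episode."""
  return tf.where(trajectory.is_first(),
                  tf.zeros_like(return_accumulator),
                  return_accumulator)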
class AverageReturnMetric(tf_metric.TFStepMetric):
  """Metric for the average collective return and individual agent returns."""

  def __init__(self,
               n_agents,
               name='MultiagentAverageReturn',
               prefix='Metrics',
               dtype=tf.float32,
               batch_size=1,
               buffer_size=10):
    super(AverageReturnMetric, self).__init__(name=name, prefix=prefix)
    self.n_agents = n_agents
    self._dtype = dtype

    # Accumulator and buffer for the average return of all agents.
    self._collective_return_accumulator = common.create_variable(
        initial_value=0, dtype=dtype, shape=(batch_size,), name='Accumulator')
    self._collective_buffer = TFDeque(buffer_size, dtype)

    # Accumulators for each agent's independent reward.
    self._agent_return_accumulators = []
    for a in range(n_agents):
      self._agent_return_accumulators.append(
          common.create_variable(
              initial_value=0, dtype=dtype, shape=(batch_size,),
              name='Accumulator' + str(a)))

    # Buffers for each agent's independent reward.
    self._agent_buffers = []
    for a in range(n_agents):
      self._agent_buffers.append(TFDeque(buffer_size, dtype))

  @common.function(autograph=True)
  def call(self, trajectory):
    # Zero out batch indices where a new episode is starting.
    self._collective_return_accumulator.assign(
        zero_out_new_episodes(trajectory, self._collective_return_accumulator))
    for a in range(self.n_agents):
      self._agent_return_accumulators[a].assign(
          zero_out_new_episodes(trajectory,
                                self._agent_return_accumulators[a]))

    # Note that trajectory.reward has shape (batch, n_agents).

    # Update the collective accumulator with the mean of the rewards received
    # by all agents.
    self._collective_return_accumulator.assign_add(
        tf.reduce_mean(trajectory.reward, axis=1))

    # Pull out the reward for each agent and accumulate it separately.
    for a in range(self.n_agents):
      self._agent_return_accumulators[a].assign_add(trajectory.reward[:, a])

    # Add final returns to buffer.
    last_episode_indices = tf.squeeze(tf.where(trajectory.is_last()), axis=-1)
    for indx in last_episode_indices:
      self._collective_buffer.add(self._collective_return_accumulator[indx])

      # Agent buffers that use the global done.
      for a in range(self.n_agents):
        self._agent_buffers[a].add(self._agent_return_accumulators[a][indx])

    return trajectory

  def result(self):
    return self._collective_buffer.mean()

  def result_for_agent(self, agent_id):
    return self._agent_buffers[agent_id].mean()

  @common.function
  def reset(self):
    self._collective_buffer.clear()
    self._collective_return_accumulator.assign(
        tf.zeros_like(self._collective_return_accumulator))
    for a in range(self.n_agents):
      self._agent_buffers[a].clear()
      self._agent_return_accumulators[a].assign(
          tf.zeros_like(self._agent_return_accumulators[a]))

  def tf_summaries(self, train_step=None, step_metrics=()):
    """Generates summaries for all agents & collective summary against steps.

    Args:
      train_step: (Optional) Step counter for training iterations. If None, no
        metric is generated against the global step.
      step_metrics: (Optional) Iterable of step metrics to generate summaries
        against.

    Returns:
      A list of summaries.
""" summaries = super(AverageReturnMetric, self).tf_summaries(train_step=train_step, step_metrics=step_metrics) for a in range(self.n_agents): summaries.extend( self.single_agent_summary(a, train_step, step_metrics)) return summaries def single_agent_summary(self, agent_id, train_step=None, step_metrics=()): summaries = [] prefix = self._prefix name = self.name + '_agent' + str(agent_id) tag = common.join_scope(prefix, name) result = self.result_for_agent(agent_id) if train_step is not None: summaries.append( tf.compat.v2.summary.scalar(name=tag, data=result, step=train_step)) if prefix: prefix += '_' for step_metric in step_metrics: # Skip plotting the metrics against itself. if self.name == step_metric.name: continue step_tag = '{}vs_{}/{}'.format(prefix, step_metric.name, name) # Summaries expect the step value to be an int64. step = tf.cast(step_metric.result(), tf.int64) summaries.append( tf.compat.v2.summary.scalar(name=step_tag, data=result, step=step)) return summaries
class ActionProbabilityMetric(tf_metric.TFStepMetric):
  """A metric that records the average action probabilities over a given period.

  Implementation similar to tf_agents.metrics.tf_metrics.AverageReturnMetric.
  """

  def __init__(self,
               policy: tf_policy.TFPolicy,
               action_indices: Tuple[int, ...],
               name: str = 'ActionProbability',
               prefix: str = 'Metrics',
               dtype: tf.DType = tf.float32,
               batch_size: int = 1,
               buffer_size: int = 10):
    """
    :param policy: Policy of the agent, reevaluated at each time step to
        obtain the action probabilities.
    :param action_indices: A tuple of indices of the action probability vector
        to track. This is a tuple to allow for the case where the action is a
        tuple of tensors.
    :param name: Name of the metric (as it will appear in TensorBoard).
    :param prefix: Prefix to apply as part of the naming convention.
    :param dtype: Data type of the metric.
    :param batch_size: Batch size of the RL environment.
    :param buffer_size: The capacity of the buffer, which overwrites its
        oldest entries when full and is emptied at every logging point.
    """
    super().__init__(name=name, prefix=prefix)
    self._action_indices = action_indices
    self._dtype = dtype
    self._probability_accumulator = common.create_variable(
        initial_value=0, dtype=dtype, shape=(batch_size,), name='Accumulator')
    self._policy = policy
    self._buffer = TFDeque(buffer_size, dtype)
    self._count_accumulator = common.create_variable(
        initial_value=0, dtype=dtype, shape=(batch_size,),
        name='CountAccumulator')

  @common.function(autograph=True)
  def call(self, trajectory: Trajectory) -> Trajectory:
    time_step = TimeStep(trajectory.step_type, trajectory.reward,
                         trajectory.discount, trajectory.observation)
    action_dist = self._policy.distribution(time_step).action

    # If the action distribution is in fact a tuple of distributions (one for
    # each resource set), index into it to obtain the underlying distribution,
    # which can then be used to obtain probabilities. This is only the case
    # where there are multiple resource sets.
    for i in self._action_indices[:-1]:
      action_dist = action_dist[i]
    action_probs = action_dist.probs_parameter()

    # Zero out batch indices where a new episode is starting.
    self._probability_accumulator.assign(
        tf.where(trajectory.is_first(),
                 tf.zeros_like(self._probability_accumulator),
                 self._probability_accumulator))
    self._count_accumulator.assign(
        tf.where(trajectory.is_first(),
                 tf.zeros_like(self._count_accumulator),
                 self._count_accumulator))

    # Update accumulators with probability and count increments.
    self._probability_accumulator.assign_add(
        action_probs[..., 0, self._action_indices[-1]])
    self._count_accumulator.assign_add(tf.ones_like(self._count_accumulator))

    # Add final cumulants to buffer at the end of episodes.
    last_episode_indices = tf.squeeze(tf.where(trajectory.is_last()), axis=-1)
    for idx in last_episode_indices:
      self._buffer.add(
          self._probability_accumulator[idx] / self._count_accumulator[idx])

    return trajectory

  def result(self) -> tf.Tensor:
    """Return the metric value."""
    return self._buffer.mean()

  @common.function
  def reset(self) -> None:
    """Clear the buffer and reset the accumulators."""
    self._buffer.clear()
    self._probability_accumulator.assign(
        tf.zeros_like(self._probability_accumulator))
    self._count_accumulator.assign(tf.zeros_like(self._count_accumulator))

  @property
  def action_indices(self) -> Tuple[int, ...]:
    return self._action_indices
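# --- Illustrative usage sketch (assumption: not part of the original module).
# Tracks the average probability the policy assigns to action index 2;
# assumes `policy` exposes a single categorical action distribution (so
# `action_indices` has length one and probs_parameter() is available) and
# `tf_env` is a matching TFEnvironment.
def example_action_probability_usage(tf_env, policy):
  from tf_agents.drivers import dynamic_episode_driver

  metric = ActionProbabilityMetric(policy, action_indices=(2,))
  driver = dynamic_episode_driver.DynamicEpisodeDriver(
      tf_env, policy, observers=[metric], num_episodes=5)
  driver.run()
  # Mean per-episode probability assigned to action index 2.
  return metric.result()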