Example #1
import tensorflow as tf
from tf_agents.metrics import tf_metric
from tf_agents.metrics.tf_metrics import TFDeque
from tf_agents.utils import common


class AdversarialEnvironmentScalar(tf_metric.TFStepMetric):
    """Metric to compute average of simple scalars like number of obstacles."""
    def __init__(self,
                 name,
                 prefix='Metrics',
                 dtype=tf.float32,
                 batch_size=1,
                 buffer_size=10):
        super(AdversarialEnvironmentScalar, self).__init__(name=name,
                                                           prefix=prefix)
        self._buffer = TFDeque(buffer_size, dtype)
        self._dtype = dtype

    @common.function(autograph=True)
    def call(self, new_scalar_vals):
        for v in new_scalar_vals:
            self._buffer.add(v)
        return new_scalar_vals

    def result(self):
        return self._buffer.mean()

    @common.function
    def reset(self):
        self._buffer.clear()
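
A minimal usage sketch (not part of the original example; the metric name and the scalar values are illustrative): the metric object is called directly with a batch of scalar tensors, as TF-Agents observers are, and result() returns the mean over the buffer.

num_obstacles_metric = AdversarialEnvironmentScalar(name='num_obstacles')
num_obstacles_metric(tf.constant([3.0, 5.0, 4.0]))  # buffer now holds 3, 5, 4
print(num_obstacles_metric.result())                # mean of the buffer: 4.0
num_obstacles_metric.reset()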
Example #2
import tensorflow as tf
from tf_agents.metrics import tf_metric
from tf_agents.metrics.tf_metrics import TFDeque
from tf_agents.utils import common

# `zero_out_new_episodes` (not shown here) is expected to reset accumulator
# entries at batch indices where a new episode is starting.


class AverageReturnMetric(tf_metric.TFStepMetric):
    """Metric for the average collective return and individual agent returns."""
    def __init__(self,
                 n_agents,
                 name='MultiagentAverageReturn',
                 prefix='Metrics',
                 dtype=tf.float32,
                 batch_size=1,
                 buffer_size=10):
        super(AverageReturnMetric, self).__init__(name=name, prefix=prefix)
        self.n_agents = n_agents
        self._dtype = dtype

        # Accumulator and buffer for the average return of all agents
        self._collective_return_accumulator = common.create_variable(
            initial_value=0,
            dtype=dtype,
            shape=(batch_size, ),
            name='Accumulator')
        self._collective_buffer = TFDeque(buffer_size, dtype)

        # Accumulators for each agent's independent reward
        self._agent_return_accumulators = []
        for a in range(n_agents):
            self._agent_return_accumulators.append(
                common.create_variable(initial_value=0,
                                       dtype=dtype,
                                       shape=(batch_size, ),
                                       name='Accumulator' + str(a)))

        # Buffers for each agent's independent reward
        self._agent_buffers = []
        for a in range(n_agents):
            self._agent_buffers.append(TFDeque(buffer_size, dtype))

    @common.function(autograph=True)
    def call(self, trajectory):
        # Zero out batch indices where a new episode is starting.
        self._collective_return_accumulator.assign(
            zero_out_new_episodes(trajectory,
                                  self._collective_return_accumulator))
        for a in range(self.n_agents):
            self._agent_return_accumulators[a].assign(
                zero_out_new_episodes(trajectory,
                                      self._agent_return_accumulators[a]))

        # Note that trajectory.reward has shape (batch, n_agents)

        # Update the collective accumulator with the mean reward across agents.
        self._collective_return_accumulator.assign_add(
            tf.reduce_mean(trajectory.reward, axis=1))

        # Pull out data for each agent and assign
        for a in range(self.n_agents):
            self._agent_return_accumulators[a].assign_add(
                trajectory.reward[:, a])

        # Add final returns to buffer.
        last_episode_indices = tf.squeeze(tf.where(trajectory.is_last()),
                                          axis=-1)
        for indx in last_episode_indices:
            self._collective_buffer.add(
                self._collective_return_accumulator[indx])

            # Agent buffers that use the global done
            for a in range(self.n_agents):
                self._agent_buffers[a].add(
                    self._agent_return_accumulators[a][indx])

        return trajectory

    def result(self):
        return self._collective_buffer.mean()

    def result_for_agent(self, agent_id):
        return self._agent_buffers[agent_id].mean()

    @common.function
    def reset(self):
        self._collective_buffer.clear()
        self._collective_return_accumulator.assign(
            tf.zeros_like(self._collective_return_accumulator))

        for a in range(self.n_agents):
            self._agent_buffers[a].clear()
            self._agent_return_accumulators[a].assign(
                tf.zeros_like(self._agent_return_accumulators[a]))

    def tf_summaries(self, train_step=None, step_metrics=()):
        """Generates summaries for all agents & collective summary against steps.

    Args:
      train_step: (Optional) Step counter for training iterations. If None, no
        metric is generated against the global step.
      step_metrics: (Optional) Iterable of step metrics to generate summaries
        against.

    Returns:
      A list of summaries.
    """
        summaries = super(AverageReturnMetric,
                          self).tf_summaries(train_step=train_step,
                                             step_metrics=step_metrics)

        for a in range(self.n_agents):
            summaries.extend(
                self.single_agent_summary(a, train_step, step_metrics))

        return summaries

    def single_agent_summary(self, agent_id, train_step=None, step_metrics=()):
        summaries = []
        prefix = self._prefix
        name = self.name + '_agent' + str(agent_id)
        tag = common.join_scope(prefix, name)

        result = self.result_for_agent(agent_id)

        if train_step is not None:
            summaries.append(
                tf.compat.v2.summary.scalar(name=tag,
                                            data=result,
                                            step=train_step))
        if prefix:
            prefix += '_'
        for step_metric in step_metrics:
            # Skip plotting the metrics against itself.
            if self.name == step_metric.name:
                continue
            step_tag = '{}vs_{}/{}'.format(prefix, step_metric.name, name)
            # Summaries expect the step value to be an int64.
            step = tf.cast(step_metric.result(), tf.int64)
            summaries.append(
                tf.compat.v2.summary.scalar(name=step_tag,
                                            data=result,
                                            step=step))
        return summaries
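
A minimal usage sketch (tf_env, collect_policy, global_step and env_steps_metric are placeholder names, not part of the example): the metric is attached to a driver as an observer, so every collected trajectory, whose reward is expected to have shape (batch, n_agents), updates the collective and per-agent buffers.

from tf_agents.drivers import dynamic_step_driver

avg_return = AverageReturnMetric(n_agents=3, batch_size=tf_env.batch_size)
driver = dynamic_step_driver.DynamicStepDriver(
    tf_env, collect_policy, observers=[avg_return], num_steps=200)
driver.run()

print(avg_return.result())             # average collective return
print(avg_return.result_for_agent(0))  # average return of agent 0
summaries = avg_return.tf_summaries(train_step=global_step,
                                    step_metrics=[env_steps_metric])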
Example #3
from typing import Tuple, Type

import tensorflow as tf
from tf_agents.metrics.tf_metric import TFStepMetric
from tf_agents.metrics.tf_metrics import TFDeque
from tf_agents.policies import tf_policy
from tf_agents.trajectories.time_step import TimeStep
from tf_agents.trajectories.trajectory import Trajectory
from tf_agents.utils import common


class ActionProbabilityMetric(TFStepMetric):
    """
    A metric that records the average action probabilities over a given period.
    The implementation is similar to tf_agents.metrics.tf_metrics.AverageReturnMetric.
    """

    def __init__(self,
                 policy: tf_policy.TFPolicy,
                 action_indices: Tuple[int, ...],
                 name: str = 'ActionProbability',
                 prefix: str = 'Metrics',
                 dtype: Type = tf.float32,
                 batch_size: int = 1,
                 buffer_size: int = 10):
        """
        :param policy: Policy of the agent used for reevaluation to attain action probabilities at
            each time step.
        :param action_indices: A tuple of indices of the action probability vector to track. This is
            a tuple to allow for the case where the action is a tuple of tensors.
        :param name: Name of the metric (as it will appear in tensorboard).
        :param prefix: Prefix to apply as part of the naming convention.
        :param dtype: Data type of the metric.
        :param batch_size: Batch size of the RL environment.
        :param buffer_size: The capacity of the buffer which will rewrite itself when full but is
            emptied at every logging point.
        """
        super().__init__(name=name, prefix=prefix)
        self._action_indices = action_indices
        self._dtype = dtype
        self._probability_accumulator = common.create_variable(
            initial_value=0, dtype=dtype, shape=(batch_size,), name='Accumulator'
        )
        self._policy = policy
        self._buffer = TFDeque(buffer_size, dtype)
        self._count_accumulator = common.create_variable(
            initial_value=0, dtype=dtype, shape=(batch_size,), name='CountAccumulator'
        )

    @common.function(autograph=True)
    def call(self, trajectory: Trajectory) -> Trajectory:
        time_step = TimeStep(trajectory.step_type, trajectory.reward, trajectory.discount,
                             trajectory.observation)
        action_dist = self._policy.distribution(time_step).action

        # If the action distribution is in fact a tuple of distributions (one per resource set),
        # index into it to reach the underlying distribution from which the probabilities can be
        # read. This is only the case when there are multiple resource sets.
        for i in self._action_indices[:-1]:
            action_dist = action_dist[i]

        action_probs = action_dist.probs_parameter()
        # Zero out batch indices where a new episode is starting.
        self._probability_accumulator.assign(
            tf.where(trajectory.is_first(), tf.zeros_like(self._probability_accumulator),
                     self._probability_accumulator))
        self._count_accumulator.assign(
            tf.where(trajectory.is_first(), tf.zeros_like(self._count_accumulator),
                     self._count_accumulator))
        # Update accumulators with probability and count increments.
        self._probability_accumulator.assign_add(action_probs[..., 0, self._action_indices[-1]])
        self._count_accumulator.assign_add(tf.ones_like(self._count_accumulator))

        # Add final cumulants to buffer at the end of episodes.
        last_episode_indices = tf.squeeze(tf.where(trajectory.is_last()), axis=-1)
        for idx in last_episode_indices:
            self._buffer.add(self._probability_accumulator[idx] / self._count_accumulator[idx])

        return trajectory

    def result(self) -> tf.Tensor:
        """Return the metric value."""
        return self._buffer.mean()

    @common.function
    def reset(self) -> None:
        """Clear the buffer and reset the accumulators."""
        self._buffer.clear()
        self._probability_accumulator.assign(tf.zeros_like(self._probability_accumulator))
        self._count_accumulator.assign(tf.zeros_like(self._count_accumulator))

    @property
    def action_indices(self) -> Tuple[int, ...]:
        return self._action_indices
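
A minimal usage sketch (agent and tf_env are placeholder names): the metric re-evaluates the agent's policy on every collected trajectory, so it only applies to policies whose action distribution exposes probs_parameter(), e.g. a categorical distribution; here it tracks the probability assigned to action index 0.

from tf_agents.drivers import dynamic_episode_driver

action_prob = ActionProbabilityMetric(policy=agent.policy, action_indices=(0,))
driver = dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env, agent.collect_policy, observers=[action_prob], num_episodes=10)
driver.run()

print(action_prob.result())  # mean per-episode probability of action 0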