Example #1
    def signature(cls,
                  environment_spec: specs.EnvironmentSpec,
                  extras_spec: types.NestedSpec = ()):

        # This function currently assumes that self._discount is a scalar.
        # If it ever becomes a nested structure and/or an np.ndarray, this method
        # will need to know its structure / shape. This is because the signature
        # discount shape is the environment's discount shape and this adder's
        # discount shape broadcast together. Also, the reward shape is this
        # signature discount shape broadcast together with the environment
        # reward shape. As long as self._discount is a scalar, it will not affect
        # either the signature discount shape or the signature reward shape, so we
        # can ignore it.

        rewards_spec, step_discounts_spec = tree_utils.broadcast_structures(
            environment_spec.rewards, environment_spec.discounts)
        rewards_spec = tree.map_structure(_broadcast_specs, rewards_spec,
                                          step_discounts_spec)
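        # The broadcast may alias the same spec object across several leaves,
        # so (presumably) the specs are deep-copied below to keep the returned
        # signature from sharing leaves with the environment spec.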
        step_discounts_spec = tree.map_structure(copy.deepcopy,
                                                 step_discounts_spec)

        transition_spec = types.Transition(
            environment_spec.observations,
            environment_spec.actions,
            rewards_spec,
            step_discounts_spec,
            environment_spec.observations,  # next_observation
            extras_spec)

        return tree.map_structure_with_path(base.spec_like_to_tensor_spec,
                                            transition_spec)
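For context, `_broadcast_specs` is referenced but not shown in this example. A minimal sketch of what it is assumed to do (combine `specs.Array` leaves by broadcasting their shapes and promoting their dtypes under NumPy's rules) could look like:

import numpy as np
from acme import specs

def _broadcast_specs(*args: specs.Array) -> specs.Array:
    # Broadcast dummy arrays with the spec shapes to get the combined shape.
    bc_info = np.broadcast(*(np.empty(a.shape, a.dtype) for a in args))
    # Promote dtypes the same way NumPy would for `reward * discount`.
    dtype = np.result_type(*(a.dtype for a in args))
    return specs.Array(shape=bc_info.shape, dtype=dtype)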
Example #2
    def _compute_cumulative_quantities(
        self, history: types.NestedArray
    ) -> Tuple[types.NestedArray, types.NestedArray]:

        first_step, *next_steps = tree_utils.unstack_sequence_fields(
            history, self._n_step)

        # Give the same tree structure to the n-step return accumulator,
        # n-step discount accumulator, and self.discount, so that they can be
        # iterated in parallel using tree.map_structure.
        (n_step_return, total_discount,
         self_discount) = tree_utils.broadcast_structures(
             first_step['reward'], first_step['discount'], self._discount)

        # Copy total_discount as it is otherwise read-only.
        total_discount = tree.map_structure(np.copy, total_discount)

        # Broadcast n_step_return to have the broadcasted shape of
        # reward * discount.
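        # (np.broadcast only computes the broadcast shape; np.broadcast_to
        # returns a read-only view, so np.copy is needed to get a writeable
        # array that the loop below can accumulate into.)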
        n_step_return = tree.map_structure(
            lambda r, d: np.copy(np.broadcast_to(r,
                                                 np.broadcast(r, d).shape)),
            n_step_return, total_discount)

        # NOTE: total_discount accumulates one fewer factor of self._discount
        # than it has step discounts. This is so that when the learner/update
        # rule applies an additional discount we don't apply it twice. Inside
        # the following loop, self._discount is applied right before each
        # reward is summed into n_step_return.
        for step in next_steps:
            (step_discount, step_reward,
             total_discount) = tree_utils.broadcast_structures(
                 step['discount'], step['reward'], total_discount)

            # Equivalent to: `total_discount *= self._discount`.
            tree.map_structure(operator.imul, total_discount, self_discount)

            # Equivalent to: `n_step_return += step.reward * total_discount`.
            tree.map_structure(lambda nsr, sr, td: operator.iadd(nsr, sr * td),
                               n_step_return, step_reward, total_discount)

            # Equivalent to: `total_discount *= step.discount`.
            tree.map_structure(operator.imul, total_discount, step_discount)

        return n_step_return, total_discount
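As a sanity check on the recurrence above, here is the same accumulation written out with plain Python floats (gamma, the rewards, and the discounts are made-up values for illustration):

gamma = 0.99                 # plays the role of self._discount
rewards = [1.0, 2.0, 3.0]    # r_1 .. r_n from the unstacked history
discounts = [1.0, 1.0, 0.0]  # d_1 .. d_n; 0.0 marks a terminal step

n_step_return, total_discount = rewards[0], discounts[0]
for r, d in zip(rewards[1:], discounts[1:]):
    total_discount *= gamma             # the adder's discount first
    n_step_return += r * total_discount
    total_discount *= d                 # then the environment's step discount

# n_step_return  == r1 + gamma*d1*r2 + gamma**2*d1*d2*r3
# total_discount == gamma**2 * d1*d2*d3: one fewer gamma than step discounts,
# matching the NOTE above.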
Example #3
    def signature(
        cls,
        environment_spec: mava_specs.EnvironmentSpec,
        extras_spec: tf.TypeSpec = {},
    ) -> tf.TypeSpec:

        # This function currently assumes that self._discount is a scalar.
        # If it ever becomes a nested structure and/or an np.ndarray, this method
        # will need to know its structure / shape. This is because the signature
        # discount shape is the environment's discount shape and this adder's
        # discount shape broadcast together. Also, the reward shape is this
        # signature discount shape broadcast together with the environment
        # reward shape. As long as self._discount is a scalar, it will not affect
        # either the signature discount shape or the signature reward shape, so we
        # can ignore it.

        agent_specs = environment_spec.get_agent_specs()
        agents = environment_spec.get_agent_ids()
        env_extras_spec = environment_spec.get_extra_specs()
        # Merge into a fresh dict rather than calling extras_spec.update():
        # updating in place would mutate the shared `{}` default argument
        # (and the caller's dict) across calls.
        extras_spec = {**extras_spec, **env_extras_spec}

        obs_specs = {}
        act_specs = {}
        reward_specs = {}
        step_discount_specs = {}
        for agent in agents:

            rewards_spec, step_discounts_spec = tree_utils.broadcast_structures(
                agent_specs[agent].rewards, agent_specs[agent].discounts
            )

            rewards_spec = tree.map_structure(
                _broadcast_specs, rewards_spec, step_discounts_spec
            )
            step_discounts_spec = tree.map_structure(copy.deepcopy, step_discounts_spec)

            obs_specs[agent] = agent_specs[agent].observations
            act_specs[agent] = agent_specs[agent].actions
            reward_specs[agent] = rewards_spec
            step_discount_specs[agent] = step_discounts_spec

        transition_spec = [
            obs_specs,
            act_specs,
            extras_spec,
            reward_specs,
            step_discount_specs,
            obs_specs,  # next_observation
            extras_spec,
        ]

        return tree.map_structure_with_path(
            base.spec_like_to_tensor_spec, tuple(transition_spec)
        )
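`base.spec_like_to_tensor_spec` is likewise not shown. Assuming each leaf spec has `.shape` and `.dtype`, a plausible minimal version converts it into a `tf.TensorSpec` named after its path in the nested structure:

import tensorflow as tf

def spec_like_to_tensor_spec(paths, spec):
    # e.g. the path ('observations', 'agent_0') yields the name
    # 'observations/agent_0' for the resulting TensorSpec.
    return tf.TensorSpec.from_spec(spec, name='/'.join(str(p) for p in paths))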
Example #4
    def signature(
        cls,
        environment_spec: specs.EnvironmentSpec,
        extras_spec: tf.TypeSpec = {},
    ) -> tf.TypeSpec:
        """This is a helper method for generating signatures for Reverb tables.
        Signatures are useful for validating data types and shapes, see Reverb's
        documentation for details on how they are used.
        Args:
          environment_spec: A `specs.EnvironmentSpec` whose fields are nested
            structures with leaf nodes that have `.shape` and `.dtype` attributes.
            This should come from the environment that will be used to generate
            the data inserted into the Reverb table.
          extras_spec: A nested structure with leaf nodes that have `.shape` and
            `.dtype` attributes. The structure (and shapes/dtypes) of this must
            be the same as the `extras` passed into `ReverbAdder.add`.
        Returns:
          A `Step` whose leaf nodes are `tf.TensorSpec` objects.
        """
        agent_specs = environment_spec.get_agent_specs()
        agents = environment_spec.get_agent_ids()
        env_extras_spec = environment_spec.get_extra_specs()
        # Merge into a fresh dict rather than calling extras_spec.update():
        # updating in place would mutate the shared `{}` default argument
        # (and the caller's dict) across calls.
        extras_spec = {**extras_spec, **env_extras_spec}

        obs_specs = {}
        act_specs = {}
        reward_specs = {}
        step_discount_specs = {}
        for agent in agents:
            rewards_spec, step_discounts_spec = tree_utils.broadcast_structures(
                agent_specs[agent].rewards, agent_specs[agent].discounts)
            obs_specs[agent] = agent_specs[agent].observations
            act_specs[agent] = agent_specs[agent].actions
            reward_specs[agent] = rewards_spec
            step_discount_specs[agent] = step_discounts_spec

        spec_step = base.Step(
            observations=obs_specs,
            actions=act_specs,
            rewards=reward_specs,
            discounts=step_discount_specs,
            start_of_episode=specs.Array(shape=(), dtype=bool),
            extras=extras_spec,
        )
        return tree.map_structure_with_path(base.spec_like_to_tensor_spec,
                                            spec_step)
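As a usage note, a signature like the one returned here is typically given to a `reverb.Table` so that inserted items are validated against it. A hypothetical setup (the adder class and `environment_spec` variable below are placeholders, not names from this code) might look like:

import reverb

table = reverb.Table(
    name='transitions',
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    max_size=1_000_000,
    rate_limiter=reverb.rate_limiters.MinSize(1),
    # Placeholder adder class; any adder exposing this classmethod works.
    signature=MyTransitionAdder.signature(environment_spec),
)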
Example #5
    def _write(self):
        # NOTE: we do not check that the buffer is of length N here. This means
        # that at the beginning of an episode we will add the initial N-1
        # transitions (of size 1, 2, ...) and at the end of an episode (when
        # called from write_last) we will write the final transitions of size (N,
        # N-1, ...). See the Note in the docstring.

        # Form the n-step transition given the steps.
        observation = self._buffer[0].observation
        action = self._buffer[0].action
        extras = self._buffer[0].extras
        next_observation = self._next_observation

        # Give the same tree structure to the n-step return accumulator,
        # n-step discount accumulator, and self.discount, so that they can be
        # iterated in parallel using tree.map_structure.
        (n_step_return, total_discount,
         self_discount) = tree_utils.broadcast_structures(
             self._buffer[0].reward, self._buffer[0].discount, self._discount)

        # Copy total_discount, so that accumulating into it doesn't affect
        # _buffer[0].discount.
        total_discount = tree.map_structure(np.copy, total_discount)

        # Broadcast n_step_return to have the broadcasted shape of
        # reward * discount. Also copy, to avoid accumulating into
        # _buffer[0].reward.
        n_step_return = tree.map_structure(
            lambda r, d: np.copy(np.broadcast_to(r,
                                                 np.broadcast(r, d).shape)),
            n_step_return, total_discount)

        # NOTE: total_discount accumulates one fewer factor of self._discount
        # than it has step discounts. This is so that when the learner/update
        # rule applies an additional discount we don't apply it twice. Inside
        # the following loop, self._discount is applied right before each
        # reward is summed into n_step_return.
        for step in itertools.islice(self._buffer, 1, None):
            (step_discount, step_reward,
             total_discount) = tree_utils.broadcast_structures(
                 step.discount, step.reward, total_discount)

            # Equivalent to: `total_discount *= self._discount`.
            tree.map_structure(operator.imul, total_discount, self_discount)

            # Equivalent to: `n_step_return += step.reward * total_discount`.
            tree.map_structure(lambda nsr, sr, td: operator.iadd(nsr, sr * td),
                               n_step_return, step_reward, total_discount)

            # Equivalent to: `total_discount *= step.discount`.
            tree.map_structure(operator.imul, total_discount, step_discount)

        transition = types.Transition(observation=observation,
                                      action=action,
                                      reward=n_step_return,
                                      discount=total_discount,
                                      next_observation=next_observation,
                                      extras=extras)

        # Create a list of steps.
        if self._final_step_placeholder is None:
            # utils.final_step_like is expensive (around 0.085ms) to run every time
            # so we cache its output.
            self._final_step_placeholder = utils.final_step_like(
                self._buffer[0], next_observation)
        final_step: base.Step = self._final_step_placeholder._replace(
            observation=next_observation)
        steps = list(self._buffer) + [final_step]

        # Calculate the priority for this transition.
        table_priorities = utils.calculate_priorities(self._priority_fns,
                                                      steps)

        # Insert the transition into replay along with its priority.
        self._writer.append(transition)
        for table, priority in table_priorities.items():
            self._writer.create_item(table=table,
                                     num_timesteps=1,
                                     priority=priority)
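Finally, `utils.final_step_like` is not shown above. Given that `base.Step` is a named tuple (its `_replace` method is used in this example), a minimal sketch of what it is assumed to produce is a terminal step with every leaf zeroed and the final observation swapped in:

import numpy as np
import tree

def final_step_like(step, next_observation):
    # Zero every leaf (reward, discount, extras, ...) while preserving the
    # tree structure, then substitute the episode's final observation.
    zero_step = tree.map_structure(np.zeros_like, step)
    return zero_step._replace(observation=next_observation)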