Example #1
def postprocess_nstep_and_prio(policy: Policy,
                               batch: SampleBatch,
                               other_agent=None,
                               episode=None) -> SampleBatch:
    # N-step Q adjustments.
    if policy.config["n_step"] > 1:
        adjust_nstep(policy.config["n_step"], policy.config["gamma"], batch)

    # Create dummy prio-weights (1.0) in case we don't have any in
    # the batch.
    if PRIO_WEIGHTS not in batch:
        batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])

    # Prioritize on the worker side.
    if batch.count > 0 and policy.config["worker_side_prioritization"]:
        td_errors = policy.compute_td_error(batch[SampleBatch.OBS],
                                            batch[SampleBatch.ACTIONS],
                                            batch[SampleBatch.REWARDS],
                                            batch[SampleBatch.NEXT_OBS],
                                            batch[SampleBatch.DONES],
                                            batch[PRIO_WEIGHTS])
        new_priorities = (np.abs(convert_to_numpy(td_errors)) +
                          policy.config["prioritized_replay_eps"])
        batch[PRIO_WEIGHTS] = new_priorities

    return batch
Example #2
def postprocess_nstep_and_prio(
    policy: Policy, batch: SampleBatch, other_agent=None, episode=None
) -> SampleBatch:
    # N-step Q adjustments.
    if policy.config["n_step"] > 1:
        adjust_nstep(policy.config["n_step"], policy.config["gamma"], batch)

    # Create dummy prio-weights (1.0) in case we don't have any in
    # the batch.
    if PRIO_WEIGHTS not in batch:
        batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])

    # Prioritize on the worker side.
    if batch.count > 0 and policy.config["replay_buffer_config"].get(
        "worker_side_prioritization", False
    ):
        td_errors = policy.compute_td_error(
            batch[SampleBatch.OBS],
            batch[SampleBatch.ACTIONS],
            batch[SampleBatch.REWARDS],
            batch[SampleBatch.NEXT_OBS],
            batch[SampleBatch.DONES],
            batch[PRIO_WEIGHTS],
        )
        # Retain compatibility with old-style Replay args
        epsilon = policy.config.get("replay_buffer_config", {}).get(
            "prioritized_replay_eps"
        ) or policy.config.get("prioritized_replay_eps")
        if epsilon is None:
            raise ValueError("prioritized_replay_eps not defined in config.")

        new_priorities = np.abs(convert_to_numpy(td_errors)) + epsilon
        batch[PRIO_WEIGHTS] = new_priorities

    return batch
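For orientation before the test examples below, here is a minimal, self-contained sketch of what the n-step adjustment does to a single contiguous trajectory: each reward is replaced by the discounted sum of up to n_step rewards, and NEXT_OBS/DONES are shifted forward to the timestep that sum reaches. The helper name adjust_nstep_sketch is hypothetical; RLlib's actual adjust_nstep mutates a SampleBatch in place, but the arithmetic is the same idea.

import numpy as np

def adjust_nstep_sketch(n_step, gamma, rewards, next_obs, dones):
    # Hypothetical stand-alone version of the n-step adjustment; returns new
    # arrays instead of mutating a SampleBatch in place like RLlib does.
    # The trajectory must not terminate before its last timestep.
    assert not any(dones[:-1]), "Unexpected done in middle of trajectory"

    horizon = len(rewards)
    r_in = np.asarray(rewards, dtype=np.float64)
    new_rewards = r_in.copy()
    new_next_obs = list(next_obs)
    new_dones = list(dones)
    for t in range(horizon):
        for k in range(1, n_step):
            if t + k >= horizon:
                break
            # Fold in the reward k steps ahead, discounted by gamma**k, ...
            new_rewards[t] += gamma ** k * r_in[t + k]
            # ... and point next_obs/done at that later timestep.
            new_next_obs[t] = next_obs[t + k]
            new_dones[t] = dones[t + k]
    return new_rewards, new_next_obs, new_dones

With n_step=3 and gamma=0.9 on the inputs of Example #3 below, this sketch reproduces the expected REWARDS [91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0] as well as the shifted NEXT_OBS [4, 5, 6, 7, 8, 8, 8] and DONES [0, 0, 0, 0, 1, 1, 1].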
Example #3
    def test_n_step_3(self):
        """Tests whether n-step adjustments of trajectories work."""
        # n-step = 3
        gamma = 0.9
        obs = [1, 2, 3, 4, 5, 6, 7]
        actions = ["ac1", "ac2", "ac1", "ac1", "ac1", "ac2", "ac1"]
        rewards = [10.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0]
        dones = [0, 0, 0, 0, 0, 0, 1]
        next_obs = [2, 3, 4, 5, 6, 7, 8]
        batch = SampleBatch(
            {
                SampleBatch.OBS: obs,
                SampleBatch.ACTIONS: actions,
                SampleBatch.REWARDS: rewards,
                SampleBatch.DONES: dones,
                SampleBatch.NEXT_OBS: next_obs,
            }
        )
        adjust_nstep(3, gamma, batch)
        check(batch[SampleBatch.OBS], [1, 2, 3, 4, 5, 6, 7])
        check(
            batch[SampleBatch.ACTIONS],
            ["ac1", "ac2", "ac1", "ac1", "ac1", "ac2", "ac1"],
        )
        check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 8, 8, 8])
        check(batch[SampleBatch.DONES], [0, 0, 0, 0, 1, 1, 1])
        check(
            batch[SampleBatch.REWARDS], [91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0]
        )
Example #4
    def test_n_step_4(self):
        """Tests whether n-step adjustments of trajectories work."""
        # n-step = 4
        gamma = 0.99
        obs = np.arange(0, 7)
        actions = np.random.randint(-1, 3, size=(7,))
        check_actions = actions.copy()
        rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
        dones = [False, False, False, False, False, False, True]
        next_obs = np.arange(1, 8)
        batch = SampleBatch({
            SampleBatch.OBS: obs,
            SampleBatch.ACTIONS: actions,
            SampleBatch.REWARDS: rewards,
            SampleBatch.DONES: dones,
            SampleBatch.NEXT_OBS: next_obs,
        })
        adjust_nstep(4, gamma, batch)
        check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
        check(batch[SampleBatch.ACTIONS], check_actions)
        check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
        check(batch[SampleBatch.DONES],
              [False, False, False, True, True, True, True])
        check(batch[SampleBatch.REWARDS], [
            discount_cumsum(np.array(rewards[0:4]), gamma)[0],
            discount_cumsum(np.array(rewards[1:5]), gamma)[0],
            discount_cumsum(np.array(rewards[2:6]), gamma)[0],
            discount_cumsum(np.array(rewards[3:7]), gamma)[0],
            discount_cumsum(np.array(rewards[4:]), gamma)[0],
            discount_cumsum(np.array(rewards[5:]), gamma)[0],
            discount_cumsum(np.array(rewards[6:]), gamma)[0],
        ])
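The expected REWARDS in Example #4 (and in Example #6 further down) are written in terms of discount_cumsum, whose element 0 is simply the discounted return of the given reward slice. As a point of reference, a minimal implementation with that meaning (the name discount_cumsum_ref is hypothetical; RLlib ships its own discount_cumsum) would be:

import numpy as np

def discount_cumsum_ref(x, gamma):
    # y[t] = x[t] + gamma * x[t + 1] + gamma**2 * x[t + 2] + ...
    # computed right-to-left in a single pass.
    y = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y

So discount_cumsum(np.array(rewards[0:4]), gamma)[0] is just the 4-step discounted return starting at timestep 0, which is exactly what adjust_nstep(4, gamma, batch) writes into REWARDS[0].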
Example #5
    def postprocess_trajectory(
        self, sample_batch, other_agent_batches=None, episode=None
    ):
        # FIXME: Getting `done` from the info dict is required since
        #  agent-wise dones are not supported yet.
        sample_batch[SampleBatch.DONES] = self.get_done_from_info(
            sample_batch[SampleBatch.INFOS]
        )

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            adjust_nstep(self.config["n_step"], self.config["gamma"], sample_batch)

        return sample_batch
Example #6
    def test_n_step_from_same_obs_source_array(self):
        """Tests, whether n-step also works on a shared obs/new-obs array."""
        gamma = 0.99
        # The underlying observation data. Both obs and next_obs will
        # be references into that same np.array.
        underlying_obs = np.arange(0, 8)
        obs = underlying_obs[:7]
        next_obs = underlying_obs[1:]

        actions = np.random.randint(-1, 3, size=(7,))
        check_actions = actions.copy()
        rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
        dones = [False, False, False, False, False, False, True]

        batch = SampleBatch(
            {
                SampleBatch.OBS: obs,
                SampleBatch.ACTIONS: actions,
                SampleBatch.REWARDS: rewards,
                SampleBatch.DONES: dones,
                SampleBatch.NEXT_OBS: next_obs,
            }
        )
        adjust_nstep(4, gamma, batch)

        check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
        check(batch[SampleBatch.ACTIONS], check_actions)
        check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
        check(batch[SampleBatch.DONES], [False, False, False, True, True, True, True])
        check(
            batch[SampleBatch.REWARDS],
            [
                discount_cumsum(np.array(rewards[0:4]), gamma)[0],
                discount_cumsum(np.array(rewards[1:5]), gamma)[0],
                discount_cumsum(np.array(rewards[2:6]), gamma)[0],
                discount_cumsum(np.array(rewards[3:7]), gamma)[0],
                discount_cumsum(np.array(rewards[4:]), gamma)[0],
                discount_cumsum(np.array(rewards[5:]), gamma)[0],
                discount_cumsum(np.array(rewards[6:]), gamma)[0],
            ],
        )
Example #7
    def test_n_step_very_short_trajectory(self):
        """Tests whether n-step also works for very small trajectories."""
        gamma = 1.0
        obs = np.arange(0, 2)
        actions = np.random.randint(-100, 300, size=(2,))
        check_actions = actions.copy()
        rewards = [10.0, 100.0]
        next_obs = np.arange(1, 3)
        batch = SampleBatch({
            SampleBatch.OBS: obs,
            SampleBatch.ACTIONS: actions,
            SampleBatch.REWARDS: rewards,
            SampleBatch.DONES: [False, False],
            SampleBatch.NEXT_OBS: next_obs,
        })
        adjust_nstep(3, gamma, batch)
        check(batch[SampleBatch.OBS], [0, 1])
        check(batch[SampleBatch.ACTIONS], check_actions)
        check(batch[SampleBatch.DONES], [False, False])
        check(batch[SampleBatch.REWARDS], [10.0 + gamma * 100.0, 100.0])
        check(batch[SampleBatch.NEXT_OBS], [2, 2])
Example #8
    def test_n_step_malformed_dones(self):
        # Test bad input (trajectory has dones in middle).
        # Re-use same batch, but change dones.
        gamma = 1.0
        obs = np.arange(0, 7)
        actions = np.random.randint(-1, 3, size=(7,))
        rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
        next_obs = np.arange(1, 8)
        batch = SampleBatch({
            SampleBatch.OBS: obs,
            SampleBatch.ACTIONS: actions,
            SampleBatch.REWARDS: rewards,
            SampleBatch.DONES: [False, False, True, False, False, False, True],
            SampleBatch.NEXT_OBS: next_obs,
        })
        self.assertRaisesRegex(AssertionError, "Unexpected done in middle",
                               lambda: adjust_nstep(5, gamma, batch))