Example #1
    def test_dqn_n_step(self):
        obs = [1, 2, 3, 4, 5, 6, 7]
        actions = ["a", "b", "a", "a", "a", "b", "a"]
        rewards = [10.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0]
        new_obs = [2, 3, 4, 5, 6, 7, 8]
        dones = [0, 0, 0, 0, 0, 0, 1]
        _adjust_nstep(3, 0.9, obs, actions, rewards, new_obs, dones)
        self.assertEqual(obs, [1, 2, 3, 4, 5, 6, 7])
        self.assertEqual(actions, ["a", "b", "a", "a", "a", "b", "a"])
        self.assertEqual(new_obs, [4, 5, 6, 7, 8, 8, 8])
        self.assertEqual(dones, [0, 0, 0, 0, 1, 1, 1])
        self.assertEqual(rewards,
                         [91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0])
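The test above exercises `_adjust_nstep`, which rewrites a trajectory fragment in place so that each transition carries the discounted sum of the next `n_step` rewards and bootstraps from the observation (and done flag) reached after those steps, truncated at the end of the episode. For example, the first adjusted reward is 10.0 + 0.9 * 0.0 + 0.9**2 * 100.0 = 91.0, while the last two transitions only look one and zero steps ahead because the episode terminates there. The helper itself is not shown on this page; the following is a minimal sketch that reproduces the values asserted in the test, not necessarily the library's exact implementation.

def _adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
    """Rewrite the trajectory columns in place to encode n-step returns.

    Sketch only: matches the behavior asserted in the test above.
    """
    # A done flag may only appear at the very end of the fragment.
    assert not any(dones[:-1]), "Unexpected done in middle of trajectory"

    traj_length = len(rewards)
    for i in range(traj_length):
        for j in range(1, n_step):
            if i + j < traj_length:
                # Fold the j-step-ahead reward into step i and advance the
                # bootstrap observation / done flag accordingly.
                new_obs[i] = new_obs[i + j]
                dones[i] = dones[i + j]
                rewards[i] += gamma**j * rewards[i + j]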
Example #2
    def postprocess_trajectory(self,
                               sample_batch,
                               other_agent_batches=None,
                               episode=None):
        # FIXME: Reading the done flag from the info dict is required since
        # per-agent dones are not supported yet.
        sample_batch.data["dones"] = self.get_done_from_info(
            sample_batch.data["infos"])

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            _adjust_nstep(self.config["n_step"], self.config["gamma"],
                          sample_batch[SampleBatch.CUR_OBS],
                          sample_batch[SampleBatch.ACTIONS],
                          sample_batch[SampleBatch.REWARDS],
                          sample_batch[SampleBatch.NEXT_OBS],
                          sample_batch[SampleBatch.DONES])

        return sample_batch
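For illustration, here is a hypothetical run of the n-step branch above, using plain Python lists standing in for the SampleBatch columns (CUR_OBS, ACTIONS, REWARDS, NEXT_OBS, DONES); the data values are made up, and `_adjust_nstep` is assumed to behave as in the sketch after Example #1.

# Made-up columns, mimicking what postprocess_trajectory adjusts
# when config["n_step"] > 1 (here n_step=3, gamma=0.9).
n_step, gamma = 3, 0.9
obs = [0, 1, 2, 3]
actions = [0, 0, 1, 1]
rewards = [1.0, 1.0, 1.0, 1.0]
new_obs = [1, 2, 3, 4]
dones = [0, 0, 0, 1]

_adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones)

# After the call, each transition carries the discounted sum of up to
# three future rewards and points at the observation reached after them
# (float values rounded):
#   rewards ~= [2.71, 2.71, 1.9, 1.0]
#   new_obs == [3, 4, 4, 4]
#   dones   == [0, 1, 1, 1]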