def test_dqn_n_step(self):
    obs = [1, 2, 3, 4, 5, 6, 7]
    actions = ["a", "b", "a", "a", "a", "b", "a"]
    rewards = [10.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0]
    new_obs = [2, 3, 4, 5, 6, 7, 8]
    dones = [0, 0, 0, 0, 0, 0, 1]
    _adjust_nstep(3, 0.9, obs, actions, rewards, new_obs, dones)
    self.assertEqual(obs, [1, 2, 3, 4, 5, 6, 7])
    self.assertEqual(actions, ["a", "b", "a", "a", "a", "b", "a"])
    self.assertEqual(new_obs, [4, 5, 6, 7, 8, 8, 8])
    self.assertEqual(dones, [0, 0, 0, 0, 1, 1, 1])
    self.assertEqual(rewards,
                     [91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0])
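
# For context, a minimal sketch of the n-step adjustment the test above
# exercises, reconstructed from the expected values (the actual
# _adjust_nstep in the source may differ in details such as input
# validation). Transition i absorbs the discounted rewards of up to
# n_step - 1 following steps and inherits the next-observation and done
# flag of the last step it reaches:
def _adjust_nstep_sketch(n_step, gamma, obs, actions, rewards, new_obs,
                         dones):
    traj_length = len(rewards)
    for i in range(traj_length):
        for j in range(1, n_step):
            if i + j < traj_length:
                # Fold reward r_{i+j}, discounted by gamma**j, into step i.
                # Forward iteration means rewards[i + j] is still unmodified
                # when it is read here.
                rewards[i] += gamma**j * rewards[i + j]
                # Step i now "ends" where step i + j ended.
                new_obs[i] = new_obs[i + j]
                dones[i] = dones[i + j]


# E.g. rewards[0] becomes 10.0 + 0.9 * 0.0 + 0.9**2 * 100.0 = 91.0,
# matching the first expected reward in the test.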
def postprocess_trajectory(self,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    # FIXME: Deriving the dones from the infos is required since
    # agent-wise dones are not supported yet.
    sample_batch[SampleBatch.DONES] = self.get_done_from_info(
        sample_batch[SampleBatch.INFOS])

    # N-step Q adjustments.
    if self.config["n_step"] > 1:
        _adjust_nstep(self.config["n_step"], self.config["gamma"],
                      sample_batch[SampleBatch.CUR_OBS],
                      sample_batch[SampleBatch.ACTIONS],
                      sample_batch[SampleBatch.REWARDS],
                      sample_batch[SampleBatch.NEXT_OBS],
                      sample_batch[SampleBatch.DONES])
    return sample_batch
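
# get_done_from_info is not shown here; a minimal sketch of what it might
# look like, assuming each info dict carries a per-agent "done" flag (both
# this body and the "done" key are assumptions, not the actual
# implementation; in the source it would be a method on the policy):
def get_done_from_info_sketch(infos):
    # Fall back to False when an info dict lacks the flag, so the batch
    # keeps one done entry per transition.
    return [bool(info.get("done", False)) for info in infos]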