import unittest
import torch
from torch import nn
import torch_testing as tt
# NOTE: the module paths below are assumed from the autonomous-learning-library
# layout that defines State, FeatureNetwork, VNetwork, and the advantage buffers.
from all.environments import State
from all.approximation import FeatureNetwork, VNetwork
from all.memory import GeneralizedAdvantageBuffer, NStepAdvantageBuffer


class GeneralizedAdvantageBufferTest(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(1)
        self.features = FeatureNetwork(nn.Linear(1, 2), None)
        self.v = VNetwork(nn.Linear(2, 1), None)

    def _compute_expected_advantages(self, states, returns, next_states, lengths):
        return (
            returns
            + (0.5 ** lengths) * self.v.eval(self.features.eval(next_states))
            - self.v.eval(self.features.eval(states))
        )

    def test_simple(self):
        buffer = GeneralizedAdvantageBuffer(
            self.v, self.features, 2, 1, discount_factor=0.5, lam=0.5
        )
        actions = torch.ones((1))
        states = State(torch.arange(0, 3).unsqueeze(1))
        rewards = torch.tensor([1., 2, 4])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        values = self.v.eval(self.features.eval(states))
        tt.assert_almost_equal(values, torch.tensor([0.1826, -0.3476, -0.8777]), decimal=3)

        # one-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), with gamma = 0.5
        td_errors = torch.zeros(2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors, torch.tensor([0.6436, 1.909]), decimal=3)

        # GAE recursion: A_t = delta_t + (gamma * lam) * A_{t+1}, so gamma * lam = 0.25
        advantages = torch.zeros(2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages, torch.tensor([1.121, 1.909]), decimal=3)

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages)
        tt.assert_equal(_actions, torch.tensor([1, 1]))

    def test_parallel(self):
        buffer = GeneralizedAdvantageBuffer(
            self.v, self.features, 2, 2, discount_factor=0.5, lam=0.5
        )
        actions = torch.ones((2))
        states = [
            State(torch.tensor([[0], [3]])),
            State(torch.tensor([[1], [4]])),
            State(torch.tensor([[2], [5]])),
        ]
        rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        values = self.v.eval(self.features.eval(State.from_list(states))).view(3, -1)
        tt.assert_almost_equal(
            values,
            torch.tensor([
                [0.183, -1.408],
                [-0.348, -1.938],
                [-0.878, -2.468],
            ]),
            decimal=3
        )

        td_errors = torch.zeros(2, 2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(
            td_errors,
            torch.tensor([[0.6436, 1.439], [1.909, 1.704]]),
            decimal=3
        )

        advantages = torch.zeros(2, 2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(
            advantages,
            torch.tensor([[1.121, 1.865], [1.909, 1.704]]),
            decimal=3
        )

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages.view(-1))

    def assert_array_equal(self, actual, expected):
        for i, exp in enumerate(expected):
            self.assertEqual(
                actual[i], exp,
                msg=("\nactual: %s\nexpected: %s") % (actual, expected)
            )

    def assert_states_equal(self, actual, expected):
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
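

# Illustrative sketch (an addition, not part of the original tests): the hand-computed
# values in GeneralizedAdvantageBufferTest follow the GAE recursion
#     A_t = delta_t + (discount_factor * lam) * A_{t+1}
# which, with discount_factor = lam = 0.5, gives the 0.25 factor used above.
# This hypothetical helper applies that recursion to a single rollout with no
# terminal states; it is not used by the buffer or the tests.
def _reference_gae(td_errors, discount_factor=0.5, lam=0.5):
    advantages = torch.zeros_like(td_errors)
    running = 0.
    for t in reversed(range(len(td_errors))):
        # accumulate the exponentially-weighted sum of future TD errors
        running = td_errors[t] + discount_factor * lam * running
        advantages[t] = running
    return advantages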


class NStepAdvantageBufferTest(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(1)
        self.features = FeatureNetwork(nn.Linear(1, 2), None)
        self.v = VNetwork(nn.Linear(2, 1), None)

    def _compute_expected_advantages(self, states, returns, next_states, lengths):
        # n-step advantage: truncated return plus the discounted value of the state
        # reached after `lengths` steps, minus the value of the starting state
        return (
            returns
            + (0.5 ** lengths) * self.v.eval(self.features.eval(next_states))
            - self.v.eval(self.features.eval(states))
        )

    def test_rollout(self):
        buffer = NStepAdvantageBuffer(self.v, self.features, 2, 3, discount_factor=0.5)
        actions = torch.ones((3))
        states = State(torch.arange(0, 12).unsqueeze(1))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        states, _, advantages = buffer.advantages(states[6:9])

        expected_states = State(torch.arange(0, 6).unsqueeze(1))
        expected_next_states = State(
            torch.cat((torch.arange(6, 9), torch.arange(6, 9))).unsqueeze(1)
        )
        expected_returns = torch.tensor([0.5, 0.5, 0.5, 1, 1, 1]).float()
        expected_lengths = torch.tensor([2., 2, 2, 1, 1, 1])

        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(
                expected_states, expected_returns, expected_next_states, expected_lengths
            )
        )

    def test_rollout_with_nones(self):
        buffer = NStepAdvantageBuffer(self.v, self.features, 3, 3, discount_factor=0.5)
        done = torch.ones(12)
        done[5] = 0
        done[7] = 0
        done[9] = 0
        states = State(torch.arange(0, 12).unsqueeze(1), done)
        actions = torch.ones((3))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        buffer.store(states[6:9], actions, 2 * torch.ones(3))
        states, actions, advantages = buffer.advantages(states[9:12])

        expected_states = State(torch.arange(0, 9).unsqueeze(1), done[0:9])
        expected_next_done = torch.zeros(9)
        expected_next_done[5] = 1
        expected_next_done[7] = 1
        expected_next_done[8] = 1
        expected_next_states = State(
            torch.tensor([9, 7, 5, 9, 7, 11, 9, 10, 11]).unsqueeze(1),
            expected_next_done
        )
        expected_returns = torch.tensor([1, 0.5, 0, 2, 1, 2, 2, 2, 2]).float()
        expected_lengths = torch.tensor([3, 2, 1, 2, 1, 2, 1, 1, 1]).float()

        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(
                expected_states, expected_returns, expected_next_states, expected_lengths
            )
        )

    def test_multi_rollout(self):
        buffer = NStepAdvantageBuffer(self.v, self.features, 2, 2, discount_factor=0.5)
        raw_states = State(torch.arange(0, 12).unsqueeze(1))
        actions = torch.ones((2))

        buffer.store(raw_states[0:2], actions, torch.ones(2))
        buffer.store(raw_states[2:4], actions, torch.ones(2))
        states, actions, advantages = buffer.advantages(raw_states[4:6])
        expected_states = State(torch.arange(0, 4).unsqueeze(1))
        expected_returns = torch.tensor([1.5, 1.5, 1, 1])
        expected_next_states = State(torch.tensor([4, 5, 4, 5]).unsqueeze(1))
        expected_lengths = torch.tensor([2., 2, 1, 1])
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(
                expected_states, expected_returns, expected_next_states, expected_lengths
            )
        )

        buffer.store(raw_states[4:6], actions, torch.ones(2))
        buffer.store(raw_states[6:8], actions, torch.ones(2))
        states, actions, advantages = buffer.advantages(raw_states[8:10])
        expected_states = State(torch.arange(4, 8).unsqueeze(1))
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(
                expected_states,
                torch.tensor([1.5, 1.5, 1, 1]),
                State(torch.tensor([8, 9, 8, 9]).unsqueeze(1)),
                torch.tensor([2., 2, 1, 1])
            )
        )

    def assert_array_equal(self, actual, expected):
        for i, exp in enumerate(expected):
            self.assertEqual(
                actual[i], exp,
                msg=("\nactual: %s\nexpected: %s") % (actual, expected)
            )

    def assert_states_equal(self, actual, expected):
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
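

# Standard unittest entry point (added for convenience) so the module can also be
# run directly rather than only through a test runner.
if __name__ == "__main__":
    unittest.main()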