def test_action_prob(self):
    torch.manual_seed(1)
    states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
    with torch.no_grad():
        actions = self.policy(states)
    probs = self.policy(states, action=actions)
    tt.assert_almost_equal(probs, torch.tensor([0.204, 0.333, 0.217]), decimal=3)
def _step(self):
    states = State.from_list([env.state for env in self._env])
    rewards = torch.tensor(
        [env.reward for env in self._env],
        dtype=torch.float,
        device=self._env[0].device
    )
    actions = self._agent.act(states, rewards)
    for i, env in enumerate(self._env):
        self._step_env(i, env, actions[i])
def test_list(self):
    torch.manual_seed(1)
    states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
    dist = self.policy(states)
    actions = dist.sample()
    log_probs = dist.log_prob(actions)
    tt.assert_equal(actions, torch.tensor([1, 2, 1]))
    loss = -(torch.tensor([[1, 2, 3]]) * log_probs).mean()
    self.policy.reinforce(loss)
def test_list(self):
    model = nn.Linear(2, 2)
    net = nn.ListNetwork(model, (2,))
    features = torch.randn((4, 2))
    done = torch.tensor([1, 1, 0, 1], dtype=torch.uint8)
    out = net(State(features, done))
    tt.assert_almost_equal(
        out,
        torch.tensor([
            [0.0479387, -0.2268031],
            [0.2346841, 0.0743403],
            [0., 0.],
            [0.2204496, 0.086818]
        ])
    )
    features = torch.randn(3, 2)
    done = torch.tensor([1, 1, 1], dtype=torch.uint8)
    out = net(State(features, done))
    tt.assert_almost_equal(
        out,
        torch.tensor([
            [0.4234636, 0.1039939],
            [0.6514298, 0.3354351],
            [-0.2543002, -0.2041451]
        ])
    )
def test_eval(self):
    states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
    dist = self.policy.no_grad(states)
    tt.assert_almost_equal(
        dist.probs,
        torch.tensor([
            [0.352, 0.216, 0.432],
            [0.266, 0.196, 0.538],
            [0.469, 0.227, 0.304]
        ]),
        decimal=3
    )
    best = self.policy.eval(states)
    tt.assert_equal(best, torch.tensor([2, 2, 0]))
def _append_time_feature(self, state):
    # lazily initialize one timestep counter per environment in the batch
    if self.timestep is None:
        self.timestep = torch.zeros(len(state), device=state.features.device)
    # append the scaled timestep as an extra feature column
    features = torch.cat(
        (state.features, self.scale * self.timestep.view((-1, 1))),
        dim=1
    )
    state = State(features, state.mask, state.info)
    # increment the counters, resetting to 0 wherever an episode ended
    self.timestep = state.mask.float() * (self.timestep + 1)
    return state
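# Illustrative sketch (not library code) of what _append_time_feature does to
# a batch. All names below (features, timestep, scale) are local to this
# example.
import torch

features = torch.randn(3, 4)            # batch of 3 states, 4 features each
timestep = torch.tensor([5., 5., 5.])   # per-environment step counters
scale = 0.001                           # keeps the time column small
augmented = torch.cat((features, scale * timestep.view(-1, 1)), dim=1)
assert augmented.shape == (3, 5)        # one extra column of scaled time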
def _stack(self, state):
    if not self._frames:
        # first observation: fill the history with copies of it
        self._frames = [state.raw] * self._size
    else:
        # drop the oldest frame and append the newest
        self._frames = self._frames[1:] + [state.raw]
    if self._lazy:
        return LazyState(self._frames, state.mask, state.info)
    return State(torch.cat(self._frames, dim=1), state.mask, state.info)
def test_rollout(self):
    buffer = NStepBatchBuffer(2, 3, discount_factor=0.5)
    actions = torch.ones(3)
    states = State(torch.arange(0, 12))
    buffer.store(states[0:3], actions, torch.zeros(3))
    buffer.store(states[3:6], actions, torch.ones(3))
    buffer.store(states[6:9], actions, 4 * torch.ones(3))
    states, _, returns, next_states, lengths = buffer.sample(-1)

    expected_states = State(torch.arange(0, 6))
    expected_next_states = State(
        torch.cat((torch.arange(6, 9), torch.arange(6, 9))))
    expected_returns = torch.tensor([0.5, 0.5, 0.5, 1, 1, 1]).float()
    expected_lengths = torch.tensor([2, 2, 2, 1, 1, 1]).long()
    self.assert_states_equal(states, expected_states)
    self.assert_states_equal(next_states, expected_next_states)
    tt.assert_allclose(returns, expected_returns)
    tt.assert_equal(lengths, expected_lengths)
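# Worked check of the returns expected above (discount_factor = 0.5): the
# first three states complete a full 2-step rollout, G = 0 + 0.5 * 1 = 0.5,
# while the last three are truncated after 1 step, G = 1. Sketch only; the
# variable names are local to this example.
discount_factor = 0.5
two_step_return = 0 + discount_factor * 1   # 0.5, matches lengths == 2
one_step_return = 1                         # 1.0, matches lengths == 1
assert (two_step_return, one_step_return) == (0.5, 1)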
def eval(self, states):
    with torch.no_grad():
        # run the forward pass in eval mode, then restore the previous flag
        training = self.model.training
        self.model.eval()
        result = self.model(states.features.float())
        self.model.train(training)
        return State(
            result,
            mask=states.mask,
            info=states.info
        )
def test_converge(self):
    state = State(torch.randn(1, STATE_DIM))
    target = torch.tensor([1., 2., -1.])
    for _ in range(0, 1000):
        action = self.policy(state)
        loss = torch.abs(target - action).mean()
        self.policy.reinforce(-loss)
    self.assertTrue(loss < 1)
def test_reinforce_list(self):
    states = State(
        torch.randn(5, STATE_DIM),
        mask=torch.tensor([1, 1, 0, 1, 0])
    )
    result = self.v(states)
    tt.assert_almost_equal(
        result, torch.tensor([0.7053187, 0.3975691, 0., 0.2701665, 0.]))
    self.v.reinforce(torch.tensor([1, -1, 1, 1, 1]).float())
    result = self.v(states)
    tt.assert_almost_equal(
        result, torch.tensor([0.9732854, 0.5453826, 0., 0.4344811, 0.]))
def __call__(self, states):
    features = self.model(states.features.float())
    # detach the output so downstream losses stop here; the gradient is
    # pushed back through the cached graph later, during reinforce()
    out = features.detach()
    out.requires_grad = True
    self._cache.append(features)
    self._out.append(out)
    return State(
        out,
        mask=states.mask,
        info=states.info
    )
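# Minimal sketch of the detach-and-cache pattern used above, assuming a plain
# nn.Module; forward(), reinforce(), cache, and outs are names invented for
# this example, not the library API.
import torch
from torch import nn

model = nn.Linear(4, 4)
cache, outs = [], []

def forward(x):
    features = model(x)
    out = features.detach()     # downstream graphs stop at `out`
    out.requires_grad = True    # ...but still accumulate a gradient on it
    cache.append(features)
    outs.append(out)
    return out

def reinforce():
    # push the gradient collected on the detached output back
    # through the cached, still-connected graph
    features, out = cache.pop(0), outs.pop(0)
    features.backward(out.grad)

y = forward(torch.randn(2, 4))
y.sum().backward()              # fills y.grad; model params untouched so far
reinforce()                     # now model.weight.grad is populated
assert model.weight.grad is not None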
def test_converge(self):
    state = State(torch.randn(1, STATE_DIM))
    target = torch.tensor([1., 2., -1.])
    for _ in range(0, 100):
        action = self.policy.greedy(state)
        loss = torch.abs(target - action).mean()
        loss.backward()
        self.policy.step()
    self.assertTrue(loss < 0.1)
def test_run(self):
    state1 = State(torch.randn(1, STATE_DIM))
    dist1 = self.policy(state1)
    action1 = dist1.sample()
    log_prob1 = dist1.log_prob(action1)
    self.assertEqual(action1.item(), 0)

    state2 = State(torch.randn(1, STATE_DIM))
    dist2 = self.policy(state2)
    action2 = dist2.sample()
    log_prob2 = dist2.log_prob(action2)
    self.assertEqual(action2.item(), 2)

    loss = -(torch.tensor([-1, 1000000]) * torch.cat((log_prob1, log_prob2))).mean()
    self.policy.reinforce(loss)

    state3 = State(torch.randn(1, STATE_DIM))
    dist3 = self.policy(state3)
    action3 = dist3.sample()
    self.assertEqual(action3.item(), 2)
def test_converge(self):
    state = State(torch.randn(1, STATE_DIM))
    target = torch.tensor([0.25, 0.5, -0.5])
    for _ in range(0, 200):
        action, _ = self.policy(state)
        loss = ((target - action) ** 2).mean()
        loss.backward()
        self.policy.step()
    self.assertLess(loss, 0.2)
def test_scaling(self):
    self.space = Box(np.array([-10, -5, 100]), np.array([10, -2, 200]))
    self.policy = SoftDeterministicPolicy(
        self.model,
        self.optimizer,
        self.space
    )
    state = State(torch.randn(1, STATE_DIM))
    action, log_prob = self.policy(state)
    tt.assert_allclose(
        action, torch.tensor([[-3.09055, -4.752777, 188.98222]]))
    tt.assert_allclose(log_prob, torch.tensor([-0.397002]), rtol=1e-4)
def test_eval_list(self):
    states = State(
        torch.randn(5, STATE_DIM),
        mask=torch.tensor([1, 1, 0, 1, 0])
    )
    result = self.q.eval(states)
    tt.assert_almost_equal(
        result,
        torch.tensor([
            [-0.238509, -0.726287, -0.034026],
            [-0.35688755, -0.6612102, 0.34849477],
            [0., 0., 0.],
            [0.1944, -0.5536, -0.2345],
            [0., 0., 0.]
        ]),
        decimal=2
    )
def test_multi_reinforce(self):
    states = State(
        torch.randn(6, STATE_DIM),
        mask=torch.tensor([1, 1, 0, 1, 0, 0])
    )
    self.v(states[0:2])
    self.v(states[2:4])
    self.v(states[4:6])
    self.v.reinforce(torch.tensor([1, 2]).float())
    self.v.reinforce(torch.tensor([1, 1]).float())
    self.v.reinforce(torch.tensor([1, 2]).float())
    with self.assertRaises(Exception):
        self.v.reinforce(torch.tensor([1, 2]).float())
def _make_state(self, raw, done, info=None):
    if info is None:
        info = {"life_lost": False}
    elif "life_lost" not in info:
        info["life_lost"] = False
    return State(
        # convert the HWC numpy frame to a CHW tensor with a batch dimension
        torch.from_numpy(
            np.moveaxis(np.array(raw, dtype=self.state_space.dtype), -1, 0)
        ).unsqueeze(0).to(self._device),
        self._done_mask if done else self._not_done_mask,
        [info],
    )
def _train(self):
    if len(self._buffer) >= self._batch_size:
        states = State.from_list(self._features)
        _, _, returns, next_states, rollout_lengths = self._buffer.sample(
            self._batch_size)
        # n-step TD error: bootstrap from the value of the state reached
        # after rollout_lengths steps, discounted accordingly
        td_errors = (
            returns
            + (self.discount_factor ** rollout_lengths)
            * self.v.eval(self.features.eval(next_states))
            - self.v(states)
        )
        self.v.reinforce(td_errors)
        self.policy.reinforce(td_errors)
        self.features.reinforce()
        self._features = []
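# For reference, the update above is the standard n-step TD error,
#     td_error = G + discount ** n * V(s') - V(s),
# where n is the per-sample rollout length, so rollouts truncated at episode
# boundaries are discounted correctly. A quick numeric check with made-up
# values (not taken from the agent):
import torch
G, n, v_next, v_now, discount = 0.5, torch.tensor(2.), 1.0, 0.9, 0.5
td_error = G + discount ** n * v_next - v_now
assert torch.isclose(td_error, torch.tensor(-0.15))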
def test_reinforce_one(self):
    state = State(torch.randn(1, STATE_DIM))
    dist = self.policy(state)
    action = dist.sample()
    log_prob1 = dist.log_prob(action)
    loss = -log_prob1.mean()
    self.policy.reinforce(loss)
    dist = self.policy(state)
    log_prob2 = dist.log_prob(action)
    self.assertGreater(log_prob2.item(), log_prob1.item())
def test_run(self):
    states = State(torch.arange(0, 20))
    actions = torch.arange(0, 20).view((-1, 1))
    rewards = torch.arange(0, 20)
    expected_samples = State(
        torch.tensor([
            [0, 1, 2],
            [0, 1, 3],
            [5, 5, 5],
            [6, 6, 2],
            [7, 7, 7],
            [7, 8, 8],
            [7, 7, 7],
        ]))
    expected_weights = [
        [1.0000, 1.0000, 1.0000],
        [0.5659, 0.7036, 0.5124],
        [0.0631, 0.0631, 0.0631],
        [0.0631, 0.0631, 0.1231],
        [0.0631, 0.0631, 0.0631],
        [0.0776, 0.0631, 0.0631],
        [0.0866, 0.0866, 0.0866],
    ]
    actual_samples = []
    actual_weights = []
    for i in range(10):
        self.replay_buffer.store(
            states[i], actions[i], rewards[i], states[i + 1])
        if i > 2:
            sample = self.replay_buffer.sample(3)
            sample_states = sample[0].features
            self.replay_buffer.update_priorities(torch.randn(3))
            actual_samples.append(sample_states)
            actual_weights.append(sample[-1])
    actual_samples = State(torch.cat(actual_samples).view((-1, 3)))
    self.assert_states_equal(actual_samples, expected_samples)
    np.testing.assert_array_almost_equal(
        expected_weights, np.vstack(actual_weights), decimal=3)
def test_parallel(self):
    buffer = GeneralizedAdvantageBuffer(
        self.v,
        self.features,
        2,
        2,
        discount_factor=0.5,
        lam=0.5
    )
    actions = torch.ones(2)
    states = [
        State(torch.tensor([[0], [3]])),
        State(torch.tensor([[1], [4]])),
        State(torch.tensor([[2], [5]])),
    ]
    rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
    buffer.store(states[0], actions, rewards[0])
    buffer.store(states[1], actions, rewards[1])

    values = self.v.eval(self.features.eval(State.from_list(states))).view(3, -1)
    tt.assert_almost_equal(
        values,
        torch.tensor([
            [0.183, -1.408],
            [-0.348, -1.938],
            [-0.878, -2.468]
        ]),
        decimal=3
    )

    td_errors = torch.zeros(2, 2)
    td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
    td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
    tt.assert_almost_equal(
        td_errors,
        torch.tensor([[0.6436, 1.439], [1.909, 1.704]]),
        decimal=3
    )

    advantages = torch.zeros(2, 2)
    advantages[0] = td_errors[0] + 0.25 * td_errors[1]
    advantages[1] = td_errors[1]
    tt.assert_almost_equal(
        advantages,
        torch.tensor([[1.121, 1.865], [1.909, 1.704]]),
        decimal=3
    )

    _states, _actions, _advantages = buffer.advantages(states[2])
    tt.assert_almost_equal(_advantages, advantages.view(-1))
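# The recursion checked above is generalized advantage estimation (GAE),
#     A_t = delta_t + (discount_factor * lam) * A_{t+1},
# which with discount_factor = lam = 0.5 gives the 0.25 factor. A numeric
# check against the td_errors asserted in the test:
import torch
delta = torch.tensor([[0.6436, 1.439], [1.909, 1.704]])
advantage_0 = delta[0] + 0.25 * delta[1]
assert torch.allclose(advantage_0, torch.tensor([1.121, 1.865]), atol=1e-3)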
def test_backward(self):
    states = self.features(self.states)
    loss = torch.sum(states.features)
    loss.backward()
    self.features.reinforce()
    features = self.features(self.states)
    expected = State(
        torch.tensor([
            [-0.71, -1.2, -0.5],
            [-0.72, -1.03, -0.02],
            [-0.57, -1.3, -1.01]
        ]),
        mask=torch.tensor([1, 0, 1]),
    )
    self.assert_state_equal(features, expected)
def sample(self, batch_size):
    if batch_size > len(self):
        raise Exception("Not enough states for batch size!")
    # take the oldest batch_size entries from each FIFO list
    states = self._states[0:batch_size]
    actions = self._actions[0:batch_size]
    actions = torch.tensor(actions, device=actions[0].device)
    next_states = self._next_states[0:batch_size]
    rewards = self._rewards[0:batch_size]
    rewards = torch.tensor(rewards, device=rewards[0].device, dtype=torch.float)
    lengths = self._lengths[0:batch_size]
    lengths = torch.tensor(lengths, device=rewards[0].device, dtype=torch.float)
    # drop the sampled entries from the buffer
    self._states = self._states[batch_size:]
    self._actions = self._actions[batch_size:]
    self._next_states = self._next_states[batch_size:]
    self._rewards = self._rewards[batch_size:]
    self._lengths = self._lengths[batch_size:]
    states = State.from_list(states)
    next_states = State.from_list(next_states)
    return states, actions, rewards, next_states, lengths
def test_eval(self):
    state = State(torch.randn(1, STATE_DIM))
    dist = self.policy.no_grad(state)
    tt.assert_almost_equal(
        dist.mean, torch.tensor([[-0.229, 0.43, -0.058]]), decimal=3)
    tt.assert_almost_equal(
        dist.entropy(), torch.tensor([4.251]), decimal=3)
    best = self.policy.eval(state)
    tt.assert_almost_equal(
        best, torch.tensor([[-0.229, 0.43, -0.058]]), decimal=3)
def test_converge(self):
    state = State(torch.randn(1, STATE_DIM))
    target = torch.tensor([1., 2., -1.])
    for _ in range(0, 1000):
        dist = self.policy(state)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        error = ((target - action) ** 2).mean()
        loss = (error * log_prob).mean()
        self.policy.reinforce(loss)
    self.assertTrue(error < 1)
def act(self, state, reward):
    if not self._frames:
        # first observation: fill the history with copies of it
        self._frames = [state.raw] * self._size
    else:
        # drop the oldest frame and append the newest
        self._frames = self._frames[1:] + [state.raw]
    if self._lazy:
        state = LazyState(self._frames, state.mask, state.info)
    else:
        state = State(torch.cat(self._frames, dim=1), state.mask, state.info)
    return self.agent.act(state, reward)
def _summarize_transitions(self):
    sample_n = self.n_envs * self.n_steps
    sample_states = [None] * sample_n
    sample_actions = [None] * sample_n
    sample_next_states = [None] * sample_n

    for e in range(self.n_envs):
        next_state = self._states[self.n_steps][e]
        # sweep backwards through time so that a terminal state becomes
        # the next_state for every earlier step in the same episode
        for i in range(self.n_steps):
            t = self.n_steps - 1 - i
            idx = t * self.n_envs + e
            state = self._states[t][e]
            action = self._actions[t][e]
            sample_states[idx] = state
            sample_actions[idx] = action
            sample_next_states[idx] = next_state
            if not state.mask:
                next_state = state

    return (
        State.from_list(sample_states),
        torch.stack(sample_actions),
        State.from_list(sample_next_states)
    )
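# Layout sketch for the flattened samples above (illustrative values only):
# with n_steps = 2 and n_envs = 3, idx = t * n_envs + e is time-major,
#     [t0e0, t0e1, t0e2, t1e0, t1e1, t1e2],
# and because t runs backwards, a terminal state replaces next_state for all
# earlier steps of the same episode, so no target bootstraps across an
# episode boundary.
n_steps, n_envs = 2, 3
order = [t * n_envs + e for e in range(n_envs) for t in reversed(range(n_steps))]
assert sorted(order) == list(range(n_steps * n_envs))  # every slot is filled once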
def test_run(self):
    states = torch.arange(0, 20)
    actions = torch.arange(0, 20)
    rewards = torch.arange(0, 20)
    expected_samples = torch.tensor([
        [0, 0, 0],
        [1, 1, 0],
        [0, 1, 1],
        [3, 0, 0],
        [1, 4, 4],
        [1, 2, 4],
        [2, 4, 3],
        [4, 7, 4],
        [7, 4, 6],
        [6, 5, 6]
    ])
    expected_weights = np.ones((10, 3))
    actual_samples = []
    actual_weights = []
    for i in range(10):
        state = State(states[i].unsqueeze(0), torch.tensor([1]))
        next_state = State(states[i + 1].unsqueeze(0), torch.tensor([1]))
        self.replay_buffer.store(state, actions[i], rewards[i], next_state)
        sample = self.replay_buffer.sample(3)
        actual_samples.append(sample[0].features)
        actual_weights.append(sample[-1])
    tt.assert_equal(
        torch.cat(actual_samples).view(expected_samples.shape),
        expected_samples)
    np.testing.assert_array_equal(
        expected_weights, np.vstack(actual_weights))