def test_run(self):
    state1 = State(torch.randn(1, STATE_DIM))
    dist1 = self.policy(state1)
    action1 = dist1.sample()
    log_prob1 = dist1.log_prob(action1)
    self.assertEqual(action1.item(), 0)

    state2 = State(torch.randn(1, STATE_DIM))
    dist2 = self.policy(state2)
    action2 = dist2.sample()
    log_prob2 = dist2.log_prob(action2)
    self.assertEqual(action2.item(), 2)

    # Weight the second action's log-probability so heavily that the policy
    # all but guarantees it on the next sample.
    loss = -(torch.tensor([-1, 1000000]) * torch.cat((log_prob1, log_prob2))).mean()
    self.policy.reinforce(loss)

    state3 = State(torch.randn(1, STATE_DIM))
    dist3 = self.policy(state3)
    action3 = dist3.sample()
    self.assertEqual(action3.item(), 2)
def test_from_dict(self):
    observation = torch.randn(3, 4)
    state = State({
        'observation': observation,
        'done': True,
        'mask': 1,
        'reward': 5.
    })
    tt.assert_equal(state.observation, observation)
    self.assertEqual(state.done, True)
    self.assertEqual(state.mask, 1.)
    self.assertEqual(state.reward, 5.)
def test_reinforce_one(self):
    state = State(torch.randn(1, STATE_DIM))
    dist = self.policy(state)
    action = dist.sample()
    log_prob1 = dist.log_prob(action)
    loss = -log_prob1.mean()
    self.policy.reinforce(loss)

    # After one REINFORCE step, the sampled action should be more likely.
    dist = self.policy(state)
    log_prob2 = dist.log_prob(action)
    self.assertGreater(log_prob2.item(), log_prob1.item())
def _reset(self):
    state = State.from_gym(
        self._reset_orig(),
        dtype=self.observation_space.dtype,
        device="cuda"
    )
    obs = state.observation.cpu().numpy()
    # Append the pilot's suggested action (one-hot) to the observation.
    pilot_action = onehot_encode(self.pilot_policy(state))
    return np.concatenate((obs, pilot_action))
def test_parallel(self):
    buffer = GeneralizedAdvantageBuffer(
        self.v,
        self.features,
        2,
        2,
        discount_factor=0.5,
        lam=0.5
    )
    actions = torch.ones((2))

    def make_states(x, y):
        return State.array([
            State({'observation': torch.tensor([float(x)])}),
            State({'observation': torch.tensor([float(y)])})
        ])

    states = State.array([
        make_states(0, 3),
        make_states(1, 4),
        make_states(2, 5),
    ])
    self.assertEqual(states.shape, (3, 2))
    rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
    buffer.store(states[0], actions, rewards[0])
    buffer.store(states[1], actions, rewards[1])

    values = self.v.eval(self.features.eval(states)).view(3, -1)
    tt.assert_almost_equal(values, torch.tensor([
        [0.183, -1.408],
        [-0.348, -1.938],
        [-0.878, -2.468]
    ]), decimal=3)

    td_errors = torch.zeros(2, 2)
    td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
    td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
    tt.assert_almost_equal(td_errors, torch.tensor([
        [0.6436, 1.439],
        [1.909, 1.704]
    ]), decimal=3)

    # With discount_factor = lam = 0.5, each TD error propagates one step
    # back at gamma * lam = 0.25 strength.
    advantages = torch.zeros(2, 2)
    advantages[0] = td_errors[0] + 0.25 * td_errors[1]
    advantages[1] = td_errors[1]
    tt.assert_almost_equal(advantages, torch.tensor([
        [1.121, 1.865],
        [1.909, 1.704]
    ]), decimal=3)

    _states, _actions, _advantages = buffer.advantages(states[2])
    tt.assert_almost_equal(_advantages, advantages.view(-1))
def test_run(self):
    states = StateArray(torch.arange(0, 20), (20,), reward=torch.arange(-1, 19).float())
    actions = torch.arange(0, 20).view((-1, 1))
    expected_samples = State(torch.tensor([
        [0, 1, 2],
        [0, 1, 3],
        [5, 5, 5],
        [6, 6, 2],
        [7, 7, 7],
        [7, 8, 8],
        [7, 7, 7],
    ]))
    expected_weights = [
        [1.0000, 1.0000, 1.0000],
        [0.5659, 0.7036, 0.5124],
        [0.0631, 0.0631, 0.0631],
        [0.0631, 0.0631, 0.1231],
        [0.0631, 0.0631, 0.0631],
        [0.0776, 0.0631, 0.0631],
        [0.0866, 0.0866, 0.0866],
    ]
    actual_samples = []
    actual_weights = []
    for i in range(10):
        self.replay_buffer.store(states[i], actions[i], states[i + 1])
        if i > 2:
            sample = self.replay_buffer.sample(3)
            sample_states = sample[0].observation
            self.replay_buffer.update_priorities(torch.randn(3))
            actual_samples.append(sample_states)
            actual_weights.append(sample[-1])
    actual_samples = State(torch.cat(actual_samples).view((-1, 3)))
    self.assert_states_equal(actual_samples, expected_samples)
    np.testing.assert_array_almost_equal(
        expected_weights, np.vstack(actual_weights), decimal=3)
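# NOTE: For context on expected_weights above: in standard prioritized
# experience replay (Schaul et al.), each sampled transition carries an
# importance-sampling correction w_i = (N * P(i))^(-beta), normalized by the
# largest weight in the buffer. Below is a minimal sketch of that computation
# assuming proportional prioritization; the standalone helper is illustrative,
# not this buffer's internals.
import numpy as np

def importance_weights(priorities, sampled_indices, alpha=0.6, beta=0.4):
    # Sampling probability P(i) is proportional to priority^alpha.
    probs = priorities ** alpha
    probs = probs / probs.sum()
    n = len(priorities)
    # w_i = (N * P(i))^(-beta), normalized so the largest possible weight is 1.
    weights = (n * probs[sampled_indices]) ** (-beta)
    return weights / (n * probs.min()) ** (-beta)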
def _step(self, action):
    state = State.from_gym(
        self._step_orig(_convert(disc_to_cont(action))),
        dtype=self.observation_space.dtype,
        device="cuda"
    )
    obs = state.observation.cpu().numpy()
    r = state.reward
    done = state.done
    info = {}
    # Append the pilot's suggested action (one-hot) to the observation.
    pilot_action = onehot_encode(self.pilot_policy(state))
    obs = np.concatenate((obs, pilot_action))
    return obs, r, done, info
def test_converge(self):
    state = State(torch.randn(1, STATE_DIM))
    target = torch.tensor([1., 2., -1.])

    for _ in range(0, 1000):
        dist = self.policy(state)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        error = ((target - action) ** 2).mean()
        loss = (error * log_prob).mean()
        self.policy.reinforce(loss)

    self.assertTrue(error < 1)
def step(self, action):
    observation, reward, done, info = self._env.last()
    # PettingZoo's AEC API requires agents that are already done to step None.
    if self._env.dones[self._env.agent_selection]:
        action = None
    if torch.is_tensor(action):
        self._env.step(action.item())
    else:
        self._env.step(action)
    observation, reward, done, info = self._env.last()
    return State.from_gym(
        (observation.reshape((1, 84, 84)), reward, done, info),
        device=self.device,
        dtype=np.uint8
    )
def test_eval(self):
    state = State(torch.randn(1, STATE_DIM))
    dist = self.policy.no_grad(state)
    tt.assert_almost_equal(dist.mean, torch.tensor([[-0.233, 0.459, -0.058]]), decimal=3)
    tt.assert_almost_equal(dist.entropy(), torch.tensor([4.251]), decimal=3)
    best = self.policy.eval(state)
    tt.assert_almost_equal(best, torch.tensor([[-0.233, 0.459, -0.058]]), decimal=3)
def _summarize_transitions(self):
    sample_n = self.n_envs * self.n_steps
    sample_states = [None] * sample_n
    sample_actions = [None] * sample_n
    sample_next_states = [None] * sample_n

    for e in range(self.n_envs):
        next_state = self._states[self.n_steps][e]
        # Walk backwards through time: each transition bootstraps from the
        # final state of the rollout segment, unless a terminal state
        # (mask == 0) occurs first, in which case the terminal state is used
        # instead, truncating the bootstrap at the episode boundary.
        for i in range(self.n_steps):
            t = self.n_steps - 1 - i
            idx = t * self.n_envs + e
            state = self._states[t][e]
            action = self._actions[t][e]
            sample_states[idx] = state
            sample_actions[idx] = action
            sample_next_states[idx] = next_state
            if not state.mask:
                next_state = state

    return (
        State.array(sample_states),
        torch.stack(sample_actions),
        State.array(sample_next_states)
    )
def test_run(self):
    np.random.seed(1)
    random.seed(1)
    torch.manual_seed(1)
    self.replay_buffer = ExperienceReplayBuffer(5)

    states = torch.arange(0, 20)
    actions = torch.arange(0, 20).view((-1, 1))
    rewards = torch.arange(0, 20)
    expected_samples = torch.tensor([
        [0, 0, 0],
        [1, 1, 0],
        [0, 1, 1],
        [3, 0, 0],
        [1, 4, 4],
        [1, 2, 4],
        [2, 4, 3],
        [4, 7, 4],
        [7, 4, 6],
        [6, 5, 6],
    ])
    expected_weights = np.ones((10, 3))
    actual_samples = []
    actual_weights = []
    for i in range(10):
        state = State(states[i])
        next_state = State(states[i + 1], reward=rewards[i])
        self.replay_buffer.store(state, actions[i], next_state)
        sample = self.replay_buffer.sample(3)
        actual_samples.append(sample[0].observation)
        actual_weights.append(sample[-1])
    tt.assert_equal(
        torch.cat(actual_samples).view(expected_samples.shape),
        expected_samples
    )
    np.testing.assert_array_equal(expected_weights, np.vstack(actual_weights))
def advantages(self, states):
    if len(self) < self._batch_size:
        raise Exception("Not enough states received!")

    self._states.append(states)
    states = State.array(self._states[0:self.n_steps + 1])
    actions = torch.cat(self._actions[:self.n_steps], dim=0)
    rewards = torch.stack(self._rewards[:self.n_steps])

    # Evaluate all n_steps + 1 states in one pass, then split into the values
    # of the current states and the values of their successors.
    _values = self.v.target(self.features.target(states))
    values = _values[0:self.n_steps]
    next_values = _values[1:]

    td_errors = rewards + self.gamma * next_values - values
    advantages = self._compute_advantages(td_errors)
    self._clear_buffers()

    return (states[0:-1].flatten(), actions, advantages.view(-1))
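# NOTE: _compute_advantages is not shown here. A minimal sketch of the
# standard GAE backward recursion it would implement, A_t = td_t +
# (gamma * lam) * A_{t+1}; the helper name and standalone tensors below are
# illustrative assumptions, not the library's actual code.
import torch

def compute_gae(td_errors, gamma, lam):
    # td_errors: (n_steps, n_envs) tensor of one-step TD errors.
    advantages = torch.zeros_like(td_errors)
    advantages[-1] = td_errors[-1]
    for t in reversed(range(len(td_errors) - 1)):
        advantages[t] = td_errors[t] + gamma * lam * advantages[t + 1]
    return advantages

# With gamma = lam = 0.5, this reduces to the 0.25 factor checked in
# test_parallel above.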
def test_reinforce(self):
    def loss(log_probs):
        return -log_probs.mean()

    states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
    actions = self.policy.no_grad(states).sample()

    # notice the values increase with each successive reinforce
    log_probs = self.policy(states).log_prob(actions)
    tt.assert_almost_equal(log_probs, torch.tensor([-0.84, -0.62, -0.757]), decimal=3)

    self.policy.reinforce(loss(log_probs))
    log_probs = self.policy(states).log_prob(actions)
    tt.assert_almost_equal(log_probs, torch.tensor([-0.811, -0.561, -0.701]), decimal=3)

    self.policy.reinforce(loss(log_probs))
    log_probs = self.policy(states).log_prob(actions)
    tt.assert_almost_equal(log_probs, torch.tensor([-0.785, -0.51, -0.651]), decimal=3)
def _terminal(self, state, reward):
    self._rewards.append(reward)
    features = State.array(self._features)
    rewards = torch.tensor(self._rewards, device=features.device)
    log_pis = torch.stack(self._log_pis)
    self._trajectories.append((features, rewards, log_pis))
    self._current_batch_size += len(features)
    self._features = []
    self._rewards = []
    self._log_pis = []

    if self._current_batch_size >= self.min_batch_size:
        self._train()

    # have to return something
    return self.policy.no_grad(self.features.no_grad(state)).sample()
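# NOTE: _train is not shown. One common shape for it, given the
# (features, rewards, log_pis) trajectories collected above, is a vanilla
# policy-gradient step over discounted returns-to-go. The sketch below is an
# assumption (including the absence of a value baseline), not the library's
# actual update.
import torch

def _train_sketch(trajectories, policy, gamma=0.99):
    losses = []
    for _features, rewards, log_pis in trajectories:
        # Discounted return-to-go for each step of the episode.
        returns = torch.zeros_like(rewards)
        running = 0.
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running
        losses.append(-(returns * log_pis).mean())
    policy.reinforce(torch.stack(losses).mean())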
def test_single_env(self):
    state = State(torch.randn(4))
    self.agent.act(state)
    tt.assert_allclose(
        self.test_agent.last_state.observation,
        torch.tensor([0.3923, -0.2236, -0.3195, -1.2050, 0.]),
        atol=1e-04
    )
    self.agent.act(state)
    tt.assert_allclose(
        self.test_agent.last_state.observation,
        torch.tensor([0.3923, -0.2236, -0.3195, -1.2050, 1e-3]),
        atol=1e-04
    )
    self.agent.act(state)
    tt.assert_allclose(
        self.test_agent.last_state.observation,
        torch.tensor([0.3923, -0.2236, -0.3195, -1.2050, 2e-3]),
        atol=1e-04
    )
def reset(self) -> State:
    """
    Reset the environment and return a new initial state.

    Returns
    -------
    State
        The initial state for the next episode.
    """
    # environment "state"
    self._state = State({
        'observation': torch.tensor([self._start_state]),  # pylint: disable=not-callable
        'reward': 0,
        'done': False
    })
    self._reward = 0
    self._action = None
    self._timestep = 0
    self._done = False
    return self._state
def test_target_net(self):
    torch.manual_seed(2)
    model = nn.Sequential(nn.Linear(1, 1))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    q = QNetwork(model, optimizer, target=FixedTarget(3))
    inputs = State(torch.tensor([1.]))

    def loss(policy_value):
        target = policy_value - 1
        return smooth_l1_loss(policy_value, target.detach())

    policy_value = q(inputs)
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value.item(), -0.008584141731262207)
    np.testing.assert_equal(target_value, -0.008584141731262207)

    q.reinforce(loss(policy_value))
    policy_value = q(inputs)
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value.item(), -0.20858412981033325)
    np.testing.assert_equal(target_value, -0.008584141731262207)

    q.reinforce(loss(policy_value))
    policy_value = q(inputs)
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value.item(), -0.4085841178894043)
    np.testing.assert_equal(target_value, -0.008584141731262207)

    # After the third update, the target network syncs with the online network.
    q.reinforce(loss(policy_value))
    policy_value = q(inputs)
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value.item(), -0.6085841655731201)
    np.testing.assert_equal(target_value, -0.6085841655731201)

    q.reinforce(loss(policy_value))
    policy_value = q(inputs)
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value.item(), -0.8085841536521912)
    np.testing.assert_equal(target_value, -0.6085841655731201)
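# NOTE: The assertions above pin down the behavior under test: the target
# value stays frozen for three reinforce() calls, then snaps to the online
# value. A minimal sketch of that fixed-target pattern; this illustrative
# class is not the library's FixedTarget implementation.
import copy

import torch

class FixedTargetSketch:
    """Hold a frozen copy of a model; refresh it every `frequency` updates."""

    def __init__(self, model, frequency):
        self._model = model
        self._target = copy.deepcopy(model)
        self._frequency = frequency
        self._updates = 0

    def __call__(self, *inputs):
        with torch.no_grad():
            return self._target(*inputs)

    def update(self):
        self._updates += 1
        if self._updates % self._frequency == 0:
            self._target.load_state_dict(self._model.state_dict())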
def test_auto_mask_true(self):
    observation = torch.randn(3, 4)
    state = State({'observation': observation, 'done': True, 'reward': 5.})
    self.assertEqual(state.mask, 0.)
def reset(self):
    self._state = State.array([sub_env.reset() for sub_env in self._envs])
    return self._state
def test_multi_action(self):
    states = State(torch.randn(3, STATE_DIM))
    actions = self.policy(states).sample()
    tt.assert_equal(actions, torch.tensor([2, 2, 0]))
def test_step_one(self):
    state = State(torch.randn(1, STATE_DIM))
    self.policy(state)
    self.policy.step()
def _aggregate_states(self):
    return State.array([env.state for env in self._envs])
def test_forward_state(self):
    inputs = State({
        'observation': torch.tensor([1, 2, 3])
    })
    outputs = self.model(inputs)
    self.assertEqual(inputs, outputs)
def reset(self):
    # Wrap the raw observation in an (observation, reward, done, info) tuple.
    state = self._env.reset(), 0., False, None
    self._state = State.from_gym(
        state,
        dtype=self._env.observation_space.dtype,
        device=self._device
    )
    return self._state
def step(self, action):
    self._state = State.from_gym(
        self._env.step(self._convert(action)),
        dtype=self._env.observation_space.dtype,
        device=self._device
    )
    return self._state
def test_apply_mask(self):
    observation = torch.randn(3, 4)
    state = State.from_gym((observation, 0., True, {}))
    tt.assert_equal(state.apply_mask(observation), torch.zeros(3, 4))
def test_as_output(self):
    observation = torch.randn(3, 4)
    state = State(observation)
    tensor = torch.randn(1, 5, 3)
    self.assertEqual(state.as_output(tensor).shape, (5, 3))
def test_as_input(self):
    observation = torch.randn(3, 4)
    state = State(observation)
    self.assertEqual(state.as_input('observation').shape, (1, 3, 4))
def test_to_device(self):
    observation = torch.randn(3, 4)
    state = State(observation, device=torch.device('cpu'))
    state_cpu = state.to("cpu")
    self.assertTrue(torch.equal(state['observation'], state_cpu['observation']))
    self.assertFalse(state is state_cpu)