def _reshape(self, minibatch, weights):
    # Each sample is a (state, action, next_state) triple; the reward is
    # carried on the next_state.
    states = State.array([sample[0] for sample in minibatch])
    if torch.is_tensor(minibatch[0][1]):
        actions = torch.stack([sample[1] for sample in minibatch])
    else:
        actions = torch.tensor([sample[1] for sample in minibatch], device=self.device)
    next_states = State.array([sample[2] for sample in minibatch])
    return (states, actions, next_states.reward, next_states, weights)
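# Illustrative sketch (not from the source) of the two action branches handled
# by _reshape above: tensor actions are stacked to preserve their shape, while
# plain Python scalars are wrapped into a fresh tensor.
import torch

tensor_actions = [torch.tensor([0.1]), torch.tensor([0.2])]
scalar_actions = [0, 1]
assert torch.stack(tensor_actions).shape == (2, 1)
assert torch.tensor(scalar_actions).shape == (2,)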
def test_simple(self):
    buffer = GeneralizedAdvantageBuffer(
        self.v, self.features, 2, 1, discount_factor=0.5, lam=0.5
    )
    actions = torch.ones(1)
    states = State.array([
        State({'observation': torch.tensor([float(x)])})
        for x in range(3)
    ])
    rewards = torch.tensor([1., 2, 4])
    buffer.store(states[0], actions, rewards[0])
    buffer.store(states[1], actions, rewards[1])

    values = self.v.eval(self.features.eval(states))
    tt.assert_almost_equal(values, torch.tensor([0.1826, -0.3476, -0.8777]), decimal=3)

    # one-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    td_errors = torch.zeros(2)
    td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
    td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
    tt.assert_almost_equal(td_errors, torch.tensor([0.6436, 1.909]), decimal=3)

    # GAE: A_t = delta_t + (gamma * lam) * A_{t+1}, with gamma * lam = 0.25
    advantages = torch.zeros(2)
    advantages[0] = td_errors[0] + 0.25 * td_errors[1]
    advantages[1] = td_errors[1]
    tt.assert_almost_equal(advantages, torch.tensor([1.121, 1.909]), decimal=3)

    _states, _actions, _advantages = buffer.advantages(states[2])
    tt.assert_almost_equal(_advantages, advantages)
    tt.assert_equal(_actions, torch.tensor([1, 1]))
def step(self, actions):
    states = []
    actions = actions.cpu().detach().numpy()
    for sub_env, action in zip(self._envs, actions):
        # auto-reset: if a sub-environment has finished, restart it
        # instead of stepping it
        state = sub_env.reset() if sub_env.state.done else sub_env.step(action)
        states.append(state)
    self._state = State.array(states)
    return self._state
def forward(self, states, actions=None):
    x = self.fc(states.observation)
    x = x.view((-1, 64, 7, 7))
    x = self.deconv(x)
    x = x.view((-1, self.num_actions, FRAMES, 84, 84))
    if actions is not None:
        # select the predicted frames for the chosen action of each sample
        x = x[torch.arange(len(x)), actions]
        return states.update('observation', states.as_output(x))
    # no actions given: return the predictions for every action
    x = states.as_output(x)
    return State.array([states.update('observation', _x) for _x in x])
def test_parallel(self):
    buffer = GeneralizedAdvantageBuffer(
        self.v, self.features, 2, 2, discount_factor=0.5, lam=0.5
    )
    actions = torch.ones(2)

    def make_states(x, y):
        return State.array([
            State({'observation': torch.tensor([float(x)])}),
            State({'observation': torch.tensor([float(y)])})
        ])

    states = State.array([
        make_states(0, 3),
        make_states(1, 4),
        make_states(2, 5),
    ])
    self.assertEqual(states.shape, (3, 2))
    rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
    buffer.store(states[0], actions, rewards[0])
    buffer.store(states[1], actions, rewards[1])

    values = self.v.eval(self.features.eval(states)).view(3, -1)
    tt.assert_almost_equal(values, torch.tensor([
        [0.183, -1.408],
        [-0.348, -1.938],
        [-0.878, -2.468]
    ]), decimal=3)

    # one-step TD errors, computed per environment
    td_errors = torch.zeros(2, 2)
    td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
    td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
    tt.assert_almost_equal(td_errors, torch.tensor([
        [0.6436, 1.439],
        [1.909, 1.704]
    ]), decimal=3)

    # GAE with gamma * lam = 0.25
    advantages = torch.zeros(2, 2)
    advantages[0] = td_errors[0] + 0.25 * td_errors[1]
    advantages[1] = td_errors[1]
    tt.assert_almost_equal(advantages, torch.tensor([
        [1.121, 1.865],
        [1.909, 1.704]
    ]), decimal=3)

    _states, _actions, _advantages = buffer.advantages(states[2])
    tt.assert_almost_equal(_advantages, advantages.view(-1))
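# A quick illustration of the ordering asserted at the end of test_parallel:
# advantages.view(-1) flattens row-major, so the per-environment advantages of
# each step stay adjacent (time-major layout). Values are the expected ones
# from the test above.
import torch

advantages = torch.tensor([[1.121, 1.865], [1.909, 1.704]])
print(advantages.view(-1))  # tensor([1.1210, 1.8650, 1.9090, 1.7040])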
def _summarize_transitions(self):
    sample_n = self.n_envs * self.n_steps
    sample_states = [None] * sample_n
    sample_actions = [None] * sample_n
    sample_next_states = [None] * sample_n
    for e in range(self.n_envs):
        next_state = self._states[self.n_steps][e]
        # walk each environment's trajectory backwards so that a terminal
        # state propagates as the "next state" of the transitions before it
        for i in range(self.n_steps):
            t = self.n_steps - 1 - i
            idx = t * self.n_envs + e
            state = self._states[t][e]
            action = self._actions[t][e]
            sample_states[idx] = state
            sample_actions[idx] = action
            sample_next_states[idx] = next_state
            if not state.mask:
                next_state = state
    return (
        State.array(sample_states),
        torch.stack(sample_actions),
        State.array(sample_next_states)
    )
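# Small sanity check (illustrative only) of the flattened layout used by
# _summarize_transitions: idx = t * n_envs + e enumerates transitions
# time-major, so the n_envs transitions of each step are contiguous.
n_envs, n_steps = 2, 3
indices = [t * n_envs + e for t in range(n_steps) for e in range(n_envs)]
assert indices == list(range(n_envs * n_steps))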
def advantages(self, states):
    if len(self) < self._batch_size:
        raise Exception("Not enough states received!")
    self._states.append(states)
    states = State.array(self._states[0:self.n_steps + 1])
    actions = torch.cat(self._actions[:self.n_steps], dim=0)
    rewards = torch.stack(self._rewards[:self.n_steps])
    _values = self.v.target(self.features.target(states))
    values = _values[0:self.n_steps]
    next_values = _values[1:]
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    td_errors = rewards + self.gamma * next_values - values
    advantages = self._compute_advantages(td_errors)
    self._clear_buffers()
    return (states[0:-1].flatten(), actions, advantages.view(-1))
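# A minimal sketch of the backwards GAE recursion that advantages() delegates
# to _compute_advantages. That helper is not shown here, so this standalone
# version is an assumption about its behavior, not the library's code:
#
#     A[t] = td_errors[t] + gamma * lam * A[t + 1]
#
# With discount_factor = lam = 0.5, gamma * lam = 0.25, matching the tests.
import torch

def compute_gae(td_errors, gamma=0.5, lam=0.5):
    advantages = torch.zeros_like(td_errors)
    running = torch.zeros_like(td_errors[0])
    for t in reversed(range(len(td_errors))):
        running = td_errors[t] + gamma * lam * running
        advantages[t] = running
    return advantages

# reproduces the hand-computed values from test_simple:
# compute_gae(torch.tensor([0.6436, 1.909]))  # ~ tensor([1.121, 1.909])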
def _terminal(self, state, reward):
    self._rewards.append(reward)
    features = State.array(self._features)
    rewards = torch.tensor(self._rewards, device=features.device)
    log_pis = torch.stack(self._log_pis)
    self._trajectories.append((features, rewards, log_pis))
    self._current_batch_size += len(features)
    self._features = []
    self._rewards = []
    self._log_pis = []
    if self._current_batch_size >= self.min_batch_size:
        self._train()
    # have to return something
    return self.policy.no_grad(self.features.no_grad(state)).sample()
def _aggregate_states(self):
    return State.array([env.state for env in self._envs])
def reset(self):
    self._state = State.array([sub_env.reset() for sub_env in self._envs])
    return self._state