def _reshape(self, minibatch, weights):
    # Unpack a list of (state, action, next_state) samples into batched tensors.
    states = State.array([sample[0] for sample in minibatch])
    if torch.is_tensor(minibatch[0][1]):
        actions = torch.stack([sample[1] for sample in minibatch])
    else:
        actions = torch.tensor([sample[1] for sample in minibatch],
                               device=self.device)
    next_states = State.array([sample[2] for sample in minibatch])
    return (states, actions, next_states.reward, next_states, weights)
    def test_simple(self):
        buffer = GeneralizedAdvantageBuffer(
            self.v,
            self.features,
            2,  # n_steps
            1,  # n_envs
            discount_factor=0.5,
            lam=0.5
        )
        actions = torch.ones((1))
        states = State.array([State({'observation': torch.tensor([float(x)])}) for x in range(3)])
        rewards = torch.tensor([1., 2, 4])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        values = self.v.eval(self.features.eval(states))
        tt.assert_almost_equal(values, torch.tensor([0.1826, -0.3476, -0.8777]), decimal=3)

        td_errors = torch.zeros(2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors, torch.tensor([0.6436, 1.909]), decimal=3)

        advantages = torch.zeros(2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages, torch.tensor([1.121, 1.909]), decimal=3)

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages)
        tt.assert_equal(_actions, torch.tensor([1, 1]))
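The hand-computed values above follow the generalized advantage estimation recursion A_t = δ_t + (γλ)·A_{t+1}. Below is a minimal, standalone sketch of that discounted sum; the helper name and usage are illustrative only, not part of the library:

import torch

def discounted_advantages(td_errors, discount_factor, lam):
    # Accumulate TD errors backwards in time: A_t = delta_t + (gamma * lam) * A_{t+1}.
    advantages = torch.zeros_like(td_errors)
    running = torch.zeros_like(td_errors[-1])
    for t in reversed(range(len(td_errors))):
        running = td_errors[t] + discount_factor * lam * running
        advantages[t] = running
    return advantages

# Reproduces the values checked above:
# discounted_advantages(torch.tensor([0.6436, 1.909]), 0.5, 0.5) ≈ tensor([1.121, 1.909])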
def step(self, actions):
    # Step each sub-environment; automatically reset any that finished an episode.
    states = []
    actions = actions.cpu().detach().numpy()
    for sub_env, action in zip(self._envs, actions):
        state = sub_env.reset() if sub_env.state.done else sub_env.step(action)
        states.append(state)
    self._state = State.array(states)
    return self._state
Example #4
def forward(self, states, actions=None):
    # Predict the next observation for every action (or only for the given actions).
    x = self.fc(states.observation)
    x = x.view((-1, 64, 7, 7))
    x = self.deconv(x)
    x = x.view((-1, self.num_actions, FRAMES, 84, 84))
    if actions is not None:
        # Select the prediction corresponding to each taken action.
        x = x[torch.arange(len(x)), actions]
        return states.update('observation', states.as_output(x))
    x = states.as_output(x)
    return State.array([states.update('observation', _x) for _x in x])
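The forward() above assumes self.fc emits a 64*7*7 vector and self.deconv upsamples the 7x7 map to 84x84 with num_actions * FRAMES output channels. A hypothetical, shape-compatible sketch of such modules (layer choices and sizes are assumptions, not the original model):

import torch.nn as nn

FRAMES = 4          # assumed frame-stack depth
num_actions = 6     # assumed action-space size
feature_dim = 512   # assumed input feature size

fc = nn.Sequential(
    nn.Linear(feature_dim, 64 * 7 * 7),
    nn.ReLU(),
)
deconv = nn.Sequential(
    nn.ConvTranspose2d(64, 64, kernel_size=3, stride=3),    # 7x7   -> 21x21
    nn.ReLU(),
    nn.ConvTranspose2d(64, 32, kernel_size=2, stride=2),    # 21x21 -> 42x42
    nn.ReLU(),
    # num_actions * FRAMES output channels so the final view to
    # (-1, num_actions, FRAMES, 84, 84) in forward() works.
    nn.ConvTranspose2d(32, num_actions * FRAMES, kernel_size=2, stride=2),  # 42x42 -> 84x84
)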
    def test_parallel(self):
        buffer = GeneralizedAdvantageBuffer(
            self.v,
            self.features,
            2,  # n_steps
            2,  # n_envs
            discount_factor=0.5,
            lam=0.5
        )
        actions = torch.ones((2))

        def make_states(x, y):
            return State.array([
                State({'observation': torch.tensor([float(x)])}),
                State({'observation': torch.tensor([float(y)])})
            ])

        states = State.array([
            make_states(0, 3),
            make_states(1, 4),
            make_states(2, 5),
        ])
        self.assertEqual(states.shape, (3, 2))
        rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        values = self.v.eval(self.features.eval(states)).view(3, -1)
        tt.assert_almost_equal(values, torch.tensor([
            [0.183, -1.408],
            [-0.348, -1.938],
            [-0.878, -2.468]
        ]), decimal=3)

        td_errors = torch.zeros(2, 2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors, torch.tensor([
            [0.6436, 1.439],
            [1.909, 1.704]
        ]), decimal=3)

        advantages = torch.zeros(2, 2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages, torch.tensor([
            [1.121, 1.865],
            [1.909, 1.704]
        ]), decimal=3)

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages.view(-1))
    def _summarize_transitions(self):
        sample_n = self.n_envs * self.n_steps
        sample_states = [None] * sample_n
        sample_actions = [None] * sample_n
        sample_next_states = [None] * sample_n

        for e in range(self.n_envs):
            next_state = self._states[self.n_steps][e]
            # Iterate backwards through time so each transition can reuse the
            # next_state determined for the following step.
            for i in range(self.n_steps):
                t = self.n_steps - 1 - i
                idx = t * self.n_envs + e
                state = self._states[t][e]
                action = self._actions[t][e]

                sample_states[idx] = state
                sample_actions[idx] = action
                sample_next_states[idx] = next_state

                # If this state is terminal, earlier transitions must not look
                # past the episode boundary.
                if not state.mask:
                    next_state = state

        return (State.array(sample_states), torch.stack(sample_actions),
                State.array(sample_next_states))
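The flattened index idx = t * n_envs + e used above orders samples by time step first and environment second; a small illustration with assumed sizes:

# Illustration only (n_steps and n_envs values are assumed).
n_steps, n_envs = 2, 3
layout = [[t * n_envs + e for e in range(n_envs)] for t in range(n_steps)]
assert layout == [[0, 1, 2], [3, 4, 5]]  # all samples for t=0 come first, then t=1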
    def advantages(self, states):
        if len(self) < self._batch_size:
            raise Exception("Not enough states received!")

        # Append the bootstrap state, then evaluate values for all n_steps + 1 states.
        self._states.append(states)
        states = State.array(self._states[0:self.n_steps + 1])
        actions = torch.cat(self._actions[:self.n_steps], dim=0)
        rewards = torch.stack(self._rewards[:self.n_steps])
        _values = self.v.target(self.features.target(states))
        values = _values[0:self.n_steps]
        next_values = _values[1:]
        td_errors = rewards + self.gamma * next_values - values
        advantages = self._compute_advantages(td_errors)
        self._clear_buffers()
        return (states[0:-1].flatten(), actions, advantages.view(-1))
Example #8
    def _terminal(self, state, reward):
        self._rewards.append(reward)
        features = State.array(self._features)
        rewards = torch.tensor(self._rewards, device=features.device)
        log_pis = torch.stack(self._log_pis)
        self._trajectories.append((features, rewards, log_pis))
        self._current_batch_size += len(features)
        self._features = []
        self._rewards = []
        self._log_pis = []

        if self._current_batch_size >= self.min_batch_size:
            self._train()

        # An action must still be returned, even on the terminal step.
        return self.policy.no_grad(self.features.no_grad(state)).sample()
def _aggregate_states(self):
    return State.array([env.state for env in self._envs])
def reset(self):
    self._state = State.array([sub_env.reset() for sub_env in self._envs])
    return self._state