def test_action_prob(self):
     torch.manual_seed(1)
     states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
     with torch.no_grad():
         actions = self.policy(states)
     probs = self.policy(states, action=actions)
     tt.assert_almost_equal(probs,
                            torch.tensor([0.204, 0.333, 0.217]),
                            decimal=3)
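Every snippet on this page builds a State from a feature tensor, an optional done-mask, and optional per-element info. As rough orientation only, a minimal stand-in that mimics just that layout might look like the hypothetical MiniState below (the real State class comes from the library these tests exercise):

import torch

class MiniState:
    def __init__(self, raw, mask=None, info=None):
        # raw: (batch, ...) tensor of features; mask: 0/1 "not done" flags
        self.features = raw
        self.mask = torch.ones(len(raw), dtype=torch.uint8) if mask is None else mask
        self.info = info if info is not None else [None] * len(raw)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # slicing yields a view of the same batch, as the tests below do with states[0:3]
        return MiniState(self.features[idx], self.mask[idx], self.info)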
Example #2
    def _step(self):
        states = State.from_list([env.state for env in self._env])
        rewards = torch.tensor([env.reward for env in self._env],
                               dtype=torch.float,
                               device=self._env[0].device)
        actions = self._agent.act(states, rewards)

        for i, env in enumerate(self._env):
            self._step_env(i, env, actions[i])
Example #3
 def test_list(self):
     torch.manual_seed(1)
     states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
     dist = self.policy(states)
     actions = dist.sample()
     log_probs = dist.log_prob(actions)
     tt.assert_equal(actions, torch.tensor([1, 2, 1]))
     loss = -(torch.tensor([[1, 2, 3]]) * log_probs).mean()
     self.policy.reinforce(loss)
    def test_list(self):
        model = nn.Linear(2, 2)
        net = nn.ListNetwork(model, (2, ))
        features = torch.randn((4, 2))
        done = torch.tensor([1, 1, 0, 1], dtype=torch.uint8)
        out = net(State(features, done))
        tt.assert_almost_equal(
            out,
            torch.tensor([[0.0479387, -0.2268031], [0.2346841, 0.0743403],
                          [0., 0.], [0.2204496, 0.086818]]))

        features = torch.randn(3, 2)
        done = torch.tensor([1, 1, 1], dtype=torch.uint8)
        out = net(State(features, done))
        tt.assert_almost_equal(
            out,
            torch.tensor([[0.4234636, 0.1039939], [0.6514298, 0.3354351],
                          [-0.2543002, -0.2041451]]))
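The expected output above has its third row zeroed because that element's done flag is 0. A self-contained sketch of that masking step, using a hypothetical mask_output helper rather than the library's network:

import torch

def mask_output(out, mask):
    # zero the rows whose mask entry is 0, broadcasting over the feature dimension
    return out * mask.to(out.dtype).unsqueeze(-1)

out = torch.randn(4, 2)
mask = torch.tensor([1, 1, 0, 1], dtype=torch.uint8)
print(mask_output(out, mask))  # the third row comes back as [0., 0.]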
Example #5
 def test_eval(self):
     states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
     dist = self.policy.no_grad(states)
     tt.assert_almost_equal(dist.probs, torch.tensor([
         [0.352, 0.216, 0.432],
         [0.266, 0.196, 0.538],
         [0.469, 0.227, 0.304]
     ]), decimal=3)
     best = self.policy.eval(states)
     tt.assert_equal(best, torch.tensor([2, 2, 0]))
Example #6
 def _append_time_feature(self, state):
     if self.timestep is None:
         self.timestep = torch.zeros(len(state),
                                     device=state.features.device)
     features = torch.cat(
         (state.features, self.scale * self.timestep.view(-1, 1)),
         dim=1)
     state = State(features, state.mask, state.info)
     self.timestep = state.mask.float() * (self.timestep + 1)
     return state
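The concatenation above appends one extra column, the scaled step counter, to every state. A shape-only sketch of the same operation with made-up numbers:

import torch

features = torch.randn(3, 4)            # 3 environments, 4 features each
timestep = torch.tensor([0., 5., 9.])   # per-environment step counters
scale = 0.01
augmented = torch.cat((features, scale * timestep.view(-1, 1)), dim=1)
print(augmented.shape)                  # torch.Size([3, 5])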
Example #7
    def _stack(self, state):
        if not self._frames:
            self._frames = [state.raw] * self._size
        else:
            self._frames = self._frames[1:] + [state.raw]

        if self._lazy:
            return LazyState(self._frames, state.mask, state.info)

        return State(torch.cat(self._frames, dim=1), state.mask, state.info)
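In the non-lazy branch, stacking amounts to concatenating the most recent observations along the channel dimension. A minimal sketch, assuming single-channel 84x84 image frames:

import torch

frames = [torch.randn(1, 1, 84, 84) for _ in range(4)]   # (batch, channel, height, width)
stacked = torch.cat(frames, dim=1)
print(stacked.shape)                                      # torch.Size([1, 4, 84, 84])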
Example #8
    def test_rollout(self):
        buffer = NStepBatchBuffer(2, 3, discount_factor=0.5)
        actions = torch.ones((3))
        states = State(torch.arange(0, 12))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        buffer.store(states[6:9], actions, 4 * torch.ones(3))
        states, _, returns, next_states, lengths = buffer.sample(-1)

        expected_states = State(torch.arange(0, 6))
        expect_next_states = State(
            torch.cat((torch.arange(6, 9), torch.arange(6, 9))))
        expected_returns = torch.tensor([0.5, 0.5, 0.5, 1, 1, 1]).float()
        expected_lengths = torch.tensor([2, 2, 2, 1, 1, 1]).long()

        self.assert_states_equal(states, expected_states)
        self.assert_states_equal(next_states, expect_next_states)
        tt.assert_allclose(returns, expected_returns)
        tt.assert_equal(lengths, expected_lengths)
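Reading the expected values off this test: with discount_factor=0.5 and rewards 0, 1, and 4 passed to the three store() calls, the truncated n-step returns appear to sum the reward stored with each batch of states plus the discounted reward of the following step, bootstrapping from the last stored states. A by-hand check:

discount = 0.5
rewards = [0.0, 1.0, 4.0]                        # rewards passed to the three store() calls

return_t0 = rewards[0] + discount * rewards[1]   # 0 + 0.5 * 1 = 0.5, rollout length 2
return_t1 = rewards[1]                           # 1.0, rollout length 1
print(return_t0, return_t1)                      # 0.5 1.0 -> matches [0.5]*3 + [1.0]*3 above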
 def eval(self, states):
     with torch.no_grad():
         # remember the current mode, run the forward pass in eval mode,
         # then restore whatever mode the model was in before
         training = self.model.training
         self.model.eval()
         result = self.model(states.features.float())
         self.model.train(training)
         return State(
             result,
             mask=states.mask,
             info=states.info
         )
Example #10
    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([1., 2., -1.])

        for _ in range(0, 1000):
            action = self.policy(state)
            loss = torch.abs(target - action).mean()
            self.policy.reinforce(-loss)

        self.assertTrue(loss < 1)
Example #11
 def test_reinforce_list(self):
     states = State(torch.randn(5, STATE_DIM),
                    mask=torch.tensor([1, 1, 0, 1, 0]))
     result = self.v(states)
     tt.assert_almost_equal(
         result, torch.tensor([0.7053187, 0.3975691, 0., 0.2701665, 0.]))
     self.v.reinforce(torch.tensor([1, -1, 1, 1, 1]).float())
     result = self.v(states)
     tt.assert_almost_equal(
         result, torch.tensor([0.9732854, 0.5453826, 0., 0.4344811, 0.]))
 def __call__(self, states):
     features = self.model(states.features.float())
     out = features.detach()
     out.requires_grad = True
     self._cache.append(features)
     self._out.append(out)
     return State(
         out,
         mask=states.mask,
         info=states.info
     )
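__call__ above caches the graph-attached features and returns a detached leaf with requires_grad=True, so that a later reinforce()-style call can push the gradients accumulated on that leaf back through the cached graph. A standalone sketch of the same detach-and-reattach trick:

import torch

model = torch.nn.Linear(3, 2)
x = torch.randn(4, 3)

features = model(x)          # attached to the model's graph (the "cache")
out = features.detach()
out.requires_grad = True     # downstream computation builds on this leaf

downstream_loss = out.sum()            # stand-in for a value or policy loss
downstream_loss.backward()             # fills out.grad; model parameters untouched so far
features.backward(gradient=out.grad)   # splice the gradient back into the model
print(model.weight.grad is not None)   # True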
    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([1., 2., -1.])

        for _ in range(0, 100):
            action = self.policy.greedy(state)
            loss = torch.abs(target - action).mean()
            loss.backward()
            self.policy.step()

        self.assertTrue(loss < 0.1)
Example #14
    def test_run(self):
        state1 = State(torch.randn(1, STATE_DIM))
        dist1 = self.policy(state1)
        action1 = dist1.sample()
        log_prob1 = dist1.log_prob(action1)
        self.assertEqual(action1.item(), 0)

        state2 = State(torch.randn(1, STATE_DIM))
        dist2 = self.policy(state2)
        action2 = dist2.sample()
        log_prob2 = dist2.log_prob(action2)
        self.assertEqual(action2.item(), 2)

        loss = -(torch.tensor([-1, 1000000]) * torch.cat((log_prob1, log_prob2))).mean()
        self.policy.reinforce(loss)

        state3 = State(torch.randn(1, STATE_DIM))
        dist3 = self.policy(state3)
        action3 = dist3.sample()
        self.assertEqual(action3.item(), 2)
    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([0.25, 0.5, -0.5])

        for _ in range(0, 200):
            action, _ = self.policy(state)
            loss = ((target - action) ** 2).mean()
            loss.backward()
            self.policy.step()

        self.assertLess(loss, 0.2)
 def test_scaling(self):
     self.space = Box(np.array([-10, -5, 100]), np.array([10, -2, 200]))
     self.policy = SoftDeterministicPolicy(
         self.model,
         self.optimizer,
         self.space
     )
     state = State(torch.randn(1, STATE_DIM))
     action, log_prob = self.policy(state)
     tt.assert_allclose(action, torch.tensor([[-3.09055, -4.752777, 188.98222]]))
     tt.assert_allclose(log_prob, torch.tensor([-0.397002]), rtol=1e-4)
Example #17
 def test_eval_list(self):
     states = State(torch.randn(5, STATE_DIM),
                    mask=torch.tensor([1, 1, 0, 1, 0]))
     result = self.q.eval(states)
     tt.assert_almost_equal(result,
                            torch.tensor(
                                [[-0.238509, -0.726287, -0.034026],
                                 [-0.35688755, -0.6612102, 0.34849477],
                                 [0., 0., 0.], [0.1944, -0.5536, -0.2345],
                                 [0., 0., 0.]]),
                            decimal=2)
Example #18
 def test_multi_reinforce(self):
     # six states, matching the six-element mask and the states[4:6] slice below
     states = State(torch.randn(6, STATE_DIM),
                    mask=torch.tensor([1, 1, 0, 1, 0, 0]))
     self.v(states[0:2])
     self.v(states[2:4])
     self.v(states[4:6])
     self.v.reinforce(torch.tensor([1, 2]).float())
     self.v.reinforce(torch.tensor([1, 1]).float())
     self.v.reinforce(torch.tensor([1, 2]).float())
     with self.assertRaises(Exception):
         self.v.reinforce(torch.tensor([1, 2]).float())
Example #19
 def _make_state(self, raw, done, info=None):
     if info is None:
         info = {"life_lost": False}
     elif "life_lost" not in info:
         info["life_lost"] = False
     return State(
         torch.from_numpy(
             np.moveaxis(np.array(raw, dtype=self.state_space.dtype), -1,
                         0)).unsqueeze(0).to(self._device),
         self._done_mask if done else self._not_done_mask,
         [info],
     )
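The moveaxis/unsqueeze chain above turns a height-width-channel frame into a batched channel-first tensor. A shape-only sketch:

import numpy as np
import torch

frame = np.zeros((84, 84, 3), dtype=np.uint8)    # HWC, as many environments return frames
chw = np.moveaxis(frame, -1, 0)                  # -> (3, 84, 84), channel first
batched = torch.from_numpy(chw).unsqueeze(0)     # -> (1, 3, 84, 84)
print(batched.shape)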
Example #20
 def _train(self):
     if len(self._buffer) >= self._batch_size:
         states = State.from_list(self._features)
         _, _, returns, next_states, rollout_lengths = self._buffer.sample(
             self._batch_size)
         td_errors = (returns + (self.discount_factor**rollout_lengths) *
                      self.v.eval(self.features.eval(next_states)) -
                      self.v(states))
         self.v.reinforce(td_errors)
         self.policy.reinforce(td_errors)
         self.features.reinforce()
         self._features = []
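The td_errors expression above is the n-step TD error: the stored return plus the discounted bootstrap value of the next state, minus the current value estimate. A numeric sketch with made-up values:

discount_factor = 0.99
returns, rollout_length = 1.5, 4     # n-step return and the number of steps it covers
v_next, v_current = 0.8, 1.0         # bootstrapped and current value estimates
td_error = returns + discount_factor ** rollout_length * v_next - v_current
print(td_error)                      # about 1.27; positive errors reinforce both v and the policy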
    def test_reinforce_one(self):
        state = State(torch.randn(1, STATE_DIM))
        dist = self.policy(state)
        action = dist.sample()
        log_prob1 = dist.log_prob(action)
        loss = -log_prob1.mean()
        self.policy.reinforce(loss)

        dist = self.policy(state)
        log_prob2 = dist.log_prob(action)

        self.assertGreater(log_prob2.item(), log_prob1.item())
Example #22
    def test_run(self):
        states = State(torch.arange(0, 20))
        actions = torch.arange(0, 20).view((-1, 1))
        rewards = torch.arange(0, 20)
        expected_samples = State(
            torch.tensor([
                [0, 1, 2],
                [0, 1, 3],
                [5, 5, 5],
                [6, 6, 2],
                [7, 7, 7],
                [7, 8, 8],
                [7, 7, 7],
            ]))
        expected_weights = [
            [1.0000, 1.0000, 1.0000],
            [0.5659, 0.7036, 0.5124],
            [0.0631, 0.0631, 0.0631],
            [0.0631, 0.0631, 0.1231],
            [0.0631, 0.0631, 0.0631],
            [0.0776, 0.0631, 0.0631],
            [0.0866, 0.0866, 0.0866],
        ]
        actual_samples = []
        actual_weights = []
        for i in range(10):
            self.replay_buffer.store(states[i], actions[i], rewards[i],
                                     states[i + 1])
            if i > 2:
                sample = self.replay_buffer.sample(3)
                sample_states = sample[0].features
                self.replay_buffer.update_priorities(torch.randn(3))
                actual_samples.append(sample_states)
                actual_weights.append(sample[-1])

        actual_samples = State(torch.cat(actual_samples).view((-1, 3)))
        self.assert_states_equal(actual_samples, expected_samples)
        np.testing.assert_array_almost_equal(expected_weights,
                                             np.vstack(actual_weights),
                                             decimal=3)
    def test_parallel(self):
        buffer = GeneralizedAdvantageBuffer(self.v,
                                            self.features,
                                            2,
                                            2,
                                            discount_factor=0.5,
                                            lam=0.5)
        actions = torch.ones((2))
        states = [
            State(torch.tensor([[0], [3]])),
            State(torch.tensor([[1], [4]])),
            State(torch.tensor([[2], [5]])),
        ]
        rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        values = self.v.eval(self.features.eval(State.from_list(states))).view(
            3, -1)
        tt.assert_almost_equal(values,
                               torch.tensor([[0.183, -1.408], [-0.348, -1.938],
                                             [-0.878, -2.468]]),
                               decimal=3)

        td_errors = torch.zeros(2, 2)
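        # one-step TD errors: delta_t = r_t + discount_factor * V(s_{t+1}) - V(s_t), with discount_factor = 0.5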
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors,
                               torch.tensor([[0.6436, 1.439], [1.909, 1.704]]),
                               decimal=3)

        advantages = torch.zeros(2, 2)
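        # generalized advantage: A_t = delta_t + (discount_factor * lam) * delta_{t+1}, and 0.5 * 0.5 = 0.25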
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages,
                               torch.tensor([[1.121, 1.865], [1.909, 1.704]]),
                               decimal=3)

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages.view(-1))
 def test_backward(self):
     states = self.features(self.states)
     loss = torch.sum(states.features)
     loss.backward()
     self.features.reinforce()
     features = self.features(self.states)
     expected = State(
         torch.tensor([[-0.71, -1.2, -0.5], [-0.72, -1.03, -0.02],
                       [-0.57, -1.3, -1.01]]),
         mask=torch.tensor([1, 0, 1]),
     )
     self.assert_state_equal(features, expected)
    def sample(self, batch_size):
        if batch_size > len(self):
            raise Exception("Not enough states for batch size!")

        states = self._states[0:batch_size]
        actions = self._actions[0:batch_size]
        actions = torch.tensor(actions, device=actions[0].device)
        next_states = self._next_states[0:batch_size]
        rewards = self._rewards[0:batch_size]
        rewards = torch.tensor(rewards, device=rewards[0].device, dtype=torch.float)
        lengths = self._lengths[0:batch_size]
        lengths = torch.tensor(lengths, device=rewards[0].device, dtype=torch.float)

        self._states = self._states[batch_size:]
        self._actions = self._actions[batch_size:]
        self._next_states = self._next_states[batch_size:]
        self._rewards = self._rewards[batch_size:]
        self._lengths = self._lengths[batch_size:]

        states = State.from_list(states)
        next_states = State.from_list(next_states)
        return states, actions, rewards, next_states, lengths
 def test_eval(self):
     state = State(torch.randn(1, STATE_DIM))
     dist = self.policy.no_grad(state)
     tt.assert_almost_equal(dist.mean,
                            torch.tensor([[-0.229, 0.43, -0.058]]),
                            decimal=3)
     tt.assert_almost_equal(dist.entropy(),
                            torch.tensor([4.251]),
                            decimal=3)
     best = self.policy.eval(state)
     tt.assert_almost_equal(best,
                            torch.tensor([[-0.229, 0.43, -0.058]]),
                            decimal=3)
    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([1., 2., -1.])

        for _ in range(0, 1000):
            dist = self.policy(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            error = ((target - action)**2).mean()
            loss = (error * log_prob).mean()
            self.policy.reinforce(loss)

        self.assertTrue(error < 1)
Example #28
    def act(self, state, reward):
        if not self._frames:
            self._frames = [state.raw] * self._size
        else:
            self._frames = self._frames[1:] + [state.raw]

        if self._lazy:
            state = LazyState(self._frames, state.mask, state.info)
        else:
            state = State(torch.cat(self._frames, dim=1), state.mask,
                          state.info)

        return self.agent.act(state, reward)
Example #29
    def _summarize_transitions(self):
        sample_n = self.n_envs * self.n_steps
        sample_states = [None] * sample_n
        sample_actions = [None] * sample_n
        sample_next_states = [None] * sample_n

        for e in range(self.n_envs):
            next_state = self._states[self.n_steps][e]
            for i in range(self.n_steps):
                t = self.n_steps - 1 - i
                idx = t * self.n_envs + e
                state = self._states[t][e]
                action = self._actions[t][e]

                sample_states[idx] = state
                sample_actions[idx] = action
                sample_next_states[idx] = next_state

                if not state.mask:
                    next_state = state

        return (State.from_list(sample_states), torch.stack(sample_actions),
                State.from_list(sample_next_states))
 def test_run(self):
     states = torch.arange(0, 20)
     actions = torch.arange(0, 20)
     rewards = torch.arange(0, 20)
     expected_samples = torch.tensor([[0, 0, 0], [1, 1, 0], [0, 1, 1],
                                      [3, 0, 0], [1, 4, 4], [1, 2, 4],
                                      [2, 4, 3], [4, 7, 4], [7, 4, 6],
                                      [6, 5, 6]])
     expected_weights = np.ones((10, 3))
     actual_samples = []
     actual_weights = []
     for i in range(10):
         state = State(states[i].unsqueeze(0), torch.tensor([1]))
         next_state = State(states[i + 1].unsqueeze(0), torch.tensor([1]))
         self.replay_buffer.store(state, actions[i], rewards[i], next_state)
         sample = self.replay_buffer.sample(3)
         actual_samples.append(sample[0].features)
         actual_weights.append(sample[-1])
     tt.assert_equal(
         torch.cat(actual_samples).view(expected_samples.shape),
         expected_samples)
     np.testing.assert_array_equal(expected_weights,
                                   np.vstack(actual_weights))