Example #1
    def test_rollout_with_nones(self):
        buffer = NStepBatchBuffer(3, 3, discount_factor=0.5)
        done = torch.ones(12)
        done[5] = 0
        done[7] = 0
        done[9] = 0
        states = State(torch.arange(0, 12), done)
        actions = torch.ones((3))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        buffer.store(states[6:9], actions, 2 * torch.ones(3))
        buffer.store(states[9:12], actions, 4 * torch.ones(3))
        states, actions, returns, next_states, lengths = buffer.sample(-1)

        expected_states = State(torch.arange(0, 9), done[0:9])
        expected_next_done = torch.zeros(9)
        expected_next_done[5] = 1
        expected_next_done[7] = 1
        expected_next_done[8] = 1
        expected_next_states = State(
            torch.tensor([9, 7, 5, 9, 7, 11, 9, 10, 11]), expected_next_done)
        expected_returns = torch.tensor([1, 0.5, 0, 2, 1, 2, 2, 2, 2]).float()
        expected_lengths = torch.tensor([3, 2, 1, 2, 1, 2, 1, 1, 1]).float()

        self.assert_states_equal(states, expected_states)
        self.assert_states_equal(next_states, expected_next_states)
        tt.assert_equal(lengths, expected_lengths)
        tt.assert_allclose(returns, expected_returns)
Example #2
    def test_rollout(self):
        buffer = NStepAdvantageBuffer(self.v,
                                      self.features,
                                      2,
                                      3,
                                      discount_factor=0.5)
        actions = torch.ones((3))
        states = State(torch.arange(0, 12).unsqueeze(1))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        states, _, advantages = buffer.advantages(states[6:9])

        expected_states = State(torch.arange(0, 6).unsqueeze(1))
        expected_next_states = State(
            torch.cat((torch.arange(6, 9), torch.arange(6, 9))).unsqueeze(1))
        expected_returns = torch.tensor([0.5, 0.5, 0.5, 1, 1, 1]).float()
        expected_lengths = torch.tensor([2., 2, 2, 1, 1, 1])

        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states,
                                              expected_returns,
                                              expected_next_states,
                                              expected_lengths))
Example #3
    def test_rollout(self):
        buffer = NStepBuffer(2, discount_factor=0.5)
        actions = torch.ones((3))
        states = State(torch.arange(0, 12))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        buffer.store(states[6:9], actions, 2 * torch.ones(3))
        buffer.store(states[9:12], actions, 4 * torch.ones(3))
        self.assertEqual(len(buffer), 6)

        states, actions, returns, next_states, lengths = buffer.sample(6)
        expected_states = State(torch.arange(0, 6))
        expected_next_states = State(torch.arange(6, 12))
        expected_returns = torch.tensor([
            2,
            2,
            2,
            4,
            4,
            4,
        ]).float()
        expected_lengths = torch.tensor([
            2,
            2,
            2,
            2,
            2,
            2,
        ])
        self.assert_states_equal(states, expected_states)
        self.assert_states_equal(next_states, expected_next_states)
        tt.assert_allclose(returns, expected_returns)
        tt.assert_equal(lengths, expected_lengths)
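
The expected returns above are just the n-step return arithmetic: with n = 2 and discount_factor = 0.5, each sampled transition's return is the sum of the next two stored rewards, the second discounted by 0.5. A minimal standalone sketch of that arithmetic in plain PyTorch (not the buffer's implementation):

import torch

discount_factor = 0.5
n_steps = 2
# Rewards stored per time step for the three parallel environments in the test above.
rewards = torch.tensor([
    [0., 0., 0.],   # stored with states[0:3]
    [1., 1., 1.],   # stored with states[3:6]
    [2., 2., 2.],   # stored with states[6:9]
    [4., 4., 4.],   # stored with states[9:12]
])

# n-step return for a transition starting at step t: the next n rewards, discounted.
returns = torch.zeros(2, 3)
for t in range(2):
    for k in range(n_steps):
        returns[t] += (discount_factor ** k) * rewards[t + 1 + k]

print(returns.flatten())  # tensor([2., 2., 2., 4., 4., 4.]), matching expected_returns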
Example #4
    def test_rollout_with_nones(self):
        buffer = NStepAdvantageBuffer(self.v,
                                      self.features,
                                      3,
                                      3,
                                      discount_factor=0.5)
        done = torch.ones(12)
        done[5] = 0
        done[7] = 0
        done[9] = 0
        states = State(torch.arange(0, 12).unsqueeze(1), done)
        actions = torch.ones((3))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        buffer.store(states[6:9], actions, 2 * torch.ones(3))
        states, actions, advantages = buffer.advantages(states[9:12])

        expected_states = State(torch.arange(0, 9).unsqueeze(1), done[0:9])
        expected_next_done = torch.zeros(9)
        expected_next_done[5] = 1
        expected_next_done[7] = 1
        expected_next_done[8] = 1
        expected_next_states = State(
            torch.tensor([9, 7, 5, 9, 7, 11, 9, 10, 11]).unsqueeze(1),
            expected_next_done)
        expected_returns = torch.tensor([1, 0.5, 0, 2, 1, 2, 2, 2, 2]).float()
        expected_lengths = torch.tensor([3, 2, 1, 2, 1, 2, 1, 1, 1]).float()

        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states,
                                              expected_returns,
                                              expected_next_states,
                                              expected_lengths))
Example #5
 def test_output_shape(self):
     state = State(torch.randn(1, STATE_DIM))
     action = self.policy(state)
     self.assertEqual(action.shape, (1, ACTION_DIM))
     state = State(torch.randn(5, STATE_DIM))
     action = self.policy(state)
     self.assertEqual(action.shape, (5, ACTION_DIM))
    def test_done(self):
        state = State(torch.tensor([1]))
        action = torch.tensor(0)
        done_state = State(torch.tensor([1]), mask=torch.tensor([0]))

        self.replay_buffer.store(state, action, 1, done_state)
        self.assertEqual(len(self.replay_buffer), 1)
        sample = self.replay_buffer.buffer.buffer[0]
        self.assert_states_equal(state, sample[0])
        self.assertEqual(sample[2], 1)

        self.replay_buffer.store(state, action, 1, state)
        self.replay_buffer.store(state, action, 1, state)
        self.assertEqual(len(self.replay_buffer), 1)

        self.replay_buffer.store(state, action, 1, done_state)
        self.assertEqual(len(self.replay_buffer), 4)
        sample = self.replay_buffer.buffer.buffer[1]
        self.assert_states_equal(sample[0], state)
        self.assertEqual(sample[2], 1.75)
        self.assert_states_equal(sample[3], done_state)

        self.replay_buffer.store(state, action, 1, done_state)
        self.assertEqual(len(self.replay_buffer), 5)
        sample = self.replay_buffer.buffer.buffer[0]
        self.assert_states_equal(state, sample[0])
        self.assertEqual(sample[2], 1)
 def test_run(self):
     states = torch.arange(0, 20)
     actions = torch.arange(0, 20).view((-1, 1))
     rewards = torch.arange(0, 20)
     expected_samples = torch.tensor([
         [0, 0, 0],
         [1, 1, 0],
         [0, 1, 1],
         [3, 0, 0],
         [1, 4, 4],
         [1, 2, 4],
         [2, 4, 3],
         [4, 7, 4],
         [7, 4, 6],
         [6, 5, 6],
     ])
     expected_weights = np.ones((10, 3))
     actual_samples = []
     actual_weights = []
     for i in range(10):
         state = State(states[i].unsqueeze(0), torch.tensor([1]))
         next_state = State(states[i + 1].unsqueeze(0), torch.tensor([1]))
         self.replay_buffer.store(state, actions[i], rewards[i], next_state)
         sample = self.replay_buffer.sample(3)
         actual_samples.append(sample[0].features)
         actual_weights.append(sample[-1])
     tt.assert_equal(
         torch.cat(actual_samples).view(expected_samples.shape),
         expected_samples)
     np.testing.assert_array_equal(expected_weights,
                                   np.vstack(actual_weights))
Example #8
    def test_list(self):
        model = nn.Linear(2, 2)
        net = nn.RLNetwork(model, (2, ))
        features = torch.randn((4, 2))
        done = torch.tensor([1, 1, 0, 1], dtype=torch.uint8)
        out = net(State(features, done))
        tt.assert_almost_equal(
            out,
            torch.tensor([
                [0.0479387, -0.2268031],
                [0.2346841, 0.0743403],
                [0.0, 0.0],
                [0.2204496, 0.086818],
            ]),
        )

        features = torch.randn(3, 2)
        done = torch.tensor([1, 1, 1], dtype=torch.uint8)
        out = net(State(features, done))
        tt.assert_almost_equal(
            out,
            torch.tensor([
                [0.4234636, 0.1039939],
                [0.6514298, 0.3354351],
                [-0.2543002, -0.2041451],
            ]),
        )
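
Note that the row whose done flag is 0 comes back as all zeros in the expected output: values computed for terminal states are masked out. A standalone sketch of that masking step, with made-up numbers (an assumption about the mechanism, chosen to be consistent with the expected output above):

import torch

raw_output = torch.tensor([[0.0479, -0.2268],
                           [0.2347, 0.0743],
                           [0.5000, 0.5000],   # hypothetical value for the terminal state
                           [0.2204, 0.0868]])
mask = torch.tensor([1, 1, 0, 1], dtype=torch.uint8)

masked = raw_output * mask.unsqueeze(-1).float()
print(masked[2])  # tensor([0., 0.]): the terminal row is zeroed, as in the test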
Example #9
 def test_multi_batch_reinforce(self):
     self.policy(State(torch.randn(2, STATE_DIM)))
     self.policy(State(torch.randn(2, STATE_DIM)))
     self.policy(State(torch.randn(2, STATE_DIM)))
     self.policy.reinforce(torch.tensor([1, 2, 3, 4]).float())
     self.policy.reinforce(torch.tensor([1, 2]).float())
     with self.assertRaises(Exception):
         self.policy.reinforce(torch.tensor([1, 2]).float())
Example #10
 def forward(self, states, actions=None):
     x = self.fc(states.features)
     x = x.view((-1, 64, 7, 7))
     x = self.deconv(x)
     if actions is None:
         return State(x.view((-1, FRAMES, 84, 84)))
     x = x.view((-1, self.num_actions, FRAMES, 84, 84))
     return State(x[torch.arange(len(x)), actions].view((-1, FRAMES, 84, 84)))
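
The action-conditioned branch above relies on advanced indexing: x[torch.arange(len(x)), actions] picks one action-specific prediction per batch element. A small standalone sketch of that indexing pattern, using illustrative sizes rather than the model's real FRAMES and num_actions:

import torch

num_actions, frames = 3, 2   # illustrative sizes only
x = torch.arange(2 * num_actions * frames).float().view(2, num_actions, frames)
actions = torch.tensor([0, 2])

# One row index per sample, paired with that sample's chosen action index.
selected = x[torch.arange(len(x)), actions]
print(selected.shape)  # torch.Size([2, 2]): one action-specific slice per sample
print(selected)        # tensor([[ 0.,  1.], [10., 11.]])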
    def test_output_shape(self):
        state = State(torch.randn(1, STATE_DIM))
        action, log_prob = self.policy(state)
        self.assertEqual(action.shape, (1, ACTION_DIM))
        self.assertEqual(log_prob.shape, torch.Size([1]))

        state = State(torch.randn(5, STATE_DIM))
        action, log_prob = self.policy(state)
        self.assertEqual(action.shape, (5, ACTION_DIM))
        self.assertEqual(log_prob.shape, torch.Size([5]))
Example #12
 def test_run(self):
     state = State(torch.randn(1, STATE_DIM))
     action = self.policy(state)
     self.assertEqual(action.item(), 0)
     state = State(torch.randn(1, STATE_DIM))
     action = self.policy(state)
     self.assertEqual(action.item(), 2)
     self.policy.reinforce(torch.tensor([-1, 1000000]).float())
     action = self.policy(state)
     self.assertEqual(action.item(), 2)
 def test_deflicker(self):
     frame1 = State(torch.ones((1, 3, 4, 4)))
     frame2 = State(torch.ones((1, 3, 4, 4)))
     frame3 = State(torch.ones((1, 3, 4, 4)) * 2)
     self.body.act(frame1, 0)
     self.body.act(frame2, 0)
     self.body.act(frame3, 0)
     self.body.act(frame2, 0)
     self.body.act(frame2, 0)
     expected = torch.cat((torch.ones(1, 2, 2), torch.ones(2, 2, 2) * 2,
                           torch.ones(1, 2, 2))).unsqueeze(0)
     tt.assert_equal(self.agent.state.features, expected)
 def forward(self, states):
     features = self.model(states.features.float())
     return State(
         features,
         mask=states.mask,
         info=states.info
     )
Example #15
 def act(self, state, reward):
     if self.timestep is None:
         self.timestep = torch.zeros(len(state), device=state.features.device)
     features = torch.cat((state.features, self.scale * self.timestep.view((-1, 1))), dim=1)
     state = State(features, state.mask, state.info)
     self.timestep = state.mask * (self.timestep + 1)
     return self.agent.act(state, reward)
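
The body above appends a scaled timestep as one extra feature column before delegating to the wrapped agent. A tiny sketch of just that concatenation, with made-up feature values and an assumed scale:

import torch

features = torch.tensor([[0.1, 0.2],
                         [0.3, 0.4]])   # two parallel observations (illustrative values)
timestep = torch.tensor([5., 7.])       # per-environment step counters
scale = 0.01                            # assumed scale, standing in for self.scale

augmented = torch.cat((features, scale * timestep.view((-1, 1))), dim=1)
print(augmented)
# tensor([[0.1000, 0.2000, 0.0500],
#         [0.3000, 0.4000, 0.0700]])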
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, 3))

        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.features = FeatureNetwork(self.model, optimizer)
        self.states = State(torch.randn(3, STATE_DIM),
                            mask=torch.tensor([1, 0, 1]))
        self.expected_features = State(
            torch.tensor([
                [-0.2385, -0.7263, -0.0340],
                [-0.3569, -0.6612, 0.3485],
                [-0.0296, -0.7566, -0.4624],
            ]),
            mask=torch.tensor([1, 0, 1]),
        )
Example #17
 def test_done(self):
     states = State(torch.randn((3, STATE_DIM)),
                    mask=torch.tensor([1, 0, 1]))
     probs = self.q(states)
     self.assertEqual(probs.shape, (3, ACTIONS, ATOMS))
     tt.assert_almost_equal(
         probs.sum(dim=2),
         torch.tensor([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]),
         decimal=3,
     )
     tt.assert_almost_equal(
         probs,
         torch.tensor([
             [
                 [0.2065, 0.1045, 0.1542, 0.2834, 0.2513],
                 [0.3903, 0.2471, 0.0360, 0.1733, 0.1533],
             ],
             [[0, 0, 1, 0, 0], [0, 0, 1, 0, 0]],
             [
                 [0.1427, 0.2486, 0.0946, 0.4112, 0.1029],
                 [0.0819, 0.1320, 0.1203, 0.0373, 0.6285],
             ],
         ]),
         decimal=3,
     )
Example #18
 def act(self, state, reward):
     for i in range(len(state)):
         if state.info[i]['life_lost']:
             mask = state.mask.clone()
             mask[i] = 0
             state = State(state.raw, mask=mask, info=state.info)
     return self.agent.act(state, reward)
Example #19
 def act(self, state, reward):
     if self._lost_life():
         state = State(state.raw, state.mask * 0, state.info)
         self.terminal(state, reward)
         self._lives = self._get_lives()
         return self.initial(state)
     return self.agent.act(state, reward)
 def _done_on_life_lost(self, state):
     for i in range(len(state)):
         if state.info[i]['life_lost']:
             mask = state.mask.clone()
             mask[i] = 0
             state = State(state.raw, mask=mask, info=state.info)
     return state
Example #21
    def test_reinforce(self):
        states = State(torch.randn((3, STATE_DIM)))
        actions = torch.tensor([0, 1, 0])
        original_probs = self.q(states, actions)
        tt.assert_almost_equal(
            original_probs,
            torch.tensor([
                [0.2065, 0.1045, 0.1542, 0.2834, 0.2513],
                [0.3190, 0.2471, 0.0534, 0.1424, 0.2380],
                [0.1427, 0.2486, 0.0946, 0.4112, 0.1029],
            ]),
            decimal=3,
        )

        target_dists = torch.tensor([[0, 0, 1, 0, 0], [0, 0, 0, 0, 1],
                                     [0, 1, 0, 0, 0]]).float()

        def _loss(dist, target_dist):
            log_dist = torch.log(torch.clamp(dist, min=1e-5))
            log_target_dist = torch.log(torch.clamp(target_dist, min=1e-5))
            return (target_dist *
                    (log_target_dist - log_dist)).sum(dim=-1).mean()

        self.q.reinforce(_loss(original_probs, target_dists))

        new_probs = self.q(states, actions)
        tt.assert_almost_equal(torch.sign(new_probs - original_probs),
                               torch.sign(target_dists - 0.5))
Example #22
    def _train(self):
        # forward pass
        values = torch.cat([
            self.v(State(features)) for (features, _, _) in self._trajectories
        ])

        # forward passes for log_pis were stored during execution
        log_pis = torch.cat(
            [log_pis for (_, _, log_pis) in self._trajectories])

        # compute targets
        targets = torch.cat([
            self._compute_discounted_returns(rewards)
            for (_, rewards, _) in self._trajectories
        ])
        advantages = targets - values.detach()

        # compute losses
        value_loss = mse_loss(values, targets)
        policy_loss = -(advantages * log_pis).mean()

        # backward pass
        self.v.reinforce(value_loss)
        self.policy.reinforce(policy_loss)
        self.features.reinforce()

        # cleanup
        self._trajectories = []
        self._current_batch_size = 0
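
_compute_discounted_returns is not shown in this snippet; a standard backwards-accumulated discounted return would look roughly like the sketch below (an assumption about what such a helper computes, not the author's code):

import torch

def compute_discounted_returns(rewards, discount_factor):
    # Accumulate from the end of the trajectory so each step's return
    # includes all discounted future rewards.
    returns = torch.zeros_like(rewards)
    running = 0.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_factor * running
        returns[t] = running
    return returns

print(compute_discounted_returns(torch.tensor([1., 2., 4.]), discount_factor=0.5))
# tensor([3., 4., 4.])  ->  1 + 0.5*2 + 0.25*4,  2 + 0.5*4,  4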
    def test_target(self):
        self.policy = DeterministicPolicy(
            self.model,
            self.optimizer,
            self.space,
            target=FixedTarget(3)
        )

        # choose initial action
        state = State(torch.ones(1, STATE_DIM))
        action = self.policy(state)
        tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

        # run update step, make sure target network doesn't change
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_allclose(
            self.policy.eval(state),
            torch.tensor([[-0.595883, -0.595883, -0.595883]]),
            atol=1e-4,
        )
    def test_repeat_actions(self):
        done = torch.ones(14)
        done[3] = 0
        done[5] = 0
        states = State(torch.arange(0, 14), done)
        rewards = torch.ones(2)

        agent = MockAgent(2)
        body = ParallelRepeatActions(agent, repeats=3)

        actions = body.act(states[0:2], rewards)
        self.assert_array_equal(actions, [1, 1])
        actions = body.act(states[2:4], rewards)
        self.assert_array_equal(actions, [1, None])
        actions = body.act(states[4:6], rewards)
        self.assert_array_equal(actions, [1, None])
        actions = body.act(states[6:8], rewards)
        self.assert_array_equal(actions, [2, 2])
        actions = body.act(states[8:10], rewards)
        self.assert_array_equal(actions, [2, 2])
        actions = body.act(states[10:12], rewards)
        self.assert_array_equal(actions, [2, 2])
        actions = body.act(states[12:14], rewards)
        self.assert_array_equal(actions, [3, 3])

        self.assertEqual(len(agent._states), 3)
        tt.assert_equal(torch.cat(agent._rewards),
                        torch.tensor([[1, 1], [3, 3], [3, 3]]))
    def test_target(self):
        self.policy = DeterministicPolicy(
            self.model,
            self.optimizer,
            self.space,
            target=FixedTarget(3)
        )
        state = State(torch.ones(1, STATE_DIM))

        # run update step, make sure target network doesn't change
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_allclose(
            self.policy.target(state),
            torch.tensor([[-0.574482, -0.574482, -0.574482]]),
            atol=1e-4,
        )
    def test_simple(self):
        buffer = GeneralizedAdvantageBuffer(self.v,
                                            self.features,
                                            2,
                                            1,
                                            discount_factor=0.5,
                                            lam=0.5)
        actions = torch.ones((1))
        states = State(torch.arange(0, 3).unsqueeze(1))
        rewards = torch.tensor([1., 2, 4])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        values = self.v.eval(self.features.eval(states))
        tt.assert_almost_equal(values,
                               torch.tensor([0.1826, -0.3476, -0.8777]),
                               decimal=3)

        td_errors = torch.zeros(2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors,
                               torch.tensor([0.6436, 1.909]),
                               decimal=3)

        advantages = torch.zeros(2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages,
                               torch.tensor([1.121, 1.909]),
                               decimal=3)

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages)
        tt.assert_equal(_actions, torch.tensor([1, 1]))
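
The hand-computed advantages above follow the generalized advantage estimation recursion A_t = td_t + discount_factor * lam * A_{t+1}. A standalone sketch of that recursion in plain PyTorch (not the buffer's code) reproduces the same numbers:

import torch

def generalized_advantages(td_errors, discount_factor, lam):
    # Backwards recursion: A_t = td_t + discount_factor * lam * A_{t+1}
    advantages = torch.zeros_like(td_errors)
    running = 0.
    for t in reversed(range(len(td_errors))):
        running = td_errors[t] + discount_factor * lam * running
        advantages[t] = running
    return advantages

td_errors = torch.tensor([0.6436, 1.909])
print(generalized_advantages(td_errors, discount_factor=0.5, lam=0.5))
# approximately [1.121, 1.909], matching the expected advantages above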
Example #27
 def test_eval_actions(self):
     states = State(torch.randn(3, STATE_DIM))
     actions = [1, 2, 0]
     result = self.q.eval(states, actions)
     self.assertEqual(result.shape, torch.Size([3]))
     tt.assert_almost_equal(
         result, torch.tensor([-0.7262873, 0.3484948, -0.0296164]))
Example #28
    def test_rollout_with_nones(self):
        buffer = NStepBuffer(3, discount_factor=0.5)
        done = torch.ones(15)
        # [
        #     0, 1, 2,
        #     3, 4, 5,
        #     6, 7, 8,
        #     9, 10, 11,
        #     12, 13, 14
        # ]
        done[9] = 0
        done[7] = 0
        done[5] = 0
        states = State(torch.arange(0, 15), done)
        actions = torch.ones((3))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        buffer.store(states[6:9], actions, 2 * torch.ones(3))
        buffer.store(states[9:12], actions, 4 * torch.ones(3))
        buffer.store(states[12:15], actions, 8 * torch.ones(3))
        states, actions, returns, next_states, lengths = buffer.sample(6)

        expected_states = State(torch.arange(0, 6),
                                torch.tensor([1, 1, 1, 1, 1, 0]))
        expected_next_states = State(torch.tensor([9, 7, 5, 9, 7, 5]),
                                     torch.zeros(6))
        expected_returns = torch.tensor([
            3,
            2,
            1,
            4,
            2,
            0,
        ]).float()
        expected_lengths = torch.tensor([
            3,
            2,
            1,
            2,
            1,
            0,
        ])

        self.assert_states_equal(states, expected_states)
        self.assert_states_equal(next_states, expected_next_states)
        tt.assert_allclose(returns, expected_returns)
        tt.assert_equal(lengths, expected_lengths)
Example #29
 def act(self, state, reward):
     if not self._frames:
         self._frames = [state.raw] * self._size
     else:
         self._frames = self._frames[1:] + [state.raw]
     return self.agent.act(
         State(torch.cat(self._frames, dim=1), state.mask, state.info),
         reward)
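
The body above keeps a sliding window of the last self._size raw frames and concatenates them along the channel dimension. A standalone sketch of that stacking with assumed shapes (a 4-frame stack of single-channel 84x84 observations):

import torch

size = 4                                                    # assumed stack size
frames = [torch.zeros(1, 1, 84, 84) for _ in range(size)]   # placeholder raw frames

stacked = torch.cat(frames, dim=1)
print(stacked.shape)  # torch.Size([1, 4, 84, 84]): frames stacked along the channel dimension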
 def setUp(self):
     np.random.seed(0)
     self.agent = MockAgent()
     self.env = MockEnv()
     self.frame = State(torch.ones((1, 3, 4, 4)))
     self.body = DeepmindAtariBody(ToLegacyBody(self.agent),
                                   self.env,
                                   noop_max=10)