    def test_run(self):
        state1 = State(torch.randn(1, STATE_DIM))
        dist1 = self.policy(state1)
        action1 = dist1.sample()
        log_prob1 = dist1.log_prob(action1)
        self.assertEqual(action1.item(), 0)

        state2 = State(torch.randn(1, STATE_DIM))
        dist2 = self.policy(state2)
        action2 = dist2.sample()
        log_prob2 = dist2.log_prob(action2)
        self.assertEqual(action2.item(), 2)

        loss = -(torch.tensor([-1, 1000000]) * torch.cat(
            (log_prob1, log_prob2))).mean()
        self.policy.reinforce(loss)

        state3 = State(torch.randn(1, STATE_DIM))
        dist3 = self.policy(state3)
        action3 = dist3.sample()
        self.assertEqual(action3.item(), 2)
 def test_from_dict(self):
     observation = torch.randn(3, 4)
     state = State({
         'observation': observation,
         'done': True,
         'mask': 1,
         'reward': 5.
     })
     tt.assert_equal(state.observation, observation)
     self.assertEqual(state.done, True)
     self.assertEqual(state.mask, 1.)
     self.assertEqual(state.reward, 5.)
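A minimal companion sketch (not from the source) of the tensor-only constructor used by the other tests; the import path and the defaults noted in the comments (reward 0., done False, mask 1.) are assumptions inferred from the surrounding snippets.
import torch
from all.core import State  # assumed import path

observation = torch.randn(3, 4)
state = State(observation)  # wraps a bare tensor as {'observation': observation}
assert torch.equal(state.observation, observation)
# assumed defaults when not supplied: reward 0., done False, mask 1.
print(state.reward, state.done, state.mask)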
    def test_reinforce_one(self):
        state = State(torch.randn(1, STATE_DIM))
        dist = self.policy(state)
        action = dist.sample()
        log_prob1 = dist.log_prob(action)
        loss = -log_prob1.mean()
        self.policy.reinforce(loss)

        dist = self.policy(state)
        log_prob2 = dist.log_prob(action)

        self.assertGreater(log_prob2.item(), log_prob1.item())
Example #4
    def _reset(self):
        state = State.from_gym(self._reset_orig(),
                               dtype=self.observation_space.dtype,
                               device="cuda")
        obs = state.observation.cpu().numpy()
        r = state.reward
        done = state.done
        info = {}

        pilot_action = onehot_encode(self.pilot_policy(state))
        obs = np.concatenate((obs, pilot_action))
        return obs
    def test_parallel(self):
        buffer = GeneralizedAdvantageBuffer(
            self.v,
            self.features,
            2,
            2,
            discount_factor=0.5,
            lam=0.5
        )
        actions = torch.ones((2))

        def make_states(x, y):
            return State.array([
                State({'observation': torch.tensor([float(x)])}),
                State({'observation': torch.tensor([float(y)])})
            ])

        states = State.array([
            make_states(0, 3),
            make_states(1, 4),
            make_states(2, 5),
        ])
        self.assertEqual(states.shape, (3, 2))
        rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        values = self.v.eval(self.features.eval(states)).view(3, -1)
        tt.assert_almost_equal(values, torch.tensor([
            [0.183, -1.408],
            [-0.348, -1.938],
            [-0.878, -2.468]
        ]), decimal=3)

        td_errors = torch.zeros(2, 2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors, torch.tensor([
            [0.6436, 1.439],
            [1.909, 1.704]
        ]), decimal=3)

        advantages = torch.zeros(2, 2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages, torch.tensor([
            [1.121, 1.865],
            [1.909, 1.704]
        ]), decimal=3)

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages.view(-1))
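As a quick cross-check of the arithmetic above, this standalone snippet (not part of the source test) reproduces the expected TD errors and advantages from the asserted value table; with discount_factor = lam = 0.5, each advantage adds 0.25 times the next TD error.
import torch

values = torch.tensor([[0.183, -1.408], [-0.348, -1.938], [-0.878, -2.468]])
rewards = torch.tensor([[1., 1.], [2., 1.], [4., 1.]])
td_errors = rewards[:2] + 0.5 * values[1:] - values[:2]
# td_errors ~= [[0.644, 1.439], [1.909, 1.704]]
advantages = torch.stack((td_errors[0] + 0.25 * td_errors[1], td_errors[1]))
# advantages ~= [[1.121, 1.865], [1.909, 1.704]], matching the assertions above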
Example #6
    def test_run(self):
        states = StateArray(torch.arange(0, 20), (20, ),
                            reward=torch.arange(-1, 19).float())
        actions = torch.arange(0, 20).view((-1, 1))
        expected_samples = State(
            torch.tensor([
                [0, 1, 2],
                [0, 1, 3],
                [5, 5, 5],
                [6, 6, 2],
                [7, 7, 7],
                [7, 8, 8],
                [7, 7, 7],
            ]))
        expected_weights = [
            [1.0000, 1.0000, 1.0000],
            [0.5659, 0.7036, 0.5124],
            [0.0631, 0.0631, 0.0631],
            [0.0631, 0.0631, 0.1231],
            [0.0631, 0.0631, 0.0631],
            [0.0776, 0.0631, 0.0631],
            [0.0866, 0.0866, 0.0866],
        ]
        actual_samples = []
        actual_weights = []
        for i in range(10):
            self.replay_buffer.store(states[i], actions[i], states[i + 1])
            if i > 2:
                sample = self.replay_buffer.sample(3)
                sample_states = sample[0].observation
                self.replay_buffer.update_priorities(torch.randn(3))
                actual_samples.append(sample_states)
                actual_weights.append(sample[-1])

        actual_samples = State(torch.cat(actual_samples).view((-1, 3)))
        self.assert_states_equal(actual_samples, expected_samples)
        np.testing.assert_array_almost_equal(expected_weights,
                                             np.vstack(actual_weights),
                                             decimal=3)
Example #7
    def _step(self, action):
        state = State.from_gym(self._step_orig(_convert(disc_to_cont(action))),
                               dtype=self.observation_space.dtype,
                               device="cuda")

        obs = state.observation.cpu().numpy()
        r = state.reward
        done = state.done
        info = {}

        pilot_action = onehot_encode(self.pilot_policy(state))
        obs = np.concatenate((obs, pilot_action))
        return obs, r, done, info
    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([1., 2., -1.])

        for _ in range(0, 1000):
            dist = self.policy(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            error = ((target - action)**2).mean()
            loss = (error * log_prob).mean()
            self.policy.reinforce(loss)

        self.assertLess(error.item(), 1)
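A hedged reading of the surrogate loss above: weighting the squared error by log_prob gives a REINFORCE-style objective whose gradient lowers the probability of high-error actions, so after enough updates the sampled action is expected to land near the target and the final error to drop below 1.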
 def step(self, action):
     observation, reward, done, info = self._env.last()
     if self._env.dones[self._env.agent_selection]:
         action = None
     if torch.is_tensor(action):
         self._env.step(action.item())
     else:
         self._env.step(action)
     observation, reward, done, info = self._env.last()
     return State.from_gym((observation.reshape(
         (1, 84, 84)), reward, done, info),
                           device=self.device,
                           dtype=np.uint8)
 def test_eval(self):
     state = State(torch.randn(1, STATE_DIM))
     dist = self.policy.no_grad(state)
     tt.assert_almost_equal(dist.mean,
                            torch.tensor([[-0.233, 0.459, -0.058]]),
                            decimal=3)
     tt.assert_almost_equal(dist.entropy(),
                            torch.tensor([4.251]),
                            decimal=3)
     best = self.policy.eval(state)
     tt.assert_almost_equal(best,
                            torch.tensor([[-0.233, 0.459, -0.058]]),
                            decimal=3)
    def _summarize_transitions(self):
        sample_n = self.n_envs * self.n_steps
        sample_states = [None] * sample_n
        sample_actions = [None] * sample_n
        sample_next_states = [None] * sample_n

        for e in range(self.n_envs):
            next_state = self._states[self.n_steps][e]
            for i in range(self.n_steps):
                t = self.n_steps - 1 - i
                idx = t * self.n_envs + e
                state = self._states[t][e]
                action = self._actions[t][e]

                sample_states[idx] = state
                sample_actions[idx] = action
                sample_next_states[idx] = next_state

                if not state.mask:
                    next_state = state

        return (State.array(sample_states), torch.stack(sample_actions),
                State.array(sample_next_states))
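A tiny standalone illustration (not from the source) of the time-major index layout used above: with idx = t * n_envs + e, transitions from the same timestep across environments sit next to each other in the flattened sample.
n_steps, n_envs = 3, 2
layout = [[t * n_envs + e for e in range(n_envs)] for t in range(n_steps)]
assert layout == [[0, 1], [2, 3], [4, 5]]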
Example #12
    def test_run(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = ExperienceReplayBuffer(5)

        states = torch.arange(0, 20)
        actions = torch.arange(0, 20).view((-1, 1))
        rewards = torch.arange(0, 20)
        expected_samples = torch.tensor([
            [0, 0, 0],
            [1, 1, 0],
            [0, 1, 1],
            [3, 0, 0],
            [1, 4, 4],
            [1, 2, 4],
            [2, 4, 3],
            [4, 7, 4],
            [7, 4, 6],
            [6, 5, 6],
        ])
        expected_weights = np.ones((10, 3))
        actual_samples = []
        actual_weights = []
        for i in range(10):
            state = State(states[i])
            next_state = State(states[i + 1], reward=rewards[i])
            self.replay_buffer.store(state, actions[i], next_state)
            sample = self.replay_buffer.sample(3)
            actual_samples.append(sample[0].observation)
            actual_weights.append(sample[-1])
        tt.assert_equal(
            torch.cat(actual_samples).view(expected_samples.shape),
            expected_samples)
        np.testing.assert_array_equal(expected_weights,
                                      np.vstack(actual_weights))
    def advantages(self, states):
        if len(self) < self._batch_size:
            raise Exception("Not enough states received!")

        self._states.append(states)
        states = State.array(self._states[0:self.n_steps + 1])
        actions = torch.cat(self._actions[:self.n_steps], dim=0)
        rewards = torch.stack(self._rewards[:self.n_steps])
        _values = self.v.target(self.features.target(states))
        values = _values[0:self.n_steps]
        next_values = _values[1:]
        td_errors = rewards + self.gamma * next_values - values
        advantages = self._compute_advantages(td_errors)
        self._clear_buffers()
        return (states[0:-1].flatten(), actions, advantages.view(-1))
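For reference, one plausible implementation of the generalized-advantage recursion that _compute_advantages is assumed to perform, consistent with the test_parallel expectations above (advantage[t] = td_error[t] + gamma * lam * advantage[t + 1]); this is a sketch, not the library's actual code.
import torch

def compute_gae(td_errors, gamma, lam):
    # accumulate the discounted sum of TD errors backwards through time
    advantages = torch.zeros_like(td_errors)
    running = torch.zeros_like(td_errors[-1])
    for t in reversed(range(len(td_errors))):
        running = td_errors[t] + gamma * lam * running
        advantages[t] = running
    return advantages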
    def test_reinforce(self):
        def loss(log_probs):
            return -log_probs.mean()

        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
        actions = self.policy.no_grad(states).sample()

        # notice the values increase with each successive reinforce
        log_probs = self.policy(states).log_prob(actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.84, -0.62, -0.757]), decimal=3)
        self.policy.reinforce(loss(log_probs))
        log_probs = self.policy(states).log_prob(actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.811, -0.561, -0.701]), decimal=3)
        self.policy.reinforce(loss(log_probs))
        log_probs = self.policy(states).log_prob(actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.785, -0.51, -0.651]), decimal=3)
    def _terminal(self, state, reward):
        self._rewards.append(reward)
        features = State.array(self._features)
        rewards = torch.tensor(self._rewards, device=features.device)
        log_pis = torch.stack(self._log_pis)
        self._trajectories.append((features, rewards, log_pis))
        self._current_batch_size += len(features)
        self._features = []
        self._rewards = []
        self._log_pis = []

        if self._current_batch_size >= self.min_batch_size:
            self._train()

        # have to return something
        return self.policy.no_grad(self.features.no_grad(state)).sample()
Example #16
 def test_single_env(self):
     state = State(torch.randn(4))
     self.agent.act(state)
     tt.assert_allclose(self.test_agent.last_state.observation,
                        torch.tensor(
                            [0.3923, -0.2236, -0.3195, -1.2050, 0.]),
                        atol=1e-04)
     self.agent.act(state)
     tt.assert_allclose(self.test_agent.last_state.observation,
                        torch.tensor(
                            [0.3923, -0.2236, -0.3195, -1.2050, 1e-3]),
                        atol=1e-04)
     self.agent.act(state)
     tt.assert_allclose(self.test_agent.last_state.observation,
                        torch.tensor(
                            [0.3923, -0.2236, -0.3195, -1.2050, 2e-3]),
                        atol=1e-04)
Example #17
    def reset(self) -> State:
        """
        Reset the environment and return a new initial state.

        Returns
        -------
        State
            The initial state for the next episode.
        """
        # environment "state"
        self._state = State({
            'observation': torch.tensor([self._start_state]),  # pylint: disable=not-callable
            'reward': 0,
            'done': False
        })
        self._reward = 0
        self._action = None
        self._timestep = 0
        self._done = False
        return self._state
    def test_target_net(self):
        torch.manual_seed(2)
        model = nn.Sequential(nn.Linear(1, 1))
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        q = QNetwork(model, optimizer, target=FixedTarget(3))
        inputs = State(torch.tensor([1.]))

        def loss(policy_value):
            target = policy_value - 1
            return smooth_l1_loss(policy_value, target.detach())

        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.008584141731262207)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.20858412981033325)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.4085841178894043)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.6085841655731201)
        np.testing.assert_equal(target_value, -0.6085841655731201)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.8085841536521912)
        np.testing.assert_equal(target_value, -0.6085841655731201)
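A hedged reading of the expectations above: with FixedTarget(3), the target network holds its initial output through the first two updates and only matches the online network after the third reinforce() call, i.e. the target weights appear to be copied from the online network every third update.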
Example #19
 def test_auto_mask_true(self):
     observation = torch.randn(3, 4)
     state = State({'observation': observation, 'done': True, 'reward': 5.})
     self.assertEqual(state.mask, 0.)
 def reset(self):
     self._state = State.array([sub_env.reset() for sub_env in self._envs])
     return self._state
 def test_multi_action(self):
     states = State(torch.randn(3, STATE_DIM))
     actions = self.policy(states).sample()
     tt.assert_equal(actions, torch.tensor([2, 2, 0]))
Example #22
 def test_step_one(self):
     state = State(torch.randn(1, STATE_DIM))
     self.policy(state)
     self.policy.step()
 def _aggregate_states(self):
     return State.array([env.state for env in self._envs])
 def test_forward_state(self):
     inputs = State({
         'observation': torch.tensor([1, 2, 3])
     })
     outputs = self.model(inputs)
     self.assertEqual(inputs, outputs)
Example #25
 def reset(self):
     state = self._env.reset(), 0., False, None
     self._state = State.from_gym(state, dtype=self._env.observation_space.dtype, device=self._device)
     return self._state
Example #26
 def step(self, action):
     self._state = State.from_gym(self._env.step(self._convert(action)),
                                  dtype=self._env.observation_space.dtype,
                                  device=self._device)
     return self._state
 def test_apply_mask(self):
     observation = torch.randn(3, 4)
     state = State.from_gym((observation, 0., True, {}))
     tt.assert_equal(state.apply_mask(observation), torch.zeros(3, 4))
 def test_as_output(self):
     observation = torch.randn(3, 4)
     state = State(observation)
     tensor = torch.randn(1, 5, 3)
     self.assertEqual(state.as_output(tensor).shape, (5, 3))
 def test_as_input(self):
     observation = torch.randn(3, 4)
     state = State(observation)
     self.assertEqual(state.as_input('observation').shape, (1, 3, 4))
 def test_to_device(self):
     observation = torch.randn(3, 4)
     state = State(observation, device=torch.device('cpu'))
     state_cpu = state.to("cpu")
     self.assertTrue(torch.equal(state['observation'], state_cpu['observation']))
     self.assertFalse(state is state_cpu)