Example #1
    def test_discounted_transition(self):
        self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=3)

        self.source.n_step_buffer.append(self.experience01)
        self.source.n_step_buffer.append(self.experience02)
        self.source.n_step_buffer.append(self.experience03)

        reward, next_state, done = self.source.get_transition_info()

        reward_01 = self.experience02.reward + 0.9 * self.experience03.reward * (1 - done)
        reward_gt = self.experience01.reward + 0.9 * reward_01 * (1 - done)

        self.assertEqual(reward, reward_gt)
        self.assertEqual(next_state.all(), self.next_state_02.all())
        self.assertEqual(self.experience03.done, done)
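
The expected reward above is built by folding the buffered rewards back with a discount factor of 0.9. As a rough illustration of that folding (a minimal sketch, not the library's actual get_transition_info implementation; the function name, gamma value and arguments are assumptions for this example):

def discounted_n_step_reward(rewards, done=0.0, gamma=0.9):
    # Fold per-step rewards into a single n-step return, newest reward first,
    # mirroring the expectation built in test_discounted_transition:
    # r1 + gamma * (r2 + gamma * r3 * (1 - done)) * (1 - done)
    total = 0.0
    for r in reversed(rewards):
        total = r + gamma * total * (1 - done)
    return total

# With the fixtures above (rewards 0, 1, 1 and done = 0) this yields 0.9 * (1 + 0.9) = 1.71.
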
Example #2
    def __init__(self,
                 env: str,
                 gpus: int = 0,
                 eps_start: float = 1.0,
                 eps_end: float = 0.02,
                 eps_last_frame: int = 150000,
                 sync_rate: int = 1000,
                 gamma: float = 0.99,
                 learning_rate: float = 1e-4,
                 batch_size: int = 32,
                 replay_size: int = 100000,
                 warm_start_size: int = 10000,
                 num_samples: int = 500,
                 n_steps: int = 4):
        """
        PyTorch Lightning implementation of `N-Step DQN <http://incompleteideas.net/papers/sutton-88-with-erratum.pdf>`_

        Paper authors: Richard Sutton

        Model implemented by:

            - `Donal Byrne <https://github.com/djbyrne>`_

        Example:

            >>> from pl_bolts.models.rl.n_step_dqn.model import NStepDQN
            ...
            >>> model = NStepDQN("PongNoFrameskip-v4")

        Train::

            trainer = Trainer()
            trainer.fit(model)

        Args:
            env: gym environment tag
            gpus: number of gpus being used
            eps_start: starting value of epsilon for the epsilon-greedy exploration
            eps_end: final value of epsilon for the epsilon-greedy exploration
            eps_last_frame: the final frame for the decrease of epsilon. At this frame epsilon = eps_end
            sync_rate: the number of iterations between syncing up the target network with the train network
            gamma: discount factor
            learning_rate: learning rate
            batch_size: size of minibatch pulled from the DataLoader
            replay_size: total capacity of the replay buffer
            warm_start_size: how many random steps through the environment are carried out at the start of
                training to fill the buffer with a starting point
            num_samples: the number of samples to pull from the dataset iterator and feed to the DataLoader
            n_steps: number of steps to approximate and use in the Bellman update
        """
        super().__init__(env, gpus, eps_start, eps_end, eps_last_frame,
                         sync_rate, gamma, learning_rate, batch_size,
                         replay_size, warm_start_size, num_samples)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.source = NStepExperienceSource(self.env,
                                            self.agent,
                                            device,
                                            n_steps=n_steps)
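
Following the docstring, a typical training run might look like the snippet below. The hyperparameter values are purely illustrative and max_epochs is an assumption, not a recommendation:

import pytorch_lightning as pl
from pl_bolts.models.rl.n_step_dqn.model import NStepDQN

# Illustrative values only; the defaults shown in the constructor above work as well.
model = NStepDQN("PongNoFrameskip-v4", gamma=0.99, learning_rate=1e-4, n_steps=4)

trainer = pl.Trainer(max_epochs=1)
trainer.fit(model)
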
Example #3
    def test_multi_step_discount(self):
        self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=3)
        self.source.env.step = Mock(return_value=(self.next_state_02, self.reward_02, self.done_02, Mock()))

        self.source.n_step_buffer.append(self.experience01)
        self.source.n_step_buffer.append(self.experience02)

        reward_gt = 1.71

        exp, reward, done = self.source.step()

        self.assertEqual(exp[0].all(), self.experience01.state.all())
        self.assertEqual(exp[1], self.experience01.action)
        self.assertEqual(exp[2], reward_gt)
        self.assertEqual(exp[3], self.experience02.done)
        self.assertEqual(exp[4].all(), self.experience02.new_state.all())
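
The hard-coded reward_gt = 1.71 is presumably the three rewards (0, 1 and the mocked step's 1) folded with a discount of 0.9, the same factor used in test_discounted_transition. A quick check of that arithmetic, assuming gamma = 0.9 and done = 0 throughout:

gamma = 0.9
step_3 = 1.0                       # reward returned by the mocked env.step
step_2 = 1.0 + gamma * step_3      # experience02.reward folded in -> 1.9
step_1 = 0.0 + gamma * step_2      # experience01.reward folded in -> 1.71
print(step_1)                      # ~1.71
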
Example #4
    def setUp(self) -> None:
        self.net = Mock()
        self.agent = DummyAgent(net=self.net)
        self.env = gym.make("CartPole-v0")
        self.n_step = 2
        self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=self.n_step)

        self.state = np.zeros([32, 32])
        self.state_02 = np.ones([32, 32])
        self.next_state = np.zeros([32, 32])
        self.next_state_02 = np.ones([32, 32])
        self.action = np.zeros([1])
        self.action_02 = np.ones([1])
        self.reward = np.zeros([1])
        self.reward_02 = np.ones([1])
        self.done = np.zeros([1])
        self.done_02 = np.zeros([1])

        self.experience01 = Experience(self.state, self.action, self.reward, self.done, self.next_state)
        self.experience02 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
        self.experience03 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
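
Judging by how these fixtures are built and later accessed (.state, .action, .reward, .done, .new_state), Experience appears to be a simple five-field record. A minimal stand-in for experimenting with the tests in isolation could look like this; the field names and order are an assumption, not the actual pl_bolts definition:

from collections import namedtuple

# Hypothetical stand-in; the real Experience is imported from pl_bolts and may differ.
Experience = namedtuple("Experience", ["state", "action", "reward", "done", "new_state"])
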
Example #5
class TestNStepExperienceSource(TestCase):

    def setUp(self) -> None:
        self.net = Mock()
        self.agent = DummyAgent(net=self.net)
        self.env = gym.make("CartPole-v0")
        self.n_step = 2
        self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=self.n_step)

        self.state = np.zeros([32, 32])
        self.state_02 = np.ones([32, 32])
        self.next_state = np.zeros([32, 32])
        self.next_state_02 = np.ones([32, 32])
        self.action = np.zeros([1])
        self.action_02 = np.ones([1])
        self.reward = np.zeros([1])
        self.reward_02 = np.ones([1])
        self.done = np.zeros([1])
        self.done_02 = np.zeros([1])

        self.experience01 = Experience(self.state, self.action, self.reward, self.done, self.next_state)
        self.experience02 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
        self.experience03 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)

    def test_step(self):
        self.assertEqual(len(self.source.n_step_buffer), 0)
        exp, reward, done = self.source.step()
        self.assertEqual(len(exp), 5)
        self.assertEqual(len(self.source.n_step_buffer), self.n_step)

    def test_multi_step(self):
        self.source.env.step = Mock(return_value=(self.next_state_02, self.reward_02, self.done_02, Mock()))
        self.source.n_step_buffer.append(self.experience01)
        self.source.n_step_buffer.append(self.experience01)

        exp, reward, done = self.source.step()

        next_state = exp[4]
        self.assertEqual(next_state.all(), self.next_state_02.all())

    def test_discounted_transition(self):
        self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=3)

        self.source.n_step_buffer.append(self.experience01)
        self.source.n_step_buffer.append(self.experience02)
        self.source.n_step_buffer.append(self.experience03)

        reward, next_state, done = self.source.get_transition_info()

        reward_01 = self.experience02.reward + 0.9 * self.experience03.reward * (1 - done)
        reward_gt = self.experience01.reward + 0.9 * reward_01 * (1 - done)

        self.assertEqual(reward, reward_gt)
        self.assertEqual(next_state.all(), self.next_state_02.all())
        self.assertEqual(self.experience03.done, done)

    def test_multi_step_discount(self):
        self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=3)
        self.source.env.step = Mock(return_value=(self.next_state_02, self.reward_02, self.done_02, Mock()))

        self.source.n_step_buffer.append(self.experience01)
        self.source.n_step_buffer.append(self.experience02)

        reward_gt = 1.71

        exp, reward, done = self.source.step()

        self.assertEqual(exp[0].all(), self.experience01.state.all())
        self.assertEqual(exp[1], self.experience01.action)
        self.assertEqual(exp[2], reward_gt)
        self.assertEqual(exp[3], self.experience02.done)
        self.assertEqual(exp[4].all(), self.experience02.new_state.all())