    def setUp(self) -> None:
        self.gamma = 0.9
        self.buffer = MultiStepBuffer(capacity=10, n_steps=2, gamma=self.gamma)

        self.state = np.zeros([32, 32])
        self.state_02 = np.ones([32, 32])
        self.next_state = np.zeros([32, 32])
        self.next_state_02 = np.ones([32, 32])
        self.action = np.zeros([1])
        self.action_02 = np.ones([1])
        self.reward = np.zeros([1])
        self.reward_02 = np.ones([1])
        self.done = np.zeros([1])
        self.done_02 = np.zeros([1])

        self.experience01 = Experience(self.state, self.action, self.reward, self.done, self.next_state)
        self.experience02 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
        self.experience03 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
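
A sketch of a check this fixture supports, assuming MultiStepBuffer aggregates the last n_steps rewards as reward + gamma * next_reward and that `sample` returns field-wise batches (both API details are assumptions):

    def test_two_step_reward_sketch(self) -> None:
        # Hypothetical: after two appends the buffer holds one aggregated
        # transition whose reward is 0 + 0.9 * 1 = 0.9.
        self.buffer.append(self.experience01)
        self.buffer.append(self.experience02)
        expected = self.reward + self.gamma * self.reward_02
        _, _, rewards, _, _ = self.buffer.sample(1)  # return shape assumed
        self.assertAlmostEqual(float(rewards[0]), float(expected))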
    def setUp(self) -> None:
        self.buffer = PERBuffer(10)

        self.state = np.random.rand(32, 32)
        self.next_state = np.random.rand(32, 32)
        self.action = np.ones([1])
        self.reward = np.ones([1])
        self.done = np.zeros([1])
        self.experience = Experience(self.state, self.action, self.reward, self.done, self.next_state)
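
The `train_batch` generator later in this section unpacks `buffer.sample(...)` into (samples, indices, weights); a sketch of a shape check built on that assumption:

    def test_per_sample_shapes_sketch(self) -> None:
        # Fill the buffer first so sampling is well-defined.
        for _ in range(4):
            self.buffer.append(self.experience)
        samples, indices, weights = self.buffer.sample(4)
        self.assertEqual(len(indices), 4)
        self.assertEqual(len(weights), 4)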
    def train_batch(
        self,
    ) -> Iterator[Tuple]:  # a generator, so annotate with Iterator (from typing)
        """
        Contains the logic for generating a new batch of data to be passed to the DataLoader
        Returns:
            yields a Experience tuple containing the state, action, reward, done and next_state.
        """

        episode_reward = 0
        episode_steps = 0

        while True:
            self.total_steps += 1
            action = self.agent(self.state, self.device)

            next_state, reward, is_done, _ = self.env.step(action[0])

            episode_reward += reward
            episode_steps += 1

            exp = Experience(
                state=self.state,
                action=action[0],
                reward=reward,
                done=is_done,
                new_state=next_state,
            )

            self.agent.update_epsilon(self.global_step)
            self.buffer.append(exp)
            self.state = next_state

            if is_done:
                self.done_episodes += 1
                self.total_rewards.append(episode_reward)
                self.total_episode_steps.append(episode_steps)
                self.avg_rewards = float(
                    np.mean(self.total_rewards[-self.avg_reward_len:])
                )
                self.state = self.env.reset()
                episode_steps = 0
                episode_reward = 0

            samples, indices, weights = self.buffer.sample(self.batch_size)

            states, actions, rewards, dones, new_states = samples

            for idx, _ in enumerate(dones):
                yield (
                    states[idx],
                    actions[idx],
                    rewards[idx],
                    dones[idx],
                    new_states[idx],
                ), indices[idx], weights[idx]
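
How this generator reaches the DataLoader is not shown here; a minimal sketch, assuming the model wraps `train_batch` in an IterableDataset (the wrapper class name is illustrative):

from torch.utils.data import DataLoader, IterableDataset


class ExperienceIterableDataset(IterableDataset):
    """Re-yields samples produced by a generator function such as `train_batch`."""

    def __init__(self, generate_batch) -> None:
        self.generate_batch = generate_batch

    def __iter__(self):
        # Each epoch re-invokes the generator, which keeps stepping the env.
        return iter(self.generate_batch())


# Illustrative wiring inside a train_dataloader() hook:
# dataset = ExperienceIterableDataset(self.train_batch)
# return DataLoader(dataset=dataset, batch_size=self.batch_size)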
    def setUp(self) -> None:
        self.net = Mock()
        self.agent = DummyAgent(net=self.net)
        self.env = gym.make("CartPole-v0")
        self.n_step = 2
        self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=self.n_step)

        self.state = np.zeros([32, 32])
        self.state_02 = np.ones([32, 32])
        self.next_state = np.zeros([32, 32])
        self.next_state_02 = np.ones([32, 32])
        self.action = np.zeros([1])
        self.action_02 = np.ones([1])
        self.reward = np.zeros([1])
        self.reward_02 = np.ones([1])
        self.done = np.zeros([1])
        self.done_02 = np.zeros([1])

        self.experience01 = Experience(self.state, self.action, self.reward, self.done, self.next_state)
        self.experience02 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
        self.experience03 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
    def setUp(self) -> None:
        self.buffer = MultiStepBuffer(buffer_size=10, n_step=2)

        self.state = np.zeros([32, 32])
        self.state_02 = np.ones([32, 32])
        self.next_state = np.zeros([32, 32])
        self.next_state_02 = np.ones([32, 32])
        self.action = np.zeros([1])
        self.action_02 = np.ones([1])
        self.reward = np.zeros([1])
        self.reward_02 = np.ones([1])
        self.done = np.zeros([1])
        self.done_02 = np.zeros([1])

        self.experience01 = Experience(self.state, self.action, self.reward,
                                       self.done, self.next_state)
        self.experience02 = Experience(self.state_02, self.action_02,
                                       self.reward_02, self.done_02,
                                       self.next_state_02)
        self.experience03 = Experience(self.state_02, self.action_02,
                                       self.reward_02, self.done_02,
                                       self.next_state_02)
    def setUp(self) -> None:
        self.state = np.random.rand(32, 32)
        self.next_state = np.random.rand(32, 32)
        self.action = np.ones([1])
        self.reward = np.ones([1])
        self.done = np.zeros([1])
        self.experience = Experience(self.state, self.action, self.reward, self.done, self.next_state)

        self.source = Mock()
        self.source.step = Mock(return_value=(self.experience, torch.tensor(0), False))
        self.warm_start = 10
        self.buffer = ReplayBuffer(20)
        for _ in range(self.warm_start):
            self.buffer.append(self.experience)
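
A follow-on assertion this warm-started fixture supports, assuming `ReplayBuffer.sample(batch_size)` returns field-wise stacked arrays (signature assumed):

    def test_sample_after_warm_start_sketch(self) -> None:
        # 10 identical experiences were appended above, so sampling 4 is valid.
        states, actions, rewards, dones, next_states = self.buffer.sample(4)
        self.assertEqual(len(states), 4)
        self.assertEqual(states[0].shape, (32, 32))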
    def setUp(self) -> None:
        self.state = np.random.rand(4, 84, 84)
        self.next_state = np.random.rand(4, 84, 84)
        self.action = np.ones([1])
        self.reward = np.ones([1])
        self.done = np.zeros([1])
        self.experience = Experience(self.state, self.action, self.reward, self.done, self.next_state)
        self.source = Mock()
        self.source.step = Mock(return_value=(self.experience, torch.tensor(0), False))
        self.batch_size = 8
        self.buffer = Buffer(8)

        for _ in range(self.batch_size):
            self.buffer.append(self.experience)
    def step(self, device: torch.device) -> Tuple[Experience, float, bool]:
        """Takes a single step through the environment"""
        action = self.agent(self.state, device)
        new_state, reward, done, _ = self.env.step(action)
        experience = Experience(
            state=self.state,
            action=action,
            reward=reward,
            new_state=new_state,
            done=done,
        )
        self.state = new_state

        if done:
            self.state = self.env.reset()

        return experience, reward, done
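
A sketch of the driver loop such a `step` supports; `source` stands in for any object exposing this method, and the episode bookkeeping is illustrative:

device = torch.device("cpu")
episode_reward = 0.0
for _ in range(1000):
    experience, reward, done = source.step(device)
    episode_reward += reward
    if done:
        # `step` has already reset the environment internally.
        episode_reward = 0.0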
    def step(self, device: torch.device) -> Experience:
        """Carries out a single step in the environment"""
        action = self.agent(self.state, device)
        new_state, reward, done, _ = self.env.step(action)
        experience = Experience(
            state=self.state,
            action=action,
            reward=reward,
            new_state=new_state,
            done=done,
        )
        self.state = new_state

        if done:
            self.state = self.env.reset()

        return experience
    def step(self, device: torch.device) -> Tuple[Experience, float, bool]:
        """
        Takes an n-step in the environment

        Returns:
            Experience
        """
        exp = self.single_step(device)

        while len(self.n_step_buffer) < self.n_steps:
            self.single_step(device)

        reward, next_state, done = self.get_transition_info()
        first_experience = self.n_step_buffer[0]
        multi_step_experience = Experience(first_experience.state,
                                           first_experience.action, reward,
                                           done, next_state)

        return multi_step_experience, exp.reward, exp.done
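
`get_transition_info` is not shown here; a common implementation folds the buffered rewards back to front with the discount factor, truncating at any terminal flag inside the window. A sketch under that assumption (the gamma default is illustrative):

    def get_transition_info(self, gamma: float = 0.9) -> Tuple[float, np.ndarray, bool]:
        """Sketch: derive (reward, next_state, done) from self.n_step_buffer."""
        last_experience = self.n_step_buffer[-1]
        reward = last_experience.reward
        next_state = last_experience.new_state
        done = last_experience.done
        # Walk backwards, discounting as we go; a terminal step restarts the
        # accumulation so the transition never crosses an episode boundary.
        for experience in reversed(list(self.n_step_buffer)[:-1]):
            if experience.done:
                reward = experience.reward
                next_state = experience.new_state
                done = experience.done
            else:
                reward = experience.reward + gamma * reward
        return reward, next_state, done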
    def test_train_batch(self):
        state = np.random.rand(4, 84, 84)
        self.source = Mock()
        exp = Experience(state=state,
                         action=0,
                         reward=5,
                         done=False,
                         new_state=state)
        self.source.step = Mock(return_value=(exp, 1, False))
        self.model.source = self.source

        xp_dataloader = self.model.train_dataloader()

        for i_batch, batch in enumerate(xp_dataloader):
            self.assertEqual(len(batch), 3)
            self.assertEqual(len(batch[0]), self.model.batch_size)
            self.assertTrue(isinstance(batch, list))
            self.assertEqual(self.model.baseline, 5)
            self.assertIsInstance(batch[0], torch.Tensor)
            self.assertIsInstance(batch[1], torch.Tensor)
            self.assertIsInstance(batch[2], torch.Tensor)
            break