Example #1
class TestPrioritizedReplayBuffer(unittest.TestCase):
    def setUp(self):
        random.seed(1)
        np.random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = PrioritizedReplayBuffer(5, 0.6)

    def test_run(self):
        states = StateArray(torch.arange(0, 20), (20,), reward=torch.arange(-1, 19).float())
        actions = torch.arange(0, 20).view((-1, 1))
        expected_samples = State(
            torch.tensor(
                [
                    [0, 1, 2],
                    [0, 1, 3],
                    [5, 5, 5],
                    [6, 6, 2],
                    [7, 7, 7],
                    [7, 8, 8],
                    [7, 7, 7],
                ]
            )
        )
        expected_weights = [
            [1.0000, 1.0000, 1.0000],
            [0.5659, 0.7036, 0.5124],
            [0.0631, 0.0631, 0.0631],
            [0.0631, 0.0631, 0.1231],
            [0.0631, 0.0631, 0.0631],
            [0.0776, 0.0631, 0.0631],
            [0.0866, 0.0866, 0.0866],
        ]
        actual_samples = []
        actual_weights = []
        for i in range(10):
            self.replay_buffer.store(states[i], actions[i], states[i + 1])
            if i > 2:
                sample = self.replay_buffer.sample(3)
                sample_states = sample[0].observation
                self.replay_buffer.update_priorities(torch.randn(3))
                actual_samples.append(sample_states)
                actual_weights.append(sample[-1])

        actual_samples = State(torch.cat(actual_samples).view((-1, 3)))
        self.assert_states_equal(actual_samples, expected_samples)
        np.testing.assert_array_almost_equal(
            expected_weights, np.vstack(actual_weights), decimal=3
        )

    def assert_states_equal(self, actual, expected):
        tt.assert_almost_equal(actual.observation, expected.observation)
        self.assertEqual(actual.mask, expected.mask)
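Stripped of the unittest scaffolding, the cycle this test exercises is store, sample, then update_priorities with the TD errors of the sampled batch. A minimal sketch follows; the import paths (all.core, all.memory) and the random stand-in for TD errors are assumptions, while the call signatures and the tuple layout of sample (states first, importance-sampling weights last) mirror the test above.

import torch
from all.core import StateArray
from all.memory import PrioritizedReplayBuffer

# Sketch of the store/sample/update cycle from the test above.
# Import paths and the random stand-in for TD errors are assumptions.
buffer = PrioritizedReplayBuffer(5, 0.6)                # capacity 5, alpha = 0.6
states = StateArray(torch.arange(0, 20), (20,), reward=torch.arange(-1, 19).float())
actions = torch.arange(0, 20).view((-1, 1))

for i in range(10):
    buffer.store(states[i], actions[i], states[i + 1])  # store one transition
    if i > 2:
        batch = buffer.sample(3)                        # batch[0]: states, batch[-1]: IS weights
        buffer.update_priorities(torch.rand(3))         # normally |TD error| per sampled transition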
Example #2
 def _ddqn(env, writer=DummyWriter()):
     model = dueling_fc_relu_q(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  target=FixedTarget(target_update_frequency),
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                             alpha=alpha,
                                             beta=beta,
                                             device=device)
     return DDQN(q,
                 policy,
                 replay_buffer,
                 discount_factor=discount_factor,
                 replay_start_size=replay_start_size,
                 update_frequency=update_frequency,
                 minibatch_size=minibatch_size)
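This preset closes over hyperparameters defined elsewhere (learning rate, exploration schedule, replay settings, device, and the dueling_fc_relu_q constructor). Purely illustrative values, shown only to make the free names explicit and not taken from the library's defaults:

# Illustrative values only; the real preset defines these elsewhere.
device = "cpu"
lr = 1e-3
target_update_frequency = 100
initial_exploration = 1.00
final_exploration = 0.02
final_exploration_frame = 10000
replay_start_size = 1000
replay_buffer_size = 10000
alpha = 0.6               # how strongly TD error shapes the sampling distribution
beta = 0.4                # initial importance-sampling correction exponent
discount_factor = 0.99
update_frequency = 1
minibatch_size = 64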
Example #3
    def _ddqn(env, writer=DummyWriter()):
        action_repeat = 1
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = model_constructor(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QNetwork(model,
                     optimizer,
                     scheduler=CosineAnnealingLR(optimizer, last_update),
                     target=FixedTarget(target_update_frequency),
                     writer=writer)
        policy = SharedAutonomyPolicy(q,
                                      env.action_space.n,
                                      epsilon=0,
                                      pilot_tol=pilot_tol)

        if prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                                    alpha=alpha,
                                                    beta=beta,
                                                    device=device)
        else:
            replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                                   device=device)

        return co_DDQN(q,
                       policy,
                       replay_buffer,
                       loss=weighted_smooth_l1_loss,
                       discount_factor=discount_factor,
                       minibatch_size=minibatch_size,
                       replay_start_size=replay_start_size,
                       update_frequency=update_frequency)
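The loss=weighted_smooth_l1_loss argument is what connects the prioritized buffer to the learner: the importance-sampling weights returned with each sampled batch scale the per-transition Huber loss before it is averaged (the plain buffer in the else branch presumably supplies uniform weights). The library's own implementation is not shown in this snippet; a minimal sketch of the idea, assuming the loss receives per-sample weights alongside predictions and targets:

import torch.nn.functional as F

def weighted_smooth_l1_loss(input, target, weight):
    # Per-sample smooth L1 (Huber) loss, scaled by the importance-sampling
    # weights from prioritized replay, then averaged. A sketch, not the
    # library's actual code.
    loss = F.smooth_l1_loss(input, target, reduction="none")
    return (weight * loss).mean()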
Example #4
 def _rainbow(env, writer=DummyWriter()):
     model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QDist(
         model,
         optimizer,
         env.action_space.n,
         atoms,
         v_min=v_min,
         v_max=v_max,
         writer=writer,
     )
     # replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
     replay_buffer = PrioritizedReplayBuffer(
         replay_buffer_size,
         alpha=alpha,
         beta=beta,
         device=device
     )
     replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
     return Rainbow(
         q,
         replay_buffer,
         exploration=0.,
         discount_factor=discount_factor ** n_steps,
         minibatch_size=minibatch_size,
         replay_start_size=replay_start_size,
         update_frequency=update_frequency,
         writer=writer,
     )
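Wrapping the prioritized buffer in NStepReplayBuffer makes each stored transition span n_steps of experience with the intermediate rewards already discounted, so the agent bootstraps from the state n_steps ahead and must therefore use discount_factor ** n_steps instead of the raw discount. A toy check of that accounting, with made-up numbers:

# Toy check of the n-step discount accounting (illustrative values).
discount_factor = 0.99
n_steps = 3
rewards = [1.0, 0.5, 2.0]   # rewards collected over the n intermediate steps
bootstrap_value = 10.0      # value estimate at the state n steps ahead

n_step_return = sum(discount_factor ** k * r for k, r in enumerate(rewards))
target = n_step_return + discount_factor ** n_steps * bootstrap_value
print(round(target, 4))     # 13.1582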
Example #5
 def _rainbow(env, writer=DummyWriter()):
     model = build_model(env, sigma_init).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(
         model,
         optimizer,
         env.action_space.n,
         target_update_frequency=target_update_frequency,
         loss=mse_loss,
         writer=writer
     )
     policy = GreedyPolicy(
         q,
         env.action_space.n,
         initial_epsilon=1,
         final_epsilon=0,
         annealing_start=replay_start_size,
         annealing_time=1
     )
     # replay_buffer = ExperienceReplayBuffer(replay_buffer_size)
     replay_buffer = PrioritizedReplayBuffer(
         replay_buffer_size,
         alpha=alpha,
         beta=beta,
         final_beta_frame=final_beta_frame,
         device=device
     )
     return DQN(q, policy, replay_buffer,
                discount_factor=discount_factor,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size)
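This older version of the buffer also takes final_beta_frame, which suggests the importance-sampling exponent beta is annealed internally from its initial value toward 1.0 (full correction) by that frame. A guess at such a schedule, not the library's code:

def annealed_beta(frame, initial_beta, final_beta_frame):
    # Linear anneal from initial_beta at frame 0 to 1.0 at final_beta_frame,
    # held constant afterwards. An assumption about what final_beta_frame
    # controls, not the library's implementation.
    fraction = min(frame / final_beta_frame, 1.0)
    return initial_beta + fraction * (1.0 - initial_beta)

# annealed_beta(0, 0.4, 100000)      -> 0.4
# annealed_beta(50000, 0.4, 100000)  -> 0.7
# annealed_beta(200000, 0.4, 100000) -> 1.0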
Example #6
class TestPrioritizedReplayBuffer(unittest.TestCase):
    def setUp(self):
        random.seed(1)
        np.random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = PrioritizedReplayBuffer(5, 0.6)

    def test_run(self):
        states = State(torch.arange(0, 20))
        actions = torch.arange(0, 20)
        rewards = torch.arange(0, 20)
        expected_samples = State(
            torch.tensor([
                [0, 2, 2],
                [0, 1, 1],
                [3, 3, 5],
                [5, 3, 6],
                [3, 5, 7],
                [8, 5, 8],
                [8, 5, 5],
            ]))
        expected_weights = [[1., 1., 1.], [0.56589746, 0.5124394, 0.5124394],
                            [0.5124343, 0.5124343, 0.5124343],
                            [0.5090894, 0.6456939, 0.46323255],
                            [0.51945686, 0.5801515, 0.45691562],
                            [0.45691025, 0.5096957, 0.45691025],
                            [0.5938914, 0.6220026, 0.6220026]]
        actual_samples = []
        actual_weights = []
        for i in range(10):
            self.replay_buffer.store(states[i], actions[i], rewards[i],
                                     states[i + 1])
            if i > 2:
                sample = self.replay_buffer.sample(3)
                sample_states = sample[0].features
                self.replay_buffer.update_priorities(torch.randn(3))
                actual_samples.append(sample_states)
                actual_weights.append(sample[-1])

        actual_samples = State(torch.cat(actual_samples).view((-1, 3)))
        self.assert_states_equal(actual_samples, expected_samples)
        np.testing.assert_array_almost_equal(expected_weights,
                                             np.vstack(actual_weights))

    def assert_states_equal(self, actual, expected):
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
Example #7
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q_dist = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            writer=writer,
        )

        replay_buffer = NStepReplayBuffer(
            self.hyperparameters['n_steps'],
            self.hyperparameters['discount_factor'],
            PrioritizedReplayBuffer(
                self.hyperparameters['replay_buffer_size'],
                alpha=self.hyperparameters['alpha'],
                beta=self.hyperparameters['beta'],
                device=self.device
            )
        )
        def agent_constructor(writer):
            return DeepmindAtariBody(
                Rainbow(
                    q_dist,
                    replay_buffer,
                    exploration=LinearScheduler(
                        self.hyperparameters['initial_exploration'],
                        self.hyperparameters['final_exploration'],
                        0,
                        train_steps - self.hyperparameters['replay_start_size'],
                        name="exploration",
                        writer=writer
                    ),
                    discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
                    minibatch_size=self.hyperparameters['minibatch_size'],
                    replay_start_size=self.hyperparameters['replay_start_size'],
                    update_frequency=self.hyperparameters['update_frequency'],
                    writer=writer,
                ),
                lazy_frames=True,
                episodic_lives=True
            )

        # `env`, `writers`, and `device` are not defined in this method; in the
        # original source they presumably come from the enclosing scope.
        return MultiagentEncoder(IndependentMultiagent({
            agent: agent_constructor(writers[agent])
            for agent in env.agents
        }), env.agents, device)
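The cosine learning-rate schedule is stretched over the number of optimizer updates the run will actually perform: no updates happen before replay_start_size steps, and after that only every update_frequency-th step triggers one, so n_updates becomes CosineAnnealingLR's T_max. With made-up numbers:

# Illustrative numbers only.
train_steps = 1_000_000
replay_start_size = 80_000
update_frequency = 4

n_updates = (train_steps - replay_start_size) / update_frequency
print(n_updates)   # 230000.0 optimizer steps over which the learning rate decays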
Example #8
    def _ddqn(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = nature_ddqn(env).to(device)
        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )
        q = QNetwork(
            model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            target=FixedTarget(target_update_frequency),
            writer=writer
        )
        policy = GreedyPolicy(
            q,
            env.action_space.n,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_step - replay_start_size,
                name="epsilon",
                writer=writer
            )
        )
        replay_buffer = PrioritizedReplayBuffer(
            replay_buffer_size,
            alpha=alpha,
            beta=beta,
            device=device
        )
        return DeepmindAtariBody(
            DDQN(q, policy, replay_buffer,
                 loss=weighted_smooth_l1_loss,
                 discount_factor=discount_factor,
                 minibatch_size=minibatch_size,
                 replay_start_size=replay_start_size,
                 update_frequency=update_frequency,
                ),
            lazy_frames=True
        )
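Because each chosen action is repeated over action_repeat = 4 frames, hyperparameters quoted in frames are divided by action_repeat to get agent timesteps; the epsilon schedule then starts at replay_start_size and runs for final_exploration_step - replay_start_size timesteps. With made-up numbers:

# Illustrative numbers only.
action_repeat = 4
last_frame = 40_000_000
final_exploration_frame = 4_000_000
replay_start_size = 80_000
update_frequency = 4

last_timestep = last_frame / action_repeat                             # 10,000,000 agent steps
last_update = (last_timestep - replay_start_size) / update_frequency   # 2,480,000 optimizer steps
final_exploration_step = final_exploration_frame / action_repeat       # 1,000,000 agent steps
epsilon_anneal_length = final_exploration_step - replay_start_size     # 920,000 agent steps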
Example #9
 def _rainbow(env, writer=DummyWriter()):
     _model = model
     _optimizer = optimizer
     if _model is None:
         _model = dueling_conv_net(
             env, frames=agent_history_length).to(device)
     if _optimizer is None:
         _optimizer = Adam(
             _model.parameters(),
             lr=lr,
             eps=eps
         )
     q = QNetwork(
         _model,
         _optimizer,
         env.action_space.n,
         target_update_frequency=target_update_frequency,
         loss=smooth_l1_loss,
         writer=writer
     )
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           annealing_start=replay_start_size,
                           annealing_time=final_exploration_frame - replay_start_size,
                           initial_epsilon=initial_exploration,
                           final_epsilon=final_exploration
                           )
     replay_buffer = PrioritizedReplayBuffer(
         replay_buffer_size,
         alpha=alpha,
         beta=beta,
         final_beta_frame=final_beta_frame,
         device=device
     )
     return DeepmindAtariBody(
         DQN(q, policy, replay_buffer,
             discount_factor=discount_factor,
             minibatch_size=minibatch_size,
             replay_start_size=replay_start_size,
             update_frequency=update_frequency,
             ),
         env,
         action_repeat=action_repeat,
         frame_stack=agent_history_length,
         noop_max=noop_max
     )
Example #10
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q_dist = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            writer=writer,
        )

        replay_buffer = NStepReplayBuffer(
            self.hyperparameters['n_steps'],
            self.hyperparameters['discount_factor'],
            PrioritizedReplayBuffer(
                self.hyperparameters['replay_buffer_size'],
                alpha=self.hyperparameters['alpha'],
                beta=self.hyperparameters['beta'],
                device=self.device
            )
        )

        return Rainbow(
            q_dist,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                train_steps - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            ),
            discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
            writer=writer,
        )
Example #11
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']
                     ) / self.hyperparameters['update_frequency']

        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'],
                         eps=self.hyperparameters['eps'])

        q = QNetwork(self.model,
                     optimizer,
                     scheduler=CosineAnnealingLR(optimizer, n_updates),
                     target=FixedTarget(
                         self.hyperparameters['target_update_frequency']),
                     writer=writer)

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] -
                self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer))

        replay_buffer = PrioritizedReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            alpha=self.hyperparameters['alpha'],
            beta=self.hyperparameters['beta'],
            device=self.device)

        return DeepmindAtariBody(
            DDQN(
                q,
                policy,
                replay_buffer,
                loss=weighted_smooth_l1_loss,
                discount_factor=self.hyperparameters["discount_factor"],
                minibatch_size=self.hyperparameters["minibatch_size"],
                replay_start_size=self.hyperparameters["replay_start_size"],
                update_frequency=self.hyperparameters["update_frequency"],
            ),
            lazy_frames=True,
        )
Example #12
    def _rainbow(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency

        model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QDist(
            model,
            optimizer,
            env.action_space.n,
            atoms,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            v_min=v_min,
            v_max=v_max,
            target=FixedTarget(target_update_frequency),
            writer=writer,
        )
        replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                                alpha=alpha,
                                                beta=beta,
                                                device=device)
        replay_buffer = NStepReplayBuffer(n_steps, discount_factor,
                                          replay_buffer)

        agent = Rainbow(
            q,
            replay_buffer,
            exploration=LinearScheduler(initial_exploration,
                                        final_exploration,
                                        0,
                                        last_timestep,
                                        name='exploration',
                                        writer=writer),
            discount_factor=discount_factor**n_steps,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer,
        )
        return DeepmindAtariBody(agent, lazy_frames=True, episodic_lives=True)
Example #13
 def setUp(self):
     random.seed(1)
     np.random.seed(1)
     torch.manual_seed(1)
     self.replay_buffer = PrioritizedReplayBuffer(5, 0.6)