class TestExperienceReplayBuffer(unittest.TestCase):
    def setUp(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = ExperienceReplayBuffer(5)

    def test_run(self):
        states = torch.arange(0, 20)
        actions = torch.arange(0, 20)
        rewards = torch.arange(0, 20)
        expected_samples = torch.tensor([[0, 0, 0], [1, 1, 0], [0, 1, 1],
                                         [3, 0, 0], [1, 4, 4], [1, 2, 4],
                                         [2, 4, 3], [4, 7, 4], [7, 4, 6],
                                         [6, 5, 6]])
        expected_weights = np.ones((10, 3))
        actual_samples = []
        actual_weights = []
        for i in range(10):
            state = State(states[i].unsqueeze(0), torch.tensor([1]))
            next_state = State(states[i + 1].unsqueeze(0), torch.tensor([1]))
            self.replay_buffer.store(state, actions[i], rewards[i], next_state)
            sample = self.replay_buffer.sample(3)
            actual_samples.append(sample[0].features)
            actual_weights.append(sample[-1])
        tt.assert_equal(
            torch.cat(actual_samples).view(expected_samples.shape),
            expected_samples)
        np.testing.assert_array_equal(expected_weights,
                                      np.vstack(actual_weights))
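These snippets mix two generations of the buffer API: the test above stores (state, action, reward, next_state) and reads sample[0].features, while later examples store (state, action, next_state) with the reward attached to the next State and read sample[0].observation. The store/sample contract is the same in both. A minimal, illustrative sketch of that contract (the class name is hypothetical; this is not the library's implementation):

# Illustrative sketch only, not the autonomous-learning-library implementation.
# It shows the FIFO store/sample contract the tests in this listing exercise.
import random
from collections import deque


class MinimalReplayBuffer:
    def __init__(self, size):
        # the oldest transitions are evicted once the buffer holds `size` items
        self.buffer = deque(maxlen=size)

    def store(self, *transition):
        # accepts either API generation: (state, action, reward, next_state),
        # or (state, action, next_state) with the reward carried by the State
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform sampling with replacement, so a partially filled buffer
        # can still return a full minibatch
        return [random.choice(self.buffer) for _ in range(batch_size)]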
Example 2
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        q = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(
                self.hyperparameters['target_update_frequency']),
            writer=writer,
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'], device=self.device)

        return C51(q,
                   replay_buffer,
                   exploration=LinearScheduler(
                       self.hyperparameters['initial_exploration'],
                       self.hyperparameters['final_exploration'],
                       0,
                       self.hyperparameters["final_exploration_step"] -
                       self.hyperparameters["replay_start_size"],
                       name="epsilon",
                       writer=writer,
                   ),
                   discount_factor=self.hyperparameters["discount_factor"],
                   minibatch_size=self.hyperparameters["minibatch_size"],
                   replay_start_size=self.hyperparameters["replay_start_size"],
                   update_frequency=self.hyperparameters["update_frequency"],
                   writer=writer)
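The preset-style agent() methods in this listing return a fully wired agent whose replay buffer, policy, and update schedule are already connected. A hypothetical driver loop is sketched below for orientation; the preset object, the environment wrapper, and the act() signature are assumptions for illustration, not taken from these snippets.

# Hypothetical usage sketch; `preset`, `env`, and the act() signature are
# assumptions for illustration, not confirmed by the snippets in this listing.
def run_episodes(preset, env, n_episodes=10):
    agent = preset.agent()             # e.g. the C51 preset defined above
    for _ in range(n_episodes):
        state = env.reset()
        while not state.done:
            action = agent.act(state)  # the agent stores transitions and learns internally
            state = env.step(action)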
Example 3
    def _dqn(env, writer=DummyWriter()):
        _model = nature_dqn(env).to(device)
        _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
        q = QNetwork(_model,
                     _optimizer,
                     env.action_space.n,
                     target=FixedTarget(target_update_frequency),
                     loss=smooth_l1_loss,
                     writer=writer)
        policy = GreedyPolicy(q,
                              env.action_space.n,
                              epsilon=LinearScheduler(initial_exploration,
                                                      final_exploration,
                                                      replay_start_size,
                                                      final_exploration_frame,
                                                      name="epsilon",
                                                      writer=writer))
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)
        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
            ), )
Example 4
    def _dqn(env, writer=DummyWriter()):
        model = fc_relu_q(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr)
        q = QNetwork(model,
                     optimizer,
                     env.action_space.n,
                     target=FixedTarget(target_update_frequency),
                     loss=mse_loss,
                     writer=writer)
        policy = GreedyPolicy(q,
                              env.action_space.n,
                              epsilon=LinearScheduler(initial_exploration,
                                                      final_exploration,
                                                      replay_start_size,
                                                      final_exploration_frame,
                                                      name="epsilon",
                                                      writer=writer))
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)
        return DQN(q,
                   policy,
                   replay_buffer,
                   discount_factor=discount_factor,
                   replay_start_size=replay_start_size,
                   update_frequency=update_frequency,
                   minibatch_size=minibatch_size)
Example 5
    def _c51(env, writer=DummyWriter()):
        model = fc_relu_dist_q(env, atoms=atoms).to(device)
        optimizer = Adam(model.parameters(), lr=lr)
        q = QDist(
            model,
            optimizer,
            env.action_space.n,
            atoms,
            v_min=v_min,
            v_max=v_max,
            writer=writer,
        )
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)
        return C51(q,
                   replay_buffer,
                   exploration=LinearScheduler(
                       initial_exploration,
                       final_exploration,
                       replay_start_size,
                       final_exploration_frame,
                       name="epsilon",
                       writer=writer,
                   ),
                   discount_factor=discount_factor,
                   minibatch_size=minibatch_size,
                   replay_start_size=replay_start_size,
                   update_frequency=update_frequency,
                   writer=writer)
Example 6
    def test_store_device(self):
        if torch.cuda.is_available():
            self.replay_buffer = ExperienceReplayBuffer(5,
                                                        device='cuda',
                                                        store_device='cpu')

            states = torch.arange(0, 20).to('cuda')
            actions = torch.arange(0, 20).view((-1, 1)).to('cuda')
            rewards = torch.arange(0, 20).to('cuda')
            state = State(states[0])
            next_state = State(states[1], reward=rewards[1])
            self.replay_buffer.store(state, actions[0], next_state)
            sample = self.replay_buffer.sample(3)
            self.assertEqual(sample[0].device, torch.device('cuda'))
            self.assertEqual(self.replay_buffer.buffer[0][0].device,
                             torch.device('cpu'))
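The test above checks the split that store_device introduces: transitions are kept on the storage device (CPU here) while the sampled minibatch is returned on the compute device. A simplified sketch of that idea, with hypothetical names and independent of the library's internals:

# Simplified illustration of the store_device / device split; the class and
# its internals are hypothetical, not the library's API.
import torch


class DeviceAwareBuffer:
    def __init__(self, base_buffer, device='cuda', store_device='cpu'):
        self.base = base_buffer                  # e.g. a MinimalReplayBuffer
        self.device = torch.device(device)
        self.store_device = torch.device(store_device)

    def store(self, *transition):
        # keep stored tensors on the (cheaper) storage device
        self.base.store(*(t.to(self.store_device) for t in transition))

    def sample(self, batch_size):
        # move only the sampled minibatch to the compute device
        batch = self.base.sample(batch_size)
        return [tuple(t.to(self.device) for t in transition) for transition in batch]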
Example 7
    def _dqn(env, writer=DummyWriter()):
        _model = model
        _optimizer = optimizer
        if _model is None:
            _model = conv_net(env, frames=agent_history_length).to(device)
        if _optimizer is None:
            _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
        q = QNetwork(_model,
                     _optimizer,
                     env.action_space.n,
                     target_update_frequency=target_update_frequency,
                     loss=smooth_l1_loss,
                     writer=writer)
        policy = GreedyPolicy(q,
                              env.action_space.n,
                              annealing_start=replay_start_size,
                              annealing_time=final_exploration_frame -
                              replay_start_size,
                              initial_epsilon=initial_exploration,
                              final_epsilon=final_exploration)
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)
        return DeepmindAtariBody(DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
                                 env,
                                 action_repeat=action_repeat,
                                 frame_stack=agent_history_length,
                                 noop_max=noop_max)
Example 8
    def _ddqn(env, writer=DummyWriter()):
        action_repeat = 1
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = model_constructor(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QNetwork(model,
                     optimizer,
                     scheduler=CosineAnnealingLR(optimizer, last_update),
                     target=FixedTarget(target_update_frequency),
                     writer=writer)
        policy = SharedAutonomyPolicy(q,
                                      env.action_space.n,
                                      epsilon=0,
                                      pilot_tol=pilot_tol)

        if prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                                    alpha=alpha,
                                                    beta=beta,
                                                    device=device)
        else:
            replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                                   device=device)

        return co_DDQN(q,
                       policy,
                       replay_buffer,
                       loss=weighted_smooth_l1_loss,
                       discount_factor=discount_factor,
                       minibatch_size=minibatch_size,
                       replay_start_size=replay_start_size,
                       update_frequency=update_frequency)
Example 9
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        q = QNetwork(self.model,
                     optimizer,
                     target=FixedTarget(
                         self.hyperparameters['target_update_frequency']),
                     writer=writer)

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] -
                self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer))

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'], device=self.device)

        return DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        )
Example 10
    def _model_predictive_dqn(env, writer=None):
        # models
        feature_model = shared_feature_layers().to(device)
        value_model = value_head().to(device)
        reward_model = reward_head(env).to(device)
        generator_model = Generator(env).to(device)
        # optimizers
        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        reward_optimizer = Adam(reward_model.parameters(), lr=lr, eps=eps)
        generator_optimizer = Adam(generator_model.parameters(), lr=lr, eps=eps)
        # approximators
        f = FeatureNetwork(feature_model, feature_optimizer, writer=writer)
        v = VNetwork(value_model, value_optimizer, writer=writer)
        r = QNetwork(reward_model, reward_optimizer, name='reward', writer=writer)
        g = Approximation(generator_model, generator_optimizer, name='generator', writer=writer)
        # replay buffer
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
        # create agent
        agent = ModelPredictiveDQN(f, v, r, g, replay_buffer,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size
        )
        # apply agent wrappers for better atari performance
        return DeepmindAtariBody(agent, lazy_frames=True)
Example 11
    def _ddpg(env, writer=DummyWriter()):
        value_model = fc_value(env).to(device)
        value_optimizer = Adam(value_model.parameters(), lr=lr_q)
        q = QContinuous(value_model,
                        value_optimizer,
                        target=PolyakTarget(polyak_rate),
                        writer=writer)

        policy_model = fc_policy(env).to(device)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
        policy = DeterministicPolicy(policy_model,
                                     policy_optimizer,
                                     env.action_space,
                                     noise,
                                     target=PolyakTarget(polyak_rate),
                                     writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return DDPG(q,
                    policy,
                    replay_buffer,
                    replay_start_size=replay_start_size,
                    discount_factor=discount_factor,
                    update_frequency=update_frequency,
                    minibatch_size=minibatch_size)
Example 12
    def _dqn(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = nature_dqn(env).to(device)

        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )

        q = QNetwork(
            model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            target=FixedTarget(target_update_frequency),
            writer=writer
        )

        policy = GreedyPolicy(
            q,
            env.action_space.n,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_step - replay_start_size,
                name="epsilon",
                writer=writer
            )
        )

        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            device=device
        )

        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                loss=smooth_l1_loss,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
            ),
            lazy_frames=True
        )
Example 13
    def test_run(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = ExperienceReplayBuffer(5)

        states = torch.arange(0, 20)
        actions = torch.arange(0, 20).view((-1, 1))
        rewards = torch.arange(0, 20)
        expected_samples = torch.tensor([
            [0, 0, 0],
            [1, 1, 0],
            [0, 1, 1],
            [3, 0, 0],
            [1, 4, 4],
            [1, 2, 4],
            [2, 4, 3],
            [4, 7, 4],
            [7, 4, 6],
            [6, 5, 6],
        ])
        expected_weights = np.ones((10, 3))
        actual_samples = []
        actual_weights = []
        for i in range(10):
            state = State(states[i])
            next_state = State(states[i + 1], reward=rewards[i])
            self.replay_buffer.store(state, actions[i], next_state)
            sample = self.replay_buffer.sample(3)
            actual_samples.append(sample[0].observation)
            actual_weights.append(sample[-1])
        tt.assert_equal(
            torch.cat(actual_samples).view(expected_samples.shape),
            expected_samples)
        np.testing.assert_array_equal(expected_weights,
                                      np.vstack(actual_weights))
Example 14
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QNetwork(
            self.model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            writer=writer
        )

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            )
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            device=self.device
        )

        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=self.hyperparameters['discount_factor'],
                loss=smooth_l1_loss,
                minibatch_size=self.hyperparameters['minibatch_size'],
                replay_start_size=self.hyperparameters['replay_start_size'],
                update_frequency=self.hyperparameters['update_frequency'],
            ),
            lazy_frames=True
        )
Example 15
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            writer=writer,
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            device=self.device
        )

        return DeepmindAtariBody(
            C51(
                q,
                replay_buffer,
                exploration=LinearScheduler(
                    self.hyperparameters['initial_exploration'],
                    self.hyperparameters['final_exploration'],
                    0,
                    self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
                    name="epsilon",
                    writer=writer,
                ),
                discount_factor=self.hyperparameters["discount_factor"],
                minibatch_size=self.hyperparameters["minibatch_size"],
                replay_start_size=self.hyperparameters["replay_start_size"],
                update_frequency=self.hyperparameters["update_frequency"],
                writer=writer
            ),
            lazy_frames=True,
            episodic_lives=True
        )
Example 16
class TestExperienceReplayBuffer(unittest.TestCase):
    def test_run(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = ExperienceReplayBuffer(5)

        states = torch.arange(0, 20)
        actions = torch.arange(0, 20).view((-1, 1))
        rewards = torch.arange(0, 20)
        expected_samples = torch.tensor([
            [0, 0, 0],
            [1, 1, 0],
            [0, 1, 1],
            [3, 0, 0],
            [1, 4, 4],
            [1, 2, 4],
            [2, 4, 3],
            [4, 7, 4],
            [7, 4, 6],
            [6, 5, 6],
        ])
        expected_weights = np.ones((10, 3))
        actual_samples = []
        actual_weights = []
        for i in range(10):
            state = State(states[i])
            next_state = State(states[i + 1], reward=rewards[i])
            self.replay_buffer.store(state, actions[i], next_state)
            sample = self.replay_buffer.sample(3)
            actual_samples.append(sample[0].observation)
            actual_weights.append(sample[-1])
        tt.assert_equal(
            torch.cat(actual_samples).view(expected_samples.shape),
            expected_samples)
        np.testing.assert_array_equal(expected_weights,
                                      np.vstack(actual_weights))

    def test_store_device(self):
        if torch.cuda.is_available():
            self.replay_buffer = ExperienceReplayBuffer(5,
                                                        device='cuda',
                                                        store_device='cpu')

            states = torch.arange(0, 20).to('cuda')
            actions = torch.arange(0, 20).view((-1, 1)).to('cuda')
            rewards = torch.arange(0, 20).to('cuda')
            state = State(states[0])
            next_state = State(states[1], reward=rewards[1])
            self.replay_buffer.store(state, actions[0], next_state)
            sample = self.replay_buffer.sample(3)
            self.assertEqual(sample[0].device, torch.device('cuda'))
            self.assertEqual(self.replay_buffer.buffer[0][0].device,
                             torch.device('cpu'))
Example 17
    def _c51(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency

        model = nature_c51(env, atoms=atoms).to(device)
        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )
        q = QDist(
            model,
            optimizer,
            env.action_space.n,
            atoms,
            v_min=v_min,
            v_max=v_max,
            target=FixedTarget(target_update_frequency),
            scheduler=CosineAnnealingLR(optimizer, last_update),
            writer=writer,
        )
        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            device=device
        )
        return DeepmindAtariBody(
            C51(
                q,
                replay_buffer,
                exploration=LinearScheduler(
                    initial_exploration,
                    final_exploration,
                    0,
                    last_timestep,
                    name="epsilon",
                    writer=writer,
                ),
                discount_factor=discount_factor,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                writer=writer
            ),
            lazy_frames=True
        )
Example 18
    def _online_cacla(env, writer=DummyWriter()):
        value_model = models.critic(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)
        policy_model = models.actor(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)
        # feature_model = models.features(env.state_space.shape[0]).to(device)

        value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
        # feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
        # feature_optimizer = SGD(feature_model.parameters(), lr=lr_pi, momentum=0.9)

        policy = DeterministicPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            quiet=not log,
            clip_grad=1.0,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )

        v = VNetwork(
            value_model,
            value_optimizer,
            quiet=not log,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )
        features = None  # FeatureNetwork(feature_model, feature_optimizer, writer=writer, normalize_input=False)
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        # TODO - reintroduce TimeFeature wrapper
        return OnlineCACLA(features,
                           v,
                           policy,
                           replay_buffer,
                           env.action_space,
                           log=log,
                           writer=writer,
                           discount_factor=discount_factor)
Example 19
    def _sac(env, writer=DummyWriter()):
        q_1_model = fc_q(env).to(device)
        q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
        q_1 = QContinuous(q_1_model, q_1_optimizer, writer=writer, name='q_1')

        q_2_model = fc_q(env).to(device)
        q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
        q_2 = QContinuous(q_2_model, q_2_optimizer, writer=writer, name='q_2')

        v_model = fc_v(env).to(device)
        v_optimizer = Adam(v_model.parameters(), lr=lr_v)
        v = VNetwork(
            v_model,
            v_optimizer,
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

        policy_model = fc_policy(env).to(device)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
        policy = SoftDeterministicPolicy(policy_model,
                                         policy_optimizer,
                                         env.action_space,
                                         writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return SAC(policy,
                   q_1,
                   q_2,
                   v,
                   replay_buffer,
                   entropy_target=(-env.action_space.shape[0] *
                                   entropy_target_scaling),
                   lr_temperature=lr_temperature,
                   replay_start_size=replay_start_size,
                   discount_factor=discount_factor,
                   update_frequency=update_frequency,
                   minibatch_size=minibatch_size,
                   writer=writer)
Example 20
    def _fac(env, writer=DummyWriter()):
        value_model = models.critic(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)
        policy_model = models.actor(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)

        value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)

        policy = DeterministicPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            quiet=not log,
            clip_grad=1.0,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )

        v = VNetwork(
            value_model,
            value_optimizer,
            quiet=not log,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        # TODO - reintroduce TimeFeature wrapper
        return ForwardAC(v,
                         policy,
                         replay_buffer,
                         env.action_space,
                         log=log,
                         trace_decay=trace_decay,
                         writer=writer,
                         discount_factor=discount_factor)
Example 21
    def _c51(env, writer=DummyWriter()):
        model = nature_c51(env, atoms=51).to(device)
        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )
        q = QDist(
            model,
            optimizer,
            env.action_space.n,
            atoms,
            v_min=v_min,
            v_max=v_max,
            target=FixedTarget(target_update_frequency),
            writer=writer,
        )
        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            device=device
        )
        return DeepmindAtariBody(
            C51(
                q,
                replay_buffer,
                exploration=LinearScheduler(
                    initial_exploration,
                    final_exploration,
                    replay_start_size,
                    final_exploration_frame,
                    name="epsilon",
                    writer=writer,
                ),
                discount_factor=discount_factor,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                writer=writer
            )
        )
Example 22
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters["replay_start_size"]
                     ) / self.hyperparameters["update_frequency"]

        q_optimizer = Adam(self.q_model.parameters(),
                           lr=self.hyperparameters["lr_q"])

        q = QContinuous(self.q_model,
                        q_optimizer,
                        target=PolyakTarget(
                            self.hyperparameters["polyak_rate"]),
                        scheduler=CosineAnnealingLR(q_optimizer, n_updates),
                        writer=writer)

        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr_pi"])
        policy = DeterministicPolicy(
            self.policy_model,
            policy_optimizer,
            self.action_space,
            target=PolyakTarget(self.hyperparameters["polyak_rate"]),
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
            writer=writer)

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters["replay_buffer_size"], device=self.device)

        return TimeFeature(
            DDPG(
                q,
                policy,
                replay_buffer,
                self.action_space,
                noise=self.hyperparameters["noise"],
                replay_start_size=self.hyperparameters["replay_start_size"],
                discount_factor=self.hyperparameters["discount_factor"],
                update_frequency=self.hyperparameters["update_frequency"],
                minibatch_size=self.hyperparameters["minibatch_size"],
            ))
Example 23
    def _ddpg(env, writer=DummyWriter()):
        final_anneal_step = (last_frame -
                             replay_start_size) // update_frequency

        q_model = fc_q(env).to(device)
        q_optimizer = Adam(q_model.parameters(), lr=lr_q)
        q = QContinuous(q_model,
                        q_optimizer,
                        target=PolyakTarget(polyak_rate),
                        scheduler=CosineAnnealingLR(q_optimizer,
                                                    final_anneal_step),
                        writer=writer)

        policy_model = fc_deterministic_policy(env).to(device)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
        policy = DeterministicPolicy(policy_model,
                                     policy_optimizer,
                                     env.action_space,
                                     target=PolyakTarget(polyak_rate),
                                     scheduler=CosineAnnealingLR(
                                         policy_optimizer, final_anneal_step),
                                     writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return TimeFeature(
            DDPG(
                q,
                policy,
                replay_buffer,
                env.action_space,
                noise=noise,
                replay_start_size=replay_start_size,
                discount_factor=discount_factor,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size,
            ))
Example 24
    def _dqn(env, writer=DummyWriter()):
        model = build_model(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr)
        q = QNetwork(model,
                     optimizer,
                     env.action_space.n,
                     target_update_frequency=target_update_frequency,
                     loss=mse_loss,
                     writer=writer)
        policy = GreedyPolicy(q,
                              env.action_space.n,
                              initial_epsilon=initial_exploration,
                              final_epsilon=final_exploration,
                              annealing_time=final_exploration_frame)
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)
        return DQN(q,
                   policy,
                   replay_buffer,
                   discount_factor=discount_factor,
                   replay_start_size=replay_start_size,
                   update_frequency=update_frequency,
                   minibatch_size=minibatch_size)
Example 25
    def agent(self, writer=DummyWriter(), train_steps=float("inf")):
        # optimizers
        feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
        reward_optimizer = Adam(self.reward_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
        generator_optimizer = Adam(self.generator_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])

        # approximators
        f = FeatureNetwork(self.feature_model, feature_optimizer, writer=writer)
        v = VNetwork(self.value_model, value_optimizer, writer=writer)
        r = QNetwork(self.reward_model, reward_optimizer, name="reward", writer=writer)
        g = Approximation(self.generator_model, generator_optimizer, name="generator", writer=writer)

        # replay buffer
        replay_buffer = ExperienceReplayBuffer(self.hyperparameters["replay_buffer_size"], device=self.device)

        # create agent
        agent = ModelBasedDQN(f, v, r, g, replay_buffer,
            minibatch_size=self.hyperparameters["minibatch_size"],
            replay_start_size=self.hyperparameters["replay_start_size"]
        )

        # apply atari wrappers for better performance
        return DeepmindAtariBody(agent, lazy_frames=True)
Example 26
    def _dqn(env, writers=None):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        n_agents = len(env.agents)
        n_actions = env.action_spaces['first_0'].n

        model = model_constructor(env).to(device)

        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )

        q = Approximation(
            model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            target=FixedTarget(target_update_frequency),
            writer=writers['first_0']
        )

        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            store_device=device,
            device=device
        )

        def agent_constructor(writer):
            policy = GreedyPolicy(
                q,
                n_actions,
                epsilon=LinearScheduler(
                    initial_exploration,
                    final_exploration,
                    replay_start_size,
                    final_exploration_step - replay_start_size,
                    name="epsilon",
                    writer=writer
                )
            )

            return DeepmindAtariBody(
                DQN(
                    q,
                    policy,
                    replay_buffer,
                    discount_factor=discount_factor,
                    loss=smooth_l1_loss,
                    minibatch_size=minibatch_size,
                    replay_start_size=replay_start_size,
                    update_frequency=update_frequency,
                ),
                lazy_frames=True
            )

        return MultiagentEncoder(IndependentMultiagent({
            agent: agent_constructor(writers[agent])
            for agent in env.agents
        }), env.agents, device)
Example 27
    def _sac(env, writer=DummyWriter()):
        final_anneal_step = (last_frame -
                             replay_start_size) // update_frequency

        v_model = v_model_constructor(env).to(device)
        q_1_model = q1_model_constructor(env).to(device)
        q_2_model = q2_model_constructor(env).to(device)
        # Quick and dirty implementation of parallel branch un/freeze
        policy_model = policy_model_constructor(
            env=env, train_parallel=train_parallel).to(device)

        if pretrained_models is not None:
            q_1_model = pretrained_models.q_1.model.to(device)
            q_2_model = pretrained_models.q_2.model.to(device)
            v_model = pretrained_models.v.model.to(device)
            policy_model = pretrained_models.policy.model.to(device)

        q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
        q_1 = QContinuousCtrlRep(q_1_model,
                                 q_1_optimizer,
                                 scheduler=CosineAnnealingLR(
                                     q_1_optimizer, final_anneal_step),
                                 target=FixedTarget(1000),
                                 writer=writer,
                                 name='q_1')

        q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
        q_2 = QContinuousCtrlRep(q_2_model,
                                 q_2_optimizer,
                                 scheduler=CosineAnnealingLR(
                                     q_2_optimizer, final_anneal_step),
                                 target=FixedTarget(1000),
                                 writer=writer,
                                 name='q_2')

        v_optimizer = Adam(v_model.parameters(), lr=lr_v)
        v = VNetworkCtrlRep(
            v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

        policy_optimizer = Adam(filter(lambda p: p.requires_grad,
                                       policy_model.parameters()),
                                lr=lr_pi)
        policy = SoftDeterministicPolicyCtrlRep(policy_model,
                                                policy_optimizer,
                                                env.action_space,
                                                scheduler=CosineAnnealingLR(
                                                    policy_optimizer,
                                                    final_anneal_step),
                                                target=FixedTarget(1000),
                                                writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return TimeFeature(
            SACCtrlRep(policy=policy,
                       q_1=q_1,
                       q_2=q_2,
                       v=v,
                       replay_buffer=replay_buffer,
                       temperature_initial=temperature_initial,
                       entropy_target=(-env.action_space.shape[0] *
                                       entropy_target_scaling),
                       lr_temperature=lr_temperature,
                       replay_start_size=replay_start_size,
                       discount_factor=discount_factor,
                       update_frequency=update_frequency,
                       minibatch_size=minibatch_size,
                       writer=writer))
Example 28
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters["replay_start_size"]
                     ) / self.hyperparameters["update_frequency"]

        q_1_optimizer = Adam(self.q_1_model.parameters(),
                             lr=self.hyperparameters["lr_q"])
        q_1 = QContinuous(self.q_1_model,
                          q_1_optimizer,
                          scheduler=CosineAnnealingLR(q_1_optimizer,
                                                      n_updates),
                          writer=writer,
                          name='q_1')

        q_2_optimizer = Adam(self.q_2_model.parameters(),
                             lr=self.hyperparameters["lr_q"])
        q_2 = QContinuous(self.q_2_model,
                          q_2_optimizer,
                          scheduler=CosineAnnealingLR(q_2_optimizer,
                                                      n_updates),
                          writer=writer,
                          name='q_2')

        v_optimizer = Adam(self.v_model.parameters(),
                           lr=self.hyperparameters["lr_v"])
        v = VNetwork(
            self.v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, n_updates),
            target=PolyakTarget(self.hyperparameters["polyak_rate"]),
            writer=writer,
            name='v',
        )

        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr_pi"])
        policy = SoftDeterministicPolicy(self.policy_model,
                                         policy_optimizer,
                                         self.action_space,
                                         scheduler=CosineAnnealingLR(
                                             policy_optimizer, n_updates),
                                         writer=writer)

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters["replay_buffer_size"], device=self.device)

        return TimeFeature(
            SAC(policy,
                q_1,
                q_2,
                v,
                replay_buffer,
                temperature_initial=self.hyperparameters["temperature_initial"],
                entropy_target=(
                    -self.action_space.shape[0] *
                    self.hyperparameters["entropy_target_scaling"]),
                lr_temperature=self.hyperparameters["lr_temperature"],
                replay_start_size=self.hyperparameters["replay_start_size"],
                discount_factor=self.hyperparameters["discount_factor"],
                update_frequency=self.hyperparameters["update_frequency"],
                minibatch_size=self.hyperparameters["minibatch_size"],
                writer=writer))
Example 29
    def setUp(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = ExperienceReplayBuffer(5)
Example 30
    def setUp(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = NStepReplayBuffer(4, 0.5,
                                               ExperienceReplayBuffer(100))
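The last fixture wraps the uniform buffer in NStepReplayBuffer(4, 0.5, ExperienceReplayBuffer(100)), i.e. 4-step transitions with a per-step discount of 0.5 on top of a base buffer of capacity 100. A simplified sketch of the n-step idea (the class name and internals are illustrative, not the library's implementation, and episode boundaries are ignored): hold the last n transitions, collapse their rewards into one discounted return, and store the collapsed transition in the base buffer.

# Illustrative n-step wrapper, not the library's NStepReplayBuffer.
from collections import deque


class MinimalNStepBuffer:
    def __init__(self, steps, discount, base_buffer):
        self.steps = steps
        self.discount = discount
        self.base = base_buffer
        self.pending = deque()        # the last `steps` transitions, oldest first

    def store(self, state, action, reward, next_state):
        self.pending.append((state, action, reward))
        if len(self.pending) == self.steps:
            first_state, first_action, _ = self.pending[0]
            # discounted sum of the rewards gathered over the last `steps` steps
            n_step_return = sum(r * self.discount ** i
                                for i, (_, _, r) in enumerate(self.pending))
            self.base.store(first_state, first_action, n_step_return, next_state)
            self.pending.popleft()

    def sample(self, batch_size):
        return self.base.sample(batch_size)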