Example #1
 def __init__(self, env, logger, device="cpu"):
     self.out_dim = out_dim = env.action_space.n
     self.device = device
     self.logger = logger
     self.hyperparameters = hyperparameters = default_hyperparameters
     self.model = hyperparameters['model_constructor'](env).to(device)
     writer = DummyWriter()
     self.epsilon = LinearScheduler(
         self.hyperparameters['initial_exploration'],
         self.hyperparameters['final_exploration'],
         self.hyperparameters['replay_start_size'],
         self.hyperparameters['final_exploration_step'] -
         self.hyperparameters['replay_start_size'],
         name="exploration",
         writer=writer)
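Every snippet in this collection drives an annealed quantity (exploration epsilon, the PPO clip radius, an entropy bonus) through LinearScheduler. Judging from the unit test in Example #27, its positional arguments read as (initial value, final value, decay start step, decay end step). A minimal sketch of that decay rule, under that assumed reading, for reference while reading the presets below:

def linear_value(t, initial, final, decay_start, decay_end):
    # Hold `initial` before decay_start and `final` after decay_end;
    # interpolate linearly in between (assumed semantics, consistent with
    # the expected values in the test of Example #27).
    if t < decay_start:
        return initial
    if t >= decay_end:
        return final
    alpha = (t - decay_start) / (decay_end - decay_start)
    return initial + (final - initial) * alpha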
Example #2
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = train_steps / self.hyperparameters['n_envs']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QNetwork(
            self.model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            writer=writer
        )

        policy = ParallelGreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"],
                name="exploration",
                writer=writer
            )
        )

        return VQN(q, policy, discount_factor=self.hyperparameters['discount_factor'])
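Note how this vectorized preset rescales the schedule to update steps rather than environment frames: the decay starts at 0 and final_exploration_step is divided by n_envs, mirroring the n_updates passed to CosineAnnealingLR above.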
Example #3
    def _vqn(envs, writer=DummyWriter()):
        action_repeat = 4
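        # Hyperparameters quoted in raw frames are converted to agent
        # timesteps by dividing by the action repeat (one action per 4 frames).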
        final_exploration_timestep = final_exploration_frame / action_repeat

        env = envs[0]
        model = model_constructor(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QNetwork(
            model,
            optimizer,
            writer=writer
        )
        policy = ParallelGreedyPolicy(
            q,
            env.action_space.n,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                0,
                final_exploration_timestep,
                name="epsilon",
                writer=writer
            )
        )
        return DeepmindAtariBody(
            VQN(q, policy, discount_factor=discount_factor),
        )
Example #4
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        q = QNetwork(self.model,
                     optimizer,
                     target=FixedTarget(
                         self.hyperparameters['target_update_frequency']),
                     writer=writer)

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] -
                self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer))

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'], device=self.device)

        return DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        )
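The FixedTarget wrapper above is what keeps TD targets stable between updates. A minimal sketch of the usual fixed-target mechanism it presumably implements (assumed behavior; SketchFixedTarget is an illustrative stand-in, not the library's class):

import copy

class SketchFixedTarget:
    # Keep a frozen copy of the online network and refresh it every
    # `update_frequency` updates. Assumes a torch.nn.Module-like `model`.
    def __init__(self, model, update_frequency):
        self.model = model
        self.target = copy.deepcopy(model)
        self.update_frequency = update_frequency
        self.updates = 0

    def update(self):
        self.updates += 1
        if self.updates % self.update_frequency == 0:
            self.target.load_state_dict(self.model.state_dict())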
Example #5
 def _ddqn(env, writer=DummyWriter()):
     model = model_constructor(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  target=FixedTarget(target_update_frequency),
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                             alpha=alpha,
                                             beta=beta,
                                             device=device)
     return DDQN(q,
                 policy,
                 replay_buffer,
                 discount_factor=discount_factor,
                 replay_start_size=replay_start_size,
                 update_frequency=update_frequency,
                 minibatch_size=minibatch_size)
Example #6
 def _dqn(env, writer=DummyWriter()):
     model = fc_relu_q(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  env.action_space.n,
                  target=FixedTarget(target_update_frequency),
                  loss=mse_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DQN(q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size)
Example #7
 def _c51(env, writer=DummyWriter()):
     model = fc_relu_dist_q(env, atoms=atoms).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QDist(
         model,
         optimizer,
         env.action_space.n,
         atoms,
         v_min=v_min,
         v_max=v_max,
         writer=writer,
     )
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return C51(q,
                replay_buffer,
                exploration=LinearScheduler(
                    initial_exploration,
                    final_exploration,
                    replay_start_size,
                    final_exploration_frame,
                    name="epsilon",
                    writer=writer,
                ),
                discount_factor=discount_factor,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                writer=writer)
Example #8
 def _vqn(envs, writer=DummyWriter()):
     env = envs[0]
     model = nature_ddqn(env).to(device)
     optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
     q = QNetwork(
         model,
         optimizer,
         env.action_space.n,
         loss=smooth_l1_loss,
         writer=writer
     )
     policy = GreedyPolicy(
         q,
         env.action_space.n,
         epsilon=LinearScheduler(
             initial_exploration,
             final_exploration,
             0,
             final_exploration_frame,
             name="epsilon",
             writer=writer
         )
     )
     return DeepmindAtariBody(
         VQN(q, policy, gamma=discount_factor),
     )
Example #9
        def agent_constructor(writer):
            policy = GreedyPolicy(
                q,
                n_actions,
                epsilon=LinearScheduler(
                    initial_exploration,
                    final_exploration,
                    replay_start_size,
                    final_exploration_step - replay_start_size,
                    name="epsilon",
                    writer=writer
                )
            )

            return DeepmindAtariBody(
                DQN(
                    q,
                    policy,
                    replay_buffer,
                    discount_factor=discount_factor,
                    loss=smooth_l1_loss,
                    minibatch_size=minibatch_size,
                    replay_start_size=replay_start_size,
                    update_frequency=update_frequency,
                ),
                lazy_frames=True
            )
Example #10
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        q = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(
                self.hyperparameters['target_update_frequency']),
            writer=writer,
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'], device=self.device)

        return C51(q,
                   replay_buffer,
                   exploration=LinearScheduler(
                       self.hyperparameters['initial_exploration'],
                       self.hyperparameters['final_exploration'],
                       0,
                       self.hyperparameters["final_exploration_step"] -
                       self.hyperparameters["replay_start_size"],
                       name="epsilon",
                       writer=writer,
                   ),
                   discount_factor=self.hyperparameters["discount_factor"],
                   minibatch_size=self.hyperparameters["minibatch_size"],
                   replay_start_size=self.hyperparameters["replay_start_size"],
                   update_frequency=self.hyperparameters["update_frequency"],
                   writer=writer)
Example #11
 def _dqn(env, writer=DummyWriter()):
     _model = nature_dqn(env).to(device)
     _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
     q = QNetwork(_model,
                  _optimizer,
                  env.action_space.n,
                  target=FixedTarget(target_update_frequency),
                  loss=smooth_l1_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DeepmindAtariBody(
         DQN(
             q,
             policy,
             replay_buffer,
             discount_factor=discount_factor,
             minibatch_size=minibatch_size,
             replay_start_size=replay_start_size,
             update_frequency=update_frequency,
         ),
     )
Example #12
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]

        # Each rollout consumes n_steps * n_envs timesteps (4 frames each
        # under frame skipping) and triggers epochs * minibatches optimizer
        # updates, so convert last_frame into a total number of updates:
        final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)

        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)
        feature_model = feature_model_constructor().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, final_anneal_step),
                                  writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        )

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(clip_initial,
                                        clip_final,
                                        0,
                                        final_anneal_step,
                                        name='clip',
                                        writer=writer),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
Example #13
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (
            train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches']
            / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
        )

        feature_optimizer = Adam(self.feature_model.parameters(),
                                 lr=self.hyperparameters["lr"],
                                 eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters["lr"],
                               eps=self.hyperparameters["eps"])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr"],
                                eps=self.hyperparameters["eps"])

        features = FeatureNetwork(self.feature_model,
                                  feature_optimizer,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, n_updates),
                                  clip_grad=self.hyperparameters["clip_grad"],
                                  writer=writer)

        v = VNetwork(self.value_model,
                     value_optimizer,
                     scheduler=CosineAnnealingLR(value_optimizer, n_updates),
                     loss_scaling=self.hyperparameters["value_loss_scaling"],
                     clip_grad=self.hyperparameters["clip_grad"],
                     writer=writer)

        policy = SoftmaxPolicy(self.policy_model,
                               policy_optimizer,
                               scheduler=CosineAnnealingLR(
                                   policy_optimizer, n_updates),
                               clip_grad=self.hyperparameters["clip_grad"],
                               writer=writer)

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(self.hyperparameters["clip_initial"],
                                        self.hyperparameters["clip_final"],
                                        0,
                                        n_updates,
                                        name='clip',
                                        writer=writer),
                epochs=self.hyperparameters["epochs"],
                minibatches=self.hyperparameters["minibatches"],
                n_envs=self.hyperparameters["n_envs"],
                n_steps=self.hyperparameters["n_steps"],
                discount_factor=self.hyperparameters["discount_factor"],
                lam=self.hyperparameters["lam"],
                entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
                writer=writer,
            ))
Example #14
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]
        final_anneal_step = last_frame / (n_steps * n_envs)

        value_model = value_head().to(device)
        policy_model = policy_head(env).to(device)
        feature_model = conv_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(
                value_optimizer,
                final_anneal_step,
            ),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(
                policy_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )

        return FrameStack(
            A2C(
                features,
                v,
                policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                entropy_loss_scaling=LinearScheduler(
                    entropy_loss_scaling,
                    0.,
                    0,
                    final_anneal_step,
                    name="entropy_loss_scaling",
                    writer=writer
                ),
                writer=writer
            ),
            size=4
        )
Example #15
    def _ppo(envs, writer=DummyWriter()):
        final_anneal_step = last_frame * epochs * minibatches / (n_steps *
                                                                 n_envs)
        env = envs[0]

        feature_model, value_model, policy_model = fc_actor_critic(env)
        feature_model.to(device)
        value_model.to(device)
        policy_model.to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, final_anneal_step),
                                  writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        )
        policy = GaussianPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        )

        return TimeFeature(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(clip_initial,
                                        clip_final,
                                        0,
                                        final_anneal_step,
                                        name='clip',
                                        writer=writer),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
Example #16
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (
            train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches']
            / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
        )

        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters['lr'],
                               eps=self.hyperparameters['eps'])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters['lr'],
                                eps=self.hyperparameters['eps'])

        features = Identity(self.device)

        v = VNetwork(
            self.value_model,
            value_optimizer,
            loss_scaling=self.hyperparameters['value_loss_scaling'],
            clip_grad=self.hyperparameters['clip_grad'],
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        )

        policy = GaussianPolicy(
            self.policy_model,
            policy_optimizer,
            self.action_space,
            clip_grad=self.hyperparameters['clip_grad'],
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        )

        return TimeFeature(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(self.hyperparameters['clip_initial'],
                                        self.hyperparameters['clip_final'],
                                        0,
                                        n_updates,
                                        name='clip',
                                        writer=writer),
                epochs=self.hyperparameters['epochs'],
                minibatches=self.hyperparameters['minibatches'],
                n_envs=self.hyperparameters['n_envs'],
                n_steps=self.hyperparameters['n_steps'],
                discount_factor=self.hyperparameters['discount_factor'],
                lam=self.hyperparameters['lam'],
                entropy_loss_scaling=self.hyperparameters['entropy_loss_scaling'],
                writer=writer,
            ))
Example #17
    def _dqn(env, writer=DummyWriter()):
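        # Convert frame-based hyperparameters into agent timesteps (one
        # action per 4 frames) and then into optimizer updates, which start
        # after replay_start_size and recur every update_frequency timesteps.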
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = nature_dqn(env).to(device)

        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )

        q = QNetwork(
            model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            target=FixedTarget(target_update_frequency),
            writer=writer
        )

        policy = GreedyPolicy(
            q,
            env.action_space.n,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_step - replay_start_size,
                name="epsilon",
                writer=writer
            )
        )

        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            device=device
        )

        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                loss=smooth_l1_loss,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
            ),
            lazy_frames=True
        )
Example #18
    def _ddqn(env, writer=DummyWriter()):
        action_repeat = 1
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = model_constructor(env).to(device)
        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )
        q = QNetwork(
            model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            target=FixedTarget(target_update_frequency),
            writer=writer
        )
        policy = GreedyPolicy(
            q,
            env.action_space.n,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_step - replay_start_size,
                name="epsilon",
                writer=writer
            )
        )
        
        if prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(
                replay_buffer_size,
                alpha=alpha,
                beta=beta,
                device=device
            )
        else:
            replay_buffer = ExperienceReplayBuffer(
                replay_buffer_size,
                device=device
            )

        return DDQN(
            q,
            policy,
            replay_buffer,
            loss=weighted_smooth_l1_loss,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        )
Example #19
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'],
                         eps=self.hyperparameters['eps'])

        q_dist = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(
                self.hyperparameters['target_update_frequency']),
            writer=writer,
        )

        replay_buffer = NStepReplayBuffer(
            self.hyperparameters['n_steps'],
            self.hyperparameters['discount_factor'],
            PrioritizedReplayBuffer(self.hyperparameters['replay_buffer_size'],
                                    alpha=self.hyperparameters['alpha'],
                                    beta=self.hyperparameters['beta'],
                                    device=self.device))
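        # The n-step buffer wraps the prioritized buffer; accordingly, the
        # discount factor handed to Rainbow below is raised to n_steps.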

        return DeepmindAtariBody(
            Rainbow(
                q_dist,
                replay_buffer,
                exploration=LinearScheduler(
                    self.hyperparameters['initial_exploration'],
                    self.hyperparameters['final_exploration'],
                    0,
                    train_steps - self.hyperparameters['replay_start_size'],
                    name="exploration",
                    writer=writer
                ),
                discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters['n_steps'],
                minibatch_size=self.hyperparameters['minibatch_size'],
                replay_start_size=self.hyperparameters['replay_start_size'],
                update_frequency=self.hyperparameters['update_frequency'],
                writer=writer,
            ),
            lazy_frames=True,
            episodic_lives=True
        )
Example #20
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QNetwork(
            self.model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            writer=writer
        )

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            )
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            device=self.device
        )

        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=self.hyperparameters['discount_factor'],
                loss=smooth_l1_loss,
                minibatch_size=self.hyperparameters['minibatch_size'],
                replay_start_size=self.hyperparameters['replay_start_size'],
                update_frequency=self.hyperparameters['update_frequency'],
            ),
            lazy_frames=True
        )
Example #21
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            writer=writer,
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            device=self.device
        )

        return DeepmindAtariBody(
            C51(
                q,
                replay_buffer,
                exploration=LinearScheduler(
                    self.hyperparameters['initial_exploration'],
                    self.hyperparameters['final_exploration'],
                    0,
                    self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
                    name="epsilon",
                    writer=writer,
                ),
                discount_factor=self.hyperparameters["discount_factor"],
                minibatch_size=self.hyperparameters["minibatch_size"],
                replay_start_size=self.hyperparameters["replay_start_size"],
                update_frequency=self.hyperparameters["update_frequency"],
                writer=writer
            ),
            lazy_frames=True,
            episodic_lives=True
        )
Example #22
    def _c51(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency

        model = nature_c51(env, atoms=atoms).to(device)
        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )
        q = QDist(
            model,
            optimizer,
            env.action_space.n,
            atoms,
            v_min=v_min,
            v_max=v_max,
            target=FixedTarget(target_update_frequency),
            scheduler=CosineAnnealingLR(optimizer, last_update),
            writer=writer,
        )
        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            device=device
        )
        return DeepmindAtariBody(
            C51(
                q,
                replay_buffer,
                exploration=LinearScheduler(
                    initial_exploration,
                    final_exploration,
                    0,
                    last_timestep,
                    name="epsilon",
                    writer=writer,
                ),
                discount_factor=discount_factor,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                writer=writer
            ),
            lazy_frames=True
        )
Example #23
    def _rainbow(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency

        model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QDist(
            model,
            optimizer,
            env.action_space.n,
            atoms,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            v_min=v_min,
            v_max=v_max,
            target=FixedTarget(target_update_frequency),
            writer=writer,
        )
        replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                                alpha=alpha,
                                                beta=beta,
                                                device=device)
        replay_buffer = NStepReplayBuffer(n_steps, discount_factor,
                                          replay_buffer)

        agent = Rainbow(
            q,
            replay_buffer,
            exploration=LinearScheduler(initial_exploration,
                                        final_exploration,
                                        0,
                                        last_timestep,
                                        name='exploration',
                                        writer=writer),
            discount_factor=discount_factor**n_steps,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer,
        )
        return DeepmindAtariBody(agent, lazy_frames=True, episodic_lives=True)
Example #24
class DQNPolicy:
    def __init__(self, env, logger, device="cpu"):
        self.out_dim = out_dim = env.action_space.n
        self.device = device
        self.logger = logger
        self.hyperparameters = hyperparameters = default_hyperparameters
        self.model = hyperparameters['model_constructor'](env).to(device)
        writer = DummyWriter()
        self.epsilon = LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] -
            self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer)

    def calc_action(self, observations):
        with torch.no_grad():
            observations = torch.tensor(observations, device=self.device)
            greedy = torch.argmax(self.model(observations), dim=1)
            rand_vals = torch.randint(self.out_dim,
                                      size=(len(observations), ),
                                      device=self.device)
            epsilon = self.epsilon._get_value()
            self.logger.record("epsilon", epsilon)
            pick_rand = torch.rand(
                (len(observations), ), device=self.device) < epsilon
            actions = torch.where(pick_rand, rand_vals, greedy)
        return actions.cpu().detach().numpy()

    def get_params(self):
        return [
            param.cpu().detach().numpy() for param in self.model.parameters()
        ]

    def set_params(self, params):
        for source, dest in zip(params, self.model.parameters()):
            dest.data = torch.tensor(source, device=self.device)
Example #25
 def _c51(env, writer=DummyWriter()):
     model = nature_c51(env, atoms=51).to(device)
     optimizer = Adam(
         model.parameters(),
         lr=lr,
         eps=eps
     )
     q = QDist(
         model,
         optimizer,
         env.action_space.n,
         atoms,
         v_min=v_min,
         v_max=v_max,
         target=FixedTarget(target_update_frequency),
         writer=writer,
     )
     replay_buffer = ExperienceReplayBuffer(
         replay_buffer_size,
         device=device
     )
     return DeepmindAtariBody(
         C51(
             q,
             replay_buffer,
             exploration=LinearScheduler(
                 initial_exploration,
                 final_exploration,
                 replay_start_size,
                 final_exploration_frame,
                 name="epsilon",
                 writer=writer,
             ),
             discount_factor=discount_factor,
             minibatch_size=minibatch_size,
             replay_start_size=replay_start_size,
             update_frequency=update_frequency,
             writer=writer
         )
     )
Example #26
 def agent_constructor(writer):
     return DeepmindAtariBody(
         Rainbow(
             q_dist,
             replay_buffer,
             exploration=LinearScheduler(
                 self.hyperparameters['initial_exploration'],
                 self.hyperparameters['final_exploration'],
                 0,
                 train_steps - self.hyperparameters['replay_start_size'],
                 name="exploration",
                 writer=writer
             ),
             discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
             minibatch_size=self.hyperparameters['minibatch_size'],
             replay_start_size=self.hyperparameters['replay_start_size'],
             update_frequency=self.hyperparameters['update_frequency'],
             writer=writer,
         ),
         lazy_frames=True,
         episodic_lives=True
     )
Example #27
 def test_linear_scheduler(self):
     obj = Obj()
     obj.attr = LinearScheduler(10, 0, 3, 13)
     expected = [10, 10, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0]
     actual = [obj.attr for _ in expected]
     np.testing.assert_allclose(actual, expected)
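The expected values pin down the semantics assumed in the sketch after Example #1: the value holds at 10 through step 3, decays by 1 per step, reaches 0 at step 13, and stays there (Obj is presumably a bare schedulable fixture defined elsewhere in the test module). Reusing that linear_value sketch:

values = [linear_value(t, 10, 0, 3, 13) for t in range(16)]
expected = [10, 10, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0]
assert all(abs(a - b) < 1e-9 for a, b in zip(values, expected))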
Example #28
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]

        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(envs[0]).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = Adam(
            feature_model.parameters(), lr=lr, eps=eps
        )
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale
            ),
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(
                value_optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale
            ),
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(
                policy_optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale
            ),
        )

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(
                    clip_initial,
                    clip_final,
                    0,
                    final_anneal_step,
                    name='clip',
                    writer=writer
                ),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
            )
        )
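All of these presets return an agent rather than running training themselves. A deliberately generic driver loop (hypothetical; not the library's experiment runner, and the env/state interface here is an assumption) showing how such an agent is typically exercised, with the schedulers above advancing as a side effect of the agent's internal updates:

def train(agent, env, timesteps):
    # Assumes an ALL-style environment whose reset()/step() return a state
    # object carrying a `done` flag, and an agent exposing act(state).
    state = env.reset()
    for _ in range(timesteps):
        action = agent.act(state)
        state = env.step(action)
        if state.done:
            state = env.reset()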