Example 1
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(
            nn.Linear(STATE_DIM, ACTIONS)
        )
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, optimizer)
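
A minimal usage sketch for the policy built in this setUp, mirroring test_run from Example 23 later in this listing (State, STATE_DIM, and the enclosing test class are assumed to be in scope):

    def test_usage_sketch(self):
        # The policy maps a State to a distribution exposing sample() and log_prob().
        state = State(torch.randn(1, STATE_DIM))
        dist = self.policy(state)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        # reinforce() backpropagates the supplied loss and steps the SGD optimizer.
        self.policy.reinforce(-log_prob.mean())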
Example 2
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        feature_optimizer = Adam(self.feature_model.parameters(),
                                 lr=self.hyperparameters["lr"])
        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters["lr"])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr"])

        features = FeatureNetwork(self.feature_model,
                                  feature_optimizer,
                                  clip_grad=self.hyperparameters["clip_grad"])

        v = VNetwork(self.value_model,
                     value_optimizer,
                     clip_grad=self.hyperparameters["clip_grad"],
                     writer=writer)

        policy = SoftmaxPolicy(self.policy_model,
                               policy_optimizer,
                               clip_grad=self.hyperparameters["clip_grad"],
                               writer=writer)

        return A2C(
            features,
            v,
            policy,
            n_envs=self.hyperparameters["n_envs"],
            n_steps=self.hyperparameters["n_steps"],
            discount_factor=self.hyperparameters["discount_factor"],
            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
            writer=writer)
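
This preset reads every setting from self.hyperparameters. A hypothetical dictionary covering exactly the keys referenced above (the values are illustrative assumptions, not the library's defaults):

    hyperparameters = {
        "lr": 3e-4,                   # Adam learning rate shared by all three optimizers
        "clip_grad": 0.5,             # gradient clipping threshold passed to each network
        "n_envs": 16,                 # parallel environments driving A2C
        "n_steps": 5,                 # rollout length between updates
        "discount_factor": 0.99,
        "entropy_loss_scaling": 0.01,
    }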
Example 3
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = train_steps / self.hyperparameters["min_batch_size"]

        feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
        policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])

        features = FeatureNetwork(
            self.feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
            clip_grad=self.hyperparameters["clip_grad"],
            writer=writer
        )

        v = VNetwork(
            self.value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(value_optimizer, n_updates),
            loss_scaling=self.hyperparameters["value_loss_scaling"],
            clip_grad=self.hyperparameters["clip_grad"],
            writer=writer
        )

        policy = SoftmaxPolicy(
            self.policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
            clip_grad=self.hyperparameters["clip_grad"],
            writer=writer
        )

        return DeepmindAtariBody(
            VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"]),
        )
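
Since n_updates = train_steps / min_batch_size is used as the CosineAnnealingLR schedule length, this preset only anneals sensibly when train_steps is finite. A sketch of a call site (the preset variable name is an assumption):

    # A finite train_steps keeps n_updates finite so the cosine schedules can decay.
    agent = preset.agent(writer=DummyWriter(), train_steps=10_000_000)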
Example 4
    def _vpg(env, writer=DummyWriter()):
        feature_model = feature_model_constructor(env).to(device)
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            writer=writer
        )
        return VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size)
Example 5
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        feature_optimizer = Adam(self.feature_model.parameters(),
                                 lr=self.hyperparameters["lr_pi"],
                                 eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters["lr_v"],
                               eps=self.hyperparameters["eps"])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr_pi"],
                                eps=self.hyperparameters["eps"])

        features = FeatureNetwork(self.feature_model,
                                  feature_optimizer,
                                  clip_grad=self.hyperparameters["clip_grad"],
                                  writer=writer)

        v = VNetwork(self.value_model,
                     value_optimizer,
                     loss_scaling=self.hyperparameters["value_loss_scaling"],
                     clip_grad=self.hyperparameters["clip_grad"],
                     writer=writer)

        policy = SoftmaxPolicy(self.policy_model,
                               policy_optimizer,
                               clip_grad=self.hyperparameters["clip_grad"],
                               writer=writer)

        return VPG(features,
                   v,
                   policy,
                   discount_factor=self.hyperparameters["discount_factor"],
                   min_batch_size=self.hyperparameters["min_batch_size"])
Example 6
    def _vac(envs, writer=DummyWriter()):
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(envs[0]).to(device)
        feature_model = feature_model_constructor().to(device)

        value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
        feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)

        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            writer=writer,
        )
        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  writer=writer)

        return DeepmindAtariBody(
            VAC(features, v, policy, discount_factor=discount_factor), )
Example 7
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]
        feature_model = fc_relu_features(env).to(device)
        value_model = fc_value_head().to(device)
        policy_model = fc_policy_head(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad)
        v = VNetwork(value_model,
                     value_optimizer,
                     clip_grad=clip_grad,
                     writer=writer)
        policy = SoftmaxPolicy(policy_model,
                               policy_optimizer,
                               clip_grad=clip_grad,
                               writer=writer)
        return PPO(features,
                   v,
                   policy,
                   epsilon=epsilon,
                   epochs=epochs,
                   lam=lam,
                   minibatches=minibatches,
                   n_envs=n_envs,
                   n_steps=n_steps,
                   discount_factor=discount_factor,
                   entropy_loss_scaling=entropy_loss_scaling,
                   writer=writer)
Example 8
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]
        feature_model = feature_model_constructor(env).to(device)
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(
            feature_model, feature_optimizer, clip_grad=clip_grad)
        v = VNetwork(
            value_model,
            value_optimizer,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            writer=writer
        )
        return A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer
        )
Example 9
    def _vpg(env, writer=DummyWriter()):
        feature_model = fc_relu_features(env).to(device)
        value_model = fc_value_head().to(device)
        policy_model = fc_policy_head(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  writer=writer)
        v = VNetwork(value_model,
                     value_optimizer,
                     clip_grad=clip_grad,
                     writer=writer)
        policy = SoftmaxPolicy(policy_model,
                               policy_optimizer,
                               env.action_space.n,
                               entropy_loss_scaling=entropy_loss_scaling,
                               clip_grad=clip_grad,
                               writer=writer)
        return VPG(features,
                   v,
                   policy,
                   gamma=gamma,
                   min_batch_size=min_batch_size)
Example 10
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]

        # Update epoch * minibatches times per update,
        # but we only update once per n_steps,
        # with n_envs and 4 frames per step
        final_anneal_step = last_frame * epochs * minibatches / (n_steps *
                                                                 n_envs * 4)
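        # For example, with last_frame=10e6, epochs=4, minibatches=4, n_steps=128
        # and n_envs=8 (illustrative values only), the schedule length is
        # 10e6 * 4 * 4 / (128 * 8 * 4) = 39062.5 scheduler steps.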

        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)
        feature_model = feature_model_constructor().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, final_anneal_step),
                                  writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        )

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(clip_initial,
                                        clip_final,
                                        0,
                                        final_anneal_step,
                                        name='clip',
                                        writer=writer),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
Example 11
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]

        feature_model = conv_features().to(device)
        value_model = value_net().to(device)
        policy_model = policy_net(env).to(device)

        feature_optimizer = RMSprop(
            feature_model.parameters(),
            alpha=alpha,
            lr=lr * feature_lr_scaling,
            eps=eps
        )
        value_optimizer = RMSprop(
            value_model.parameters(),
            alpha=alpha,
            lr=lr,
            eps=eps
        )
        policy_optimizer = RMSprop(
            policy_model.parameters(),
            alpha=alpha,
            lr=lr,
            eps=eps
        )

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad
        )
        v = ValueNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )

        return ParallelAtariBody(
            A2C(
                features,
                v,
                policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
            ),
            envs,
        )
Example 12
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = train_steps * self.hyperparameters[
            'epochs'] * self.hyperparameters['minibatches'] / (
                self.hyperparameters['n_steps'] *
                self.hyperparameters['n_envs'])

        feature_optimizer = Adam(self.feature_model.parameters(),
                                 lr=self.hyperparameters["lr"],
                                 eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters["lr"],
                               eps=self.hyperparameters["eps"])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr"],
                                eps=self.hyperparameters["eps"])

        features = FeatureNetwork(self.feature_model,
                                  feature_optimizer,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, n_updates),
                                  clip_grad=self.hyperparameters["clip_grad"],
                                  writer=writer)

        v = VNetwork(self.value_model,
                     value_optimizer,
                     scheduler=CosineAnnealingLR(value_optimizer, n_updates),
                     loss_scaling=self.hyperparameters["value_loss_scaling"],
                     clip_grad=self.hyperparameters["clip_grad"],
                     writer=writer)

        policy = SoftmaxPolicy(self.policy_model,
                               policy_optimizer,
                               scheduler=CosineAnnealingLR(
                                   policy_optimizer, n_updates),
                               clip_grad=self.hyperparameters["clip_grad"],
                               writer=writer)

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(self.hyperparameters["clip_initial"],
                                        self.hyperparameters["clip_final"],
                                        0,
                                        n_updates,
                                        name='clip',
                                        writer=writer),
                epochs=self.hyperparameters["epochs"],
                minibatches=self.hyperparameters["minibatches"],
                n_envs=self.hyperparameters["n_envs"],
                n_steps=self.hyperparameters["n_steps"],
                discount_factor=self.hyperparameters["discount_factor"],
                lam=self.hyperparameters["lam"],
                entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
                writer=writer,
            ))
Example 13
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]
        final_anneal_step = last_frame / (n_steps * n_envs)

        value_model = value_head().to(device)
        policy_model = policy_head(env).to(device)
        feature_model = conv_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(
                value_optimizer,
                final_anneal_step,
            ),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(
                policy_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )

        return FrameStack(
            A2C(
                features,
                v,
                policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                entropy_loss_scaling=LinearScheduler(entropy_loss_scaling, 0., 0, final_anneal_step, name="entropy_loss_scaling", writer=writer),
                writer=writer
            ),
            size=4
        )
Example 14
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]
        final_anneal_step = last_frame / (n_steps * n_envs * 4)
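        # The extra factor of 4 matches the "4 frames per step" noted in Example 10:
        # DeepmindAtariBody's Atari preprocessing consumes 4 frames per agent step.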

        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(env).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(
                value_optimizer,
                final_anneal_step,
            ),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(
                policy_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )

        return DeepmindAtariBody(
            A2C(
                features,
                v,
                policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer
            ),
        )
Example 15
    def _actor_critic(env, writer=DummyWriter()):
        value_model = fc_value(env).to(device)
        value_optimizer = Adam(value_model.parameters(), lr=lr_v)
        v = ValueNetwork(value_model, value_optimizer, writer=writer)
        policy_model = fc_policy(env).to(device)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
        policy = SoftmaxPolicy(policy_model,
                               policy_optimizer,
                               env.action_space.n,
                               writer=writer)
        return ActorCritic(v, policy)
Example 16
    def _vac(env, writer=DummyWriter()):
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)
        feature_model = feature_model_constructor(env).to(device)

        value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
        feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)

        v = VNetwork(value_model, value_optimizer, writer=writer)
        policy = SoftmaxPolicy(policy_model, policy_optimizer, writer=writer)
        features = FeatureNetwork(feature_model, feature_optimizer)

        return VAC(features, v, policy, discount_factor=discount_factor)
Example 17
    def _vac(env, writer=DummyWriter()):
        value_model = fc_value_head().to(device)
        policy_model = fc_policy_head(env).to(device)
        feature_model = fc_relu_features(env).to(device)

        value_optimizer = RMSprop(value_model.parameters(), lr=lr_v, alpha=alpha, eps=eps)
        policy_optimizer = RMSprop(policy_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps)
        feature_optimizer = RMSprop(feature_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps)

        v = VNetwork(value_model, value_optimizer, writer=writer)
        policy = SoftmaxPolicy(policy_model, policy_optimizer, env.action_space.n, writer=writer)
        features = FeatureNetwork(feature_model, feature_optimizer)

        return VAC(features, v, policy, gamma=discount_factor)
Example 18
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]

        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(envs[0]).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = RMSprop(
            feature_model.parameters(), alpha=alpha, lr=lr, eps=eps
        )
        value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
        policy_optimizer = RMSprop(
            policy_model.parameters(), alpha=alpha, lr=lr, eps=eps
        )

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad,
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )

        return DeepmindAtariBody(
            A2C(
                features,
                v,
                policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
            ),
        )
Example 19
    def _vpg_atari(env, writer=DummyWriter()):
        feature_model = nature_features().to(device)
        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(env).to(device)

        feature_optimizer = RMSprop(feature_model.parameters(),
                                    alpha=alpha,
                                    lr=lr * feature_lr_scaling,
                                    eps=eps)
        value_optimizer = RMSprop(value_model.parameters(),
                                  alpha=alpha,
                                  lr=lr,
                                  eps=eps)
        policy_optimizer = RMSprop(policy_model.parameters(),
                                   alpha=alpha,
                                   lr=lr,
                                   eps=eps)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  writer=writer)
        v = VNetwork(value_model,
                     value_optimizer,
                     loss_scaling=value_loss_scaling,
                     clip_grad=clip_grad,
                     writer=writer)
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )

        return DeepmindAtariBody(
            VPG(features,
                v,
                policy,
                gamma=discount_factor,
                min_batch_size=min_batch_size), )
Example 20
    def _vpg_atari(env, writer=DummyWriter()):
        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(env).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer,
                                      final_anneal_step,
                                  ),
                                  clip_grad=clip_grad,
                                  writer=writer)
        v = VNetwork(value_model,
                     value_optimizer,
                     scheduler=CosineAnnealingLR(
                         value_optimizer,
                         final_anneal_step,
                     ),
                     loss_scaling=value_loss_scaling,
                     clip_grad=clip_grad,
                     writer=writer)
        policy = SoftmaxPolicy(policy_model,
                               policy_optimizer,
                               scheduler=CosineAnnealingLR(
                                   policy_optimizer,
                                   final_anneal_step,
                               ),
                               clip_grad=clip_grad,
                               writer=writer)

        return DeepmindAtariBody(VPG(features,
                                     v,
                                     policy,
                                     discount_factor=discount_factor,
                                     min_batch_size=min_batch_size),
                                 episodic_lives=True)
Example 21
    def test_agent(self):
        features = FeatureNetwork(copy.deepcopy(self.feature_model))
        policy = SoftmaxPolicy(copy.deepcopy(self.policy_model))
        return DeepmindAtariBody(VACTestAgent(features, policy))
Example 22
class DiversityLearner:
    def __init__(
        self,
        model_fn,
        model_features,
        logger,
        device,
        num_targets,
        max_learn_steps,
        num_actions,
        obs_preproc,
        discount_factor=0.99,
        entropy_target=-2,
        lr_value=1e-3,
        lr_pi=1e-4,
        # Training settings
        polyak_rate=0.005,
        # Replay Buffer settings
        replay_start_size=5000,
        replay_buffer_size=1e6,
        # Exploration settings
        temperature_initial=0.1,
        lr_temperature=1e-5,
        entropy_target_scaling=1.,
    ):
        self.writer = writer = DummyWriter()
        eps = 1e-5
        self.discount_factor = discount_factor
        self.entropy_target = entropy_target
        self.temperature = temperature_initial
        self.lr_temperature = lr_temperature
        self.logger = logger
        self.device = device
        self.num_targets = num_targets
        self.max_learn_steps = max_learn_steps
        self.num_actions = num_actions

        final_anneal_step = max_learn_steps
        self.policy = DiversityPolicy(model_fn, model_features, num_actions,
                                      num_targets, obs_preproc, device)
        self.policy = self.policy.to(device)
        self.obs_preproc = obs_preproc
        policy_optimizer = Adam(self.policy.parameters(), lr=lr_pi, eps=eps)
        self.policy_learner = SoftmaxPolicy(self.policy,
                                            policy_optimizer,
                                            scheduler=CosineAnnealingLR(
                                                policy_optimizer,
                                                final_anneal_step),
                                            writer=writer)

        value_feature_model = model_fn().to(device)
        q_models = [
            DuelingQValueLayer(model_features, num_targets,
                               num_actions).to(device) for i in range(2)
        ]
        v_model = ValueLayer(model_features, num_targets,
                             num_actions).to(device)
        feature_optimizer = Adam(value_feature_model.parameters(),
                                 lr=lr_value,
                                 eps=eps)
        q_optimizers = [
            Adam(q_models[i].parameters(), lr=lr_value, eps=eps)
            for i in range(2)
        ]
        v_optimizer = Adam(v_model.parameters(), lr=lr_value, eps=eps)

        self.features = FeatureNetwork(
            value_feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
            ),
            # clip_grad=clip_grad,
            writer=writer)

        self.qs = [
            QContinuous(q_models[i],
                        q_optimizers[i],
                        scheduler=CosineAnnealingLR(q_optimizers[i],
                                                    final_anneal_step),
                        writer=writer,
                        name=f'q_{i}') for i in range(2)
        ]

        self.v = VNetwork(
            v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

    def learn_step(self, idxs, transition_batch, weights):
        Otm1, targ_vec, old_action, env_rew, done, Ot = transition_batch
        batch_size = len(Ot)
        obsm1 = self.obs_preproc(torch.tensor(Otm1, device=self.device))
        targ_vec = torch.tensor(targ_vec, device=self.device)
        actions = torch.tensor(old_action, device=self.device)
        rewards = torch.tensor(env_rew, device=self.device)
        done = torch.tensor(done, device=self.device).float().to(self.device)
        next_obs = self.obs_preproc(torch.tensor(Ot, device=self.device))
        weights = torch.tensor(weights, device=self.device)
        # Batch the current and next observations as StateArrays.
        states = StateArray(
            {
                'observation': obsm1,
                'reward': rewards,
                'done': done,
            },
            shape=(batch_size, ))
        next_states = StateArray(
            {
                'observation': next_obs,
                'reward': torch.zeros(batch_size, device=self.device),
                'done': torch.zeros(batch_size, device=self.device),
                'mask': torch.ones(batch_size, device=self.device),
            },
            shape=(batch_size, ))

        # Log-probabilities of the stored actions under the current policy.
        with torch.no_grad():
            distribution = self.policy_learner(states)
            _log_probs = distribution.log_prob(actions).detach().squeeze()
        value_feature1 = self.features(states)
        value_feature2 = self.features(next_states)
        _actions = distribution.sample()
        # Q target: reward plus the discounted target-network value of the next state.
        q_targets = rewards + self.discount_factor * self.v.target(
            value_feature2).detach()
        # Soft value target: min of the two Q estimates minus the entropy term.
        v_targets = torch.min(
            self.qs[0].target(value_feature1, _actions),
            self.qs[1].target(value_feature1, _actions),
        ) - self.temperature * _log_probs

        # update Q and V-functions
        for i in range(2):
            self.qs[i].reinforce(
                mse_loss(self.qs[i](value_feature1, actions), q_targets))
        self.v.reinforce(mse_loss(self.v(value_feature1), v_targets))

        # update policy
        distribution = self.policy_learner(states)
        _actions2 = distribution.sample()
        _log_probs2 = distribution.log_prob(_actions2).squeeze()
        loss = (-self.qs[0](value_feature1, _actions2).detach() +
                self.temperature * _log_probs2).mean()
        self.policy_learner.reinforce(loss)
        self.features.reinforce()
        self.qs[0].zero_grad()

        # adjust temperature
        temperature_grad = (_log_probs + self.entropy_target).mean()
        self.temperature += self.lr_temperature * temperature_grad.detach().cpu().numpy()
Example 23
class TestSoftmax(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(
            nn.Linear(STATE_DIM, ACTIONS)
        )
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, optimizer)

    def test_run(self):
        state1 = State(torch.randn(1, STATE_DIM))
        dist1 = self.policy(state1)
        action1 = dist1.sample()
        log_prob1 = dist1.log_prob(action1)
        self.assertEqual(action1.item(), 0)

        state2 = State(torch.randn(1, STATE_DIM))
        dist2 = self.policy(state2)
        action2 = dist2.sample()
        log_prob2 = dist2.log_prob(action2)
        self.assertEqual(action2.item(), 2)

        loss = -(torch.tensor([-1, 1000000]) * torch.cat((log_prob1, log_prob2))).mean()
        self.policy.reinforce(loss)

        state3 = State(torch.randn(1, STATE_DIM))
        dist3 = self.policy(state3)
        action3 = dist3.sample()
        self.assertEqual(action3.item(), 2)

    def test_multi_action(self):
        states = State(torch.randn(3, STATE_DIM))
        actions = self.policy(states).sample()
        tt.assert_equal(actions, torch.tensor([2, 2, 0]))

    def test_list(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        dist = self.policy(states)
        actions = dist.sample()
        log_probs = dist.log_prob(actions)
        tt.assert_equal(actions, torch.tensor([1, 2, 1]))
        loss = -(torch.tensor([[1, 2, 3]]) * log_probs).mean()
        self.policy.reinforce(loss)

    def test_reinforce(self):
        def loss(log_probs):
            return -log_probs.mean()

        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
        actions = self.policy.eval(states).sample()

        # notice the values increase with each successive reinforce
        log_probs = self.policy(states).log_prob(actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.84, -0.62, -0.757]), decimal=3)
        self.policy.reinforce(loss(log_probs))
        log_probs = self.policy(states).log_prob(actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.811, -0.561, -0.701]), decimal=3)
        self.policy.reinforce(loss(log_probs))
        log_probs = self.policy(states).log_prob(actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.785, -0.51, -0.651]), decimal=3)
Example 25
    def test_agent(self):
        features = FeatureNetwork(copy.deepcopy(self.feature_model))
        policy = SoftmaxPolicy(copy.deepcopy(self.policy_model))
        return VPGTestAgent(features, policy)
Example 26
class TestSoftmax(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTIONS))
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, optimizer, ACTIONS)

    def test_run(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.item(), 0)
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.item(), 2)
        self.policy.reinforce(torch.tensor([-1, 1000000]).float())
        action = self.policy(state)
        self.assertEqual(action.item(), 2)

    def test_multi_action(self):
        states = State(torch.randn(3, STATE_DIM))
        actions = self.policy(states)
        tt.assert_equal(actions, torch.tensor([2, 2, 0]))
        self.policy.reinforce(torch.tensor([[1, 2, 3]]).float())

    def test_multi_batch_reinforce(self):
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy.reinforce(torch.tensor([1, 2, 3, 4]).float())
        self.policy.reinforce(torch.tensor([1, 2]).float())
        with self.assertRaises(Exception):
            self.policy.reinforce(torch.tensor([1, 2]).float())

    def test_list(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        actions = self.policy(states)
        tt.assert_equal(actions, torch.tensor([1, 2, 1]))
        self.policy.reinforce(torch.tensor([[1, 2, 3]]).float())

    def test_action_prob(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        with torch.no_grad():
            actions = self.policy(states)
        probs = self.policy(states, action=actions)
        tt.assert_almost_equal(probs,
                               torch.tensor([0.204, 0.333, 0.217]),
                               decimal=3)
Example 27
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]

        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(envs[0]).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = Adam(
            feature_model.parameters(), lr=lr, eps=eps
        )
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale
            ),
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(
                value_optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale
            ),
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(
                policy_optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale
            ),
        )

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(
                    clip_initial,
                    clip_final,
                    0,
                    final_anneal_step,
                    name='clip',
                    writer=writer
                ),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
            )
        )
Example 28
class TestSoftmax(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTIONS))
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, optimizer, ACTIONS)

    def test_run(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.item(), 0)
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.item(), 2)
        self.policy.reinforce(torch.tensor([-1, 1000000]).float())
        action = self.policy(state)
        self.assertEqual(action.item(), 2)

    def test_multi_action(self):
        states = State(torch.randn(3, STATE_DIM))
        actions = self.policy(states)
        tt.assert_equal(actions, torch.tensor([2, 2, 0]))
        self.policy.reinforce(torch.tensor([[1, 2, 3]]).float())

    def test_multi_batch_reinforce(self):
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy.reinforce(torch.tensor([1, 2, 3, 4]).float())
        self.policy.reinforce(torch.tensor([1, 2]).float())
        with self.assertRaises(Exception):
            self.policy.reinforce(torch.tensor([1, 2]).float())

    def test_list(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        actions = self.policy(states)
        tt.assert_equal(actions, torch.tensor([1, 2, 1]))
        self.policy.reinforce(torch.tensor([[1, 2, 3]]).float())

    def test_action_prob(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        with torch.no_grad():
            actions = self.policy(states)
        log_probs = self.policy(states, action=actions)
        tt.assert_almost_equal(log_probs,
                               torch.tensor([-1.59, -1.099, -1.528]),
                               decimal=3)

    def test_custom_loss(self):
        def loss(log_probs):
            return -log_probs.mean()

        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
        actions = self.policy.eval(states)

        # notice the values increase with each successive reinforce
        log_probs = self.policy(states, actions)
        tt.assert_almost_equal(log_probs,
                               torch.tensor([-0.84, -0.62, -0.757]),
                               decimal=3)
        self.policy.reinforce(loss)
        log_probs = self.policy(states, actions)
        tt.assert_almost_equal(log_probs,
                               torch.tensor([-0.811, -0.561, -0.701]),
                               decimal=3)
        self.policy.reinforce(loss)
        log_probs = self.policy(states, actions)
        tt.assert_almost_equal(log_probs,
                               torch.tensor([-0.785, -0.51, -0.651]),
                               decimal=3)