def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        feature_optimizer = Adam(self.feature_model.parameters(),
                                 lr=self.hyperparameters["lr"])
        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters["lr"])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr"])

        features = FeatureNetwork(self.feature_model,
                                  feature_optimizer,
                                  clip_grad=self.hyperparameters["clip_grad"])

        v = VNetwork(self.value_model,
                     value_optimizer,
                     clip_grad=self.hyperparameters["clip_grad"],
                     writer=writer)

        policy = SoftmaxPolicy(self.policy_model,
                               policy_optimizer,
                               clip_grad=self.hyperparameters["clip_grad"],
                               writer=writer)

        return A2C(
            features,
            v,
            policy,
            n_envs=self.hyperparameters["n_envs"],
            n_steps=self.hyperparameters["n_steps"],
            discount_factor=self.hyperparameters["discount_factor"],
            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
            writer=writer)
Example #2
0
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]
        feature_model = feature_model_constructor(env).to(device)
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(
            feature_model, feature_optimizer, clip_grad=clip_grad)
        v = VNetwork(
            value_model,
            value_optimizer,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            writer=writer
        )
        return A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer
        )
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]

        feature_model = conv_features().to(device)
        value_model = value_net().to(device)
        policy_model = policy_net(env).to(device)

        feature_optimizer = RMSprop(
            feature_model.parameters(),
            alpha=alpha,
            lr=lr * feature_lr_scaling,
            eps=eps
        )
        value_optimizer = RMSprop(
            value_model.parameters(),
            alpha=alpha,
            lr=lr,
            eps=eps
        )
        policy_optimizer = RMSprop(
            policy_model.parameters(),
            alpha=alpha,
            lr=lr,
            eps=eps
        )

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad
        )
        v = ValueNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )

        return ParallelAtariBody(
            A2C(
                features,
                v,
                policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
            ),
            envs,
        )
Example #4
0
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]
        final_anneal_step = last_frame / (n_steps * n_envs)

        value_model = value_head().to(device)
        policy_model = policy_head(env).to(device)
        feature_model = conv_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(
                value_optimizer,
                final_anneal_step,
            ),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(
                policy_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )

        return FrameStack(
            A2C(
                features,
                v,
                policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                entropy_loss_scaling=LinearScheduler(entropy_loss_scaling, 0., 0, final_anneal_step, name="entropy_loss_scaling", writer=writer),
                writer=writer
            ),
            size=4
        )
Example #5
0
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]
        final_anneal_step = last_frame / (n_steps * n_envs * 4)

        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(env).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(
                value_optimizer,
                final_anneal_step,
            ),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(
                policy_optimizer,
                final_anneal_step,
            ),
            clip_grad=clip_grad,
            writer=writer
        )

        return DeepmindAtariBody(
            A2C(
                features,
                v,
                policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer
            ),
        )
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]

        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(envs[0]).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = RMSprop(
            feature_model.parameters(), alpha=alpha, lr=lr, eps=eps
        )
        value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
        policy_optimizer = RMSprop(
            policy_model.parameters(), alpha=alpha, lr=lr, eps=eps
        )

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad,
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer
        )

        return DeepmindAtariBody(
            A2C(
                features,
                v,
                policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
            ),
        )
Example #7
0
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = train_steps / (self.hyperparameters['n_steps'] *
                                   self.hyperparameters['n_envs'])

        feature_optimizer = Adam(self.feature_model.parameters(),
                                 lr=self.hyperparameters["lr"],
                                 eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters["lr"],
                               eps=self.hyperparameters["eps"])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr"],
                                eps=self.hyperparameters["eps"])

        features = FeatureNetwork(self.feature_model,
                                  feature_optimizer,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, n_updates),
                                  clip_grad=self.hyperparameters["clip_grad"],
                                  writer=writer)

        v = VNetwork(self.value_model,
                     value_optimizer,
                     scheduler=CosineAnnealingLR(value_optimizer, n_updates),
                     loss_scaling=self.hyperparameters["value_loss_scaling"],
                     clip_grad=self.hyperparameters["clip_grad"],
                     writer=writer)

        policy = SoftmaxPolicy(self.policy_model,
                               policy_optimizer,
                               scheduler=CosineAnnealingLR(
                                   policy_optimizer, n_updates),
                               clip_grad=self.hyperparameters["clip_grad"],
                               writer=writer)

        return DeepmindAtariBody(
            A2C(features,
                v,
                policy,
                n_envs=self.hyperparameters["n_envs"],
                n_steps=self.hyperparameters["n_steps"],
                discount_factor=self.hyperparameters["discount_factor"],
                entropy_loss_scaling=self.
                hyperparameters["entropy_loss_scaling"],
                writer=writer), )