def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"])

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        clip_grad=self.hyperparameters["clip_grad"]
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return A2C(
        features,
        v,
        policy,
        n_envs=self.hyperparameters["n_envs"],
        n_steps=self.hyperparameters["n_steps"],
        discount_factor=self.hyperparameters["discount_factor"],
        entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
        writer=writer
    )
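# For context: a minimal, hypothetical sketch of the hyperparameter dictionary
# that the agent() method above reads from. The keys mirror the lookups in that
# method; the values are illustrative placeholders only, not the preset's
# actual defaults, and `example_hyperparameters` is not a name from the library.
example_hyperparameters = {
    "lr": 3e-3,                    # shared Adam learning rate
    "clip_grad": 0.1,              # gradient clipping threshold
    "n_envs": 16,                  # number of parallel environments
    "n_steps": 5,                  # rollout length per update
    "discount_factor": 0.99,       # gamma
    "entropy_loss_scaling": 0.01,  # weight of the entropy bonus
}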
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer
    )

    return A2C(
        features,
        v,
        policy,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        entropy_loss_scaling=entropy_loss_scaling,
        writer=writer
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    feature_model = conv_features().to(device)
    value_model = value_net().to(device)
    policy_model = policy_net(env).to(device)

    # RMSprop optimizers; the feature network uses a separately scaled learning rate.
    feature_optimizer = RMSprop(
        feature_model.parameters(),
        alpha=alpha,
        lr=lr * feature_lr_scaling,
        eps=eps
    )
    value_optimizer = RMSprop(
        value_model.parameters(),
        alpha=alpha,
        lr=lr,
        eps=eps
    )
    policy_optimizer = RMSprop(
        policy_model.parameters(),
        alpha=alpha,
        lr=lr,
        eps=eps
    )

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad
    )
    v = ValueNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )

    # Wrap the agent in the parallel Atari body.
    return ParallelAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
        ),
        envs,
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    # Number of updates over which the cosine schedules and entropy bonus anneal.
    final_anneal_step = last_frame / (n_steps * n_envs)

    value_model = value_head().to(device)
    policy_model = policy_head(env).to(device)
    feature_model = conv_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(
            value_optimizer,
            final_anneal_step,
        ),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(
            policy_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )

    # Stack the last 4 frames as the agent's observation.
    return FrameStack(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=LinearScheduler(
                entropy_loss_scaling,
                0.,
                0,
                final_anneal_step,
                name="entropy_loss_scaling",
                writer=writer
            ),
            writer=writer
        ),
        size=4
    )
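# Illustration only: the LinearScheduler above anneals the entropy coefficient
# from its initial value to 0 between update 0 and final_anneal_step. The plain
# Python sketch below shows that schedule; `linear_anneal` is a hypothetical
# helper for explanation, not the library's implementation.
def linear_anneal(step, start_value, end_value=0.0, start_step=0, end_step=1):
    if step <= start_step:
        return start_value
    if step >= end_step:
        return end_value
    fraction = (step - start_step) / (end_step - start_step)
    return start_value + fraction * (end_value - start_value)

# e.g. linear_anneal(500, 0.01, 0.0, 0, 1000) == 0.005: halfway through the
# schedule, the entropy bonus has decayed to half its initial weight.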
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    # Number of updates over which the cosine learning-rate schedules anneal.
    final_anneal_step = last_frame / (n_steps * n_envs * 4)

    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(
            value_optimizer,
            final_anneal_step,
        ),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(
            policy_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )

    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer
        ),
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = RMSprop(
        feature_model.parameters(),
        alpha=alpha,
        lr=lr,
        eps=eps
    )
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(
        policy_model.parameters(),
        alpha=alpha,
        lr=lr,
        eps=eps
    )

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )

    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
        ),
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    # Anneal the learning rates over the total number of A2C updates.
    n_updates = train_steps / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])

    feature_optimizer = Adam(
        self.feature_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )
    value_optimizer = Adam(
        self.value_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )
    policy_optimizer = Adam(
        self.policy_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=self.hyperparameters["n_envs"],
            n_steps=self.hyperparameters["n_steps"],
            discount_factor=self.hyperparameters["discount_factor"],
            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
            writer=writer
        ),
    )
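# Illustration only: the CosineAnnealingLR schedulers passed to the networks
# above decay each optimizer's learning rate from its initial value toward
# eta_min (0 by default) along a cosine curve over T_max scheduler steps.
# Below is a standalone PyTorch sketch of that behavior with a toy parameter;
# it is not part of the preset code, and the names are local to this example.
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = Adam([param], lr=1e-3)
scheduler = CosineAnnealingLR(optimizer, T_max=100)
for _ in range(100):
    optimizer.step()   # in training this follows loss.backward()
    scheduler.step()   # lr follows a cosine curve from 1e-3 toward 0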