def _ppo(envs, writer=DummyWriter()):
    # Build a PPO agent from fully-connected feature, value, and policy networks.
    env = envs[0]
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, clip_grad=clip_grad, writer=writer)
    return PPO(
        features,
        v,
        policy,
        epsilon=epsilon,
        epochs=epochs,
        lam=lam,
        minibatches=minibatches,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        entropy_loss_scaling=entropy_loss_scaling,
        writer=writer,
    )

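# The names read inside _ppo above (lr, clip_grad, epsilon, epochs, minibatches,
# n_envs, n_steps, discount_factor, lam, entropy_loss_scaling, device) are not local
# variables: they are closed over from an enclosing preset function. A hypothetical
# sketch of that wrapper follows; the name, defaults, and return convention are
# illustrative assumptions, not taken from the source (the writer default is also
# simplified to keep the sketch import-free).
def ppo(
        lr=3e-4,                      # placeholder values, not the source's defaults
        clip_grad=0.1,
        epsilon=0.2,
        epochs=4,
        minibatches=4,
        n_envs=8,
        n_steps=8,
        discount_factor=0.99,
        lam=0.95,
        entropy_loss_scaling=0.001,
        device='cpu',
):
    def _ppo(envs, writer=None):
        ...  # body as above; every hyperparameter is resolved through this closure
    return _ppo, n_envs  # assumed convention: the runner may also need n_envs
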
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    # The agent takes epochs * minibatches optimizer steps per rollout, but a rollout
    # only happens every n_steps transitions across n_envs environments, and each
    # transition spans 4 frames (Atari frameskip).
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor().to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                clip_initial,
                clip_final,
                0,
                final_anneal_step,
                name='clip',
                writer=writer,
            ),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )

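# The anneal-horizon arithmetic above is easier to see with concrete numbers.
# The values below are illustrative only, not the preset's defaults: each rollout
# gathers n_steps * n_envs transitions, each transition spans 4 Atari frames, and
# each rollout triggers epochs * minibatches optimizer steps.
last_frame = 40_000_000
epochs, minibatches = 4, 4
n_steps, n_envs = 128, 8
frames_per_rollout = n_steps * n_envs * 4            # 4096 frames per rollout
rollouts = last_frame / frames_per_rollout           # ~9766 rollouts over training
final_anneal_step = rollouts * epochs * minibatches  # 156250 optimizer steps
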
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    # Total optimizer steps over training: epochs * minibatches updates per rollout,
    # with one rollout every n_steps * n_envs timesteps.
    n_updates = train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches'] / (
        self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    feature_optimizer = Adam(
        self.feature_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps'],
    )
    value_optimizer = Adam(
        self.value_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps'],
    )
    policy_optimizer = Adam(
        self.policy_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps'],
    )
    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters['value_loss_scaling'],
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters['clip_initial'],
                self.hyperparameters['clip_final'],
                0,
                n_updates,
                name='clip',
                writer=writer,
            ),
            epochs=self.hyperparameters['epochs'],
            minibatches=self.hyperparameters['minibatches'],
            n_envs=self.hyperparameters['n_envs'],
            n_steps=self.hyperparameters['n_steps'],
            discount_factor=self.hyperparameters['discount_factor'],
            lam=self.hyperparameters['lam'],
            entropy_loss_scaling=self.hyperparameters['entropy_loss_scaling'],
            writer=writer,
        )
    )

def _ppo(envs, writer=DummyWriter()):
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs)
    env = envs[0]
    feature_model, value_model, policy_model = fc_actor_critic(env)
    feature_model.to(device)
    value_model.to(device)
    policy_model.to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = GaussianPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )
    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                clip_initial,
                clip_final,
                0,
                final_anneal_step,
                name='clip',
                writer=writer,
            ),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )

def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    # Total optimizer steps over training, as above: epochs * minibatches updates
    # per rollout, with one rollout every n_steps * n_envs timesteps.
    n_updates = train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches'] / (
        self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    value_optimizer = Adam(
        self.value_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps'],
    )
    policy_optimizer = Adam(
        self.policy_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps'],
    )
    # No separate feature network in this variant: Identity passes states through unchanged.
    features = Identity(self.device)
    v = VNetwork(
        self.value_model,
        value_optimizer,
        loss_scaling=self.hyperparameters['value_loss_scaling'],
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
    )
    policy = GaussianPolicy(
        self.policy_model,
        policy_optimizer,
        self.action_space,
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
    )
    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters['clip_initial'],
                self.hyperparameters['clip_final'],
                0,
                n_updates,
                name='clip',
                writer=writer,
            ),
            epochs=self.hyperparameters['epochs'],
            minibatches=self.hyperparameters['minibatches'],
            n_envs=self.hyperparameters['n_envs'],
            n_steps=self.hyperparameters['n_steps'],
            discount_factor=self.hyperparameters['discount_factor'],
            lam=self.hyperparameters['lam'],
            entropy_loss_scaling=self.hyperparameters['entropy_loss_scaling'],
            writer=writer,
        )
    )

def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(envs[0]).to(device)
    feature_model = nature_features().to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    # final_anneal_step and min_lr_scale are not defined locally; presumably they are
    # supplied by the enclosing preset scope, like the other hyperparameters.
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
            eta_min=lr * min_lr_scale,
        ),
        writer=writer,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(
            value_optimizer,
            final_anneal_step,
            eta_min=lr * min_lr_scale,
        ),
    )
    # In this variant, entropy regularization is handled by the policy itself
    # rather than being passed to PPO.
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(
            policy_optimizer,
            final_anneal_step,
            eta_min=lr * min_lr_scale,
        ),
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                clip_initial,
                clip_final,
                0,
                final_anneal_step,
                name='clip',
                writer=writer,
            ),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
        )
    )

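# The eta_min argument above sets a floor on the cosine schedule: the learning rate
# decays from lr down to lr * min_lr_scale over final_anneal_step scheduler steps.
# A self-contained illustration with made-up values (not the preset's) follows;
# only standard PyTorch is used.
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

lr, min_lr_scale, final_anneal_step = 2.5e-4, 0.01, 1000
params = [torch.zeros(1, requires_grad=True)]
opt = Adam(params, lr=lr)
sched = CosineAnnealingLR(opt, final_anneal_step, eta_min=lr * min_lr_scale)
for _ in range(final_anneal_step):
    opt.step()
    sched.step()
print(opt.param_groups[0]['lr'])  # ~2.5e-06, i.e. lr * min_lr_scale at the end of the anneal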