def _make_trainer(self, train_env, eval_env, output_dir, model=None):
  # Default to a trivial single-layer policy-and-value model.
  if model is None:
    model = lambda: [layers.Dense(1)]
  return ppo_trainer.PPO(
      train_env=train_env,
      eval_env=eval_env,
      policy_and_value_model=model,
      n_optimizer_steps=1,
      output_dir=output_dir,
      random_seed=0,
      boundary=2,
  )
def _run_training_loop(self, train_env, eval_env, output_dir, model=None):
  if model is None:
    model = lambda: [layers.Dense(1)]
  n_epochs = 2
  # Run the training loop.
  trainer = ppo_trainer.PPO(
      train_env=train_env,
      eval_env=eval_env,
      policy_and_value_model=model,
      n_optimizer_steps=1,
      output_dir=output_dir,
      random_seed=0,
      boundary=2,
  )
  trainer.training_loop(n_epochs=n_epochs)
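For context, a helper like this would typically be exercised from a short test method with tiny environments. The sketch below is illustrative only: the gym import, the CartPole-v0 environments, and the temporary output directory are assumptions, not taken from the original tests.

# Hypothetical test method sketch; the environments and output directory
# below are assumed, not part of the original test suite.
import tempfile

import gym

def test_training_loop_runs(self):
  train_env = gym.make("CartPole-v0")
  eval_env = gym.make("CartPole-v0")
  output_dir = tempfile.mkdtemp()
  # Two epochs with one optimizer step each; the check is simply that the
  # loop completes without raising.
  self._run_training_loop(train_env, eval_env, output_dir)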
Example #3
def _make_trainer(self,
                  train_env,
                  eval_env,
                  output_dir,
                  model=None,
                  **kwargs):
  # Default to a trivial single-layer policy-and-value model; **kwargs lets
  # individual tests pass extra PPO constructor arguments.
  if model is None:
    model = lambda: layers.Serial(layers.Dense(1))
  return ppo_trainer.PPO(train_env=train_env,
                         eval_env=eval_env,
                         policy_and_value_model=model,
                         n_optimizer_steps=1,
                         output_dir=output_dir,
                         random_seed=0,
                         max_timestep=3,
                         boundary=2,
                         save_every_n=1,
                         **kwargs)
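Compared with the first helper, this variant also caps rollouts via max_timestep=3, saves checkpoints often (save_every_n=1), and forwards **kwargs so tests can supply extra PPO constructor arguments. A minimal sketch of driving it, reusing the training_loop call shown earlier (environment and directory setup assumed as in the previous sketch):

# Sketch only: assumes train_env, eval_env and output_dir were created as in
# the hypothetical test method above.
trainer = self._make_trainer(train_env, eval_env, output_dir)
trainer.training_loop(n_epochs=1)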