Code Example #1: verifies that a PPO model built directly from an env and one built with delayed setup (spaces passed explicitly) produce trainable and optimizer variables in the same order.
 def test_ppo_param_order_non_delayed_vs_delayed(self):
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     obs_space = env.observation_space
     act_space = env.action_space
     model = ppo_model.PPO(env)
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
     # train for some steps
     batch = next(iter(model.sampler(batch_size)))
     ub_utils.set_seed(1)
     model._train_model(batch)
     # delayed
     model2 = ppo_model.PPO(None, observation_space=obs_space,
                                  action_space=act_space)
     model2.setup()
     ub_utils.set_seed(1)
     model2._train_model(batch)
     # check trainable variables order
     self.assertVariables(model.trainable_variables,
                         model2.trainable_variables)
     # check optimizer variables order
     self.assertVariables(model.optimizer.variables(),
                         model2.optimizer.variables())
Code Example #2: checks trainable-variable counts for MLP networks on non-image observations, with and without a shared network (each Dense layer contributes a kernel and a bias, so a 3-layer MLP accounts for 6 variables).
 def test_ppo_setup_non_image_obs(self):
     envs = [FakeContinuousEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     # no share net, mlp
     model = ppo_model.PPO(env, mlp_units=[64, 64, 64])
     self.assertEqual(3, model.n_envs)
     self.assertTrue(model.observation_space is not None)
     self.assertTrue(model.action_space is not None)
     # mlp(3) + mlp(3) + policy + value
     self.assertEqual(6+6+3+2, len(model.trainable_variables))
     # share net
     model = ppo_model.PPO(env, share_net=True,
                         force_mlp=False, mlp_units=[64, 64, 64])
     # mlp(3) + policy + value
     self.assertEqual(6+3+2, len(model.trainable_variables))
Code Example #3: checks that run() fills the rollout buffer with the expected number of samples and array shapes.
 def test_ppo_run(self):
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     obs_shape = env.observation_space.shape
     act_shape = env.action_space.shape
     n_samples = 100
     model.run(n_samples)
     buf = model.buffer
     self.assertEqual(n_samples*n_envs, len(buf))
     self.assertTrue(buf.ready_for_sample)
     self.assertFalse(buf.isfull)
     # test buffer contents
     self.assertArrayEqual((n_samples, n_envs, *obs_shape), 
                           buf.data['obs'].shape)
     self.assertArrayEqual((n_samples, n_envs, *act_shape),
                           buf.data['act'].shape)
     self.assertArrayEqual((n_samples, n_envs),
                           buf.data['done'].shape)
     self.assertArrayEqual((n_samples, n_envs),
                           buf.data['rew'].shape)
     self.assertArrayEqual((n_samples, n_envs),
                           buf.data['val'].shape)
     self.assertArrayEqual((n_samples, n_envs),
                           buf.data['logp'].shape)
Code Example #4: checks trainable-variable counts for image observations (nature CNN), the shared-network variant, and force_mlp.
 def test_ppo_setup_image_obs(self):
     envs = [FakeImageEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     self.assertEqual(3, model.n_envs)
     self.assertTrue(model.observation_space is not None)
     self.assertTrue(model.action_space is not None)
     # nature_cnn + nature_cnn + policy + value
     self.assertEqual(8+8+2+2, len(model.trainable_variables))
     # test share net
     model = ppo_model.PPO(env, share_net=True)
     # nature_cnn + policy + value
     self.assertEqual(8+2+2, len(model.trainable_variables))
     # test force mlp
     model = ppo_model.PPO(env, share_net=False,
                         force_mlp=True, mlp_units=[64, 64, 64])
     # mlp(3) + mlp(3) + policy + value
     self.assertEqual(6+6+2+2, len(model.trainable_variables))
Code Example #5: checks that set_env() raises RuntimeError when the new env's spaces conflict with the model's.
 def test_ppo_reset_spaces_conflict(self):
     n_envs = 4
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     with self.assertRaises(RuntimeError):
         # space conflict
         model.set_env(env)
Code Example #6: checks that training runs with both value_clip and dual_clip enabled.
 def test_ppo_dual_clip_value_clip(self):
     n_envs = 4
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env, value_clip=0.1, dual_clip=0.1)
     n_samples = 10
     batch_size = 10
     n_subepochs = 4
     exp_gradsteps = n_samples * n_envs * 1 / batch_size
     model.run(n_samples)
     model.train(batch_size, n_subepochs)
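The value_clip and dual_clip options exercised above correspond, presumably, to the PPO-style clipped value loss and the dual-clip policy objective (Ye et al., 2020). A minimal NumPy sketch of the two terms, written from the published formulations rather than from this library's implementation (the function names are illustrative):

 import numpy as np

 def dual_clip_policy_loss(ratio, adv, clip_eps=0.2, dual_clip=3.0):
     # Standard clipped surrogate: min(r * A, clip(r, 1-eps, 1+eps) * A).
     clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
     surrogate = np.minimum(ratio * adv, clipped * adv)
     # Dual clip: for negative advantages, bound the objective from below by
     # dual_clip * adv so a single bad sample cannot dominate the update.
     surrogate = np.where(adv < 0.0, np.maximum(surrogate, dual_clip * adv), surrogate)
     return -surrogate.mean()

 def clipped_value_loss(value, old_value, returns, value_clip=0.1):
     # PPO2-style value clipping: limit how far the new value estimate may move
     # from the value recorded at rollout time, then take the pessimistic loss.
     clipped = old_value + np.clip(value - old_value, -value_clip, value_clip)
     return 0.5 * np.maximum((value - returns) ** 2, (clipped - returns) ** 2).mean()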
Code Example #7: checks delayed construction via set_env() followed by setup().
 def test_ppo_delayed_setup(self):
     model = ppo_model.PPO(None)
     self.assertTrue(model.observation_space is None)
     self.assertTrue(model.action_space is None)
     self.assertTrue(model.agent is None)
     envs = [FakeContinuousEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     model.set_env(env)
     model.setup()
     self.assertTrue(model.observation_space is not None)
     self.assertTrue(model.action_space is not None)
     # mlp(2) + mlp(2) + policy + value
     self.assertEqual(4+4+3+2, len(model.trainable_variables))
Code Example #8: checks that a save/load round trip restores the config, network variables, and optimizer state, and that the loaded model trains identically.
 def test_ppo_save_load(self):
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     model = ppo_model.PPO(env)
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
     # train for some steps
     ub_utils.set_seed(2)
     batch = next(iter(model.sampler(batch_size)))
     model._train_model(batch)
     with tempfile.TemporaryDirectory() as tempdir:
         save_path = tempdir
         # save & load model
         model.save(save_path)
         loaded_model = ppo_model.PPO.load(save_path)
     # check model setup
     self.assertTrue(loaded_model.agent is not None)
     self.assertTrue(loaded_model.buffer is not None)
     self.assertTrue(loaded_model.optimizer is not None)
     # check if config is correctly restored
     model_config = model.get_config()
     loaded_config = loaded_model.get_config()
     self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
     for key in model_config:
         self.assertEqual(model_config[key], loaded_config[key])
     # check if all network variables are correctly restored
     self.assertVariables(model.trainable_variables,
                     loaded_model.trainable_variables)
     # test optimizers
     # load optimizer params
     batches = [batch for batch in model.sampler(batch_size)]
     ub_utils.set_seed(1)
     for batch in batches:
         losses1, kl1 = model._train_step(batch)
     ub_utils.set_seed(1)
     for batch in batches:
         losses2, kl2 = loaded_model._train_step(batch)
     # check if losses are matched
     self.assertEqual(set(losses1.keys()), set(losses2.keys()))
     for key in losses1.keys():
         self.assertEqual(losses1[key], losses2[key])
     self.assertAllClose(kl1, kl2)
     # check if vars are same
     self.assertVariables(model.trainable_variables,
                     loaded_model.trainable_variables)
     # check if params of the optimizer are same
     self.assertVariables(model.optimizer.variables(),
                     loaded_model.optimizer.variables())
Code Example #9: checks the gradient-step and sub-epoch counters after train().
 def test_ppo_train(self):
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     n_samples = 10
     batch_size = 10
     n_subepochs = 4
     exp_gradsteps = n_samples * n_envs * n_subepochs / batch_size
     model.run(n_samples)
     model.train(batch_size, n_subepochs)
     self.assertEqual(exp_gradsteps, model.num_gradsteps)
     self.assertEqual(n_subepochs, model.num_subepochs)
Code Example #10: checks that predict() on a batch of observations returns actions of the expected shape, both deterministic and stochastic.
 def test_ppo_predict_batch(self):
     envs = [FakeContinuousEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     batch_size = 3
     obs_space = env.observation_space
     act_space = env.action_space
     act_dim = act_space.shape[0]
     obs = np.asarray([obs_space.sample() for _ in range(batch_size)])
     act = model.predict(obs, det=True)
     self.assertArrayEqual((batch_size, act_dim), act.shape)
     act = model.predict(obs, det=False)
     self.assertArrayEqual((batch_size, act_dim), act.shape)
Code Example #11: checks that target_kl stops training early, before all sub-epochs and gradient steps complete.
 def test_ppo_train_with_target_kl(self):
     n_envs = 3
     target_kl = 0.1
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(0)
     ub_utils.set_seed(0)
     model = ppo_model.PPO(env, target_kl=target_kl)
     n_samples = 10
     batch_size = 10
     n_subepochs = 4
     exp_gradsteps = (n_samples * n_envs * n_subepochs) // batch_size
     model.run(n_samples)
     model.train(batch_size, n_subepochs) 
     self.assertTrue(exp_gradsteps > model.num_gradsteps, model.num_gradsteps)
     self.assertTrue(n_subepochs > model.num_subepochs, model.num_subepochs)
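The early stop asserted above hinges on monitoring an approximate KL divergence between the rollout policy and the updated policy after each gradient step. A minimal sketch of one common convention (the estimator, the helper name, and the 1.5 factor are assumptions, not necessarily what this library uses):

 import numpy as np

 def should_stop_early(old_logp, new_logp, target_kl):
     # Crude estimator of KL(old || new) from log-probabilities of the sampled
     # actions; training on the current rollout stops once it exceeds a margin
     # above target_kl (1.5x is a widely used convention).
     approx_kl = float(np.mean(old_logp - new_logp))
     return target_kl is not None and approx_kl > 1.5 * target_kl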
Code Example #12: runs an end-to-end learn() with periodic evaluation, checkpointing, and TensorBoard logging, then checks the training counters.
 def test_ppo_learn(self):
     n_envs = 4
     n_steps = 125
     n_subepochs = 2
     n_epochs = 2
     batch_size = 50
     total_steps = n_envs * n_steps * n_epochs
     total_gradsteps = (int((n_envs * n_steps)/batch_size+0.5)
                         * n_subepochs * n_epochs)
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     eval_env = FakeContinuousEnv()
     env.seed(1)
     ub_utils.set_seed(1)
     model = ppo_model.PPO(
         env, 
         batch_size=batch_size,
         n_steps=n_steps,
         n_subepochs=n_subepochs,
     )
     with tempfile.TemporaryDirectory() as tempdir:
         save_path = tempdir
         model.learn(
             total_steps,
             log_interval=1,
             eval_env=eval_env,
             eval_interval=1,
             eval_episodes=1,
             eval_max_steps=10,
             save_path=save_path,
             save_interval=1,
             tb_logdir=save_path,
             reset_timesteps=True,
             verbose=3
         )
         # test load weights
         ppo_model.PPO.load(save_path)
         # test model state
         self.assertEqual(total_steps, model.num_timesteps)
         self.assertEqual(n_epochs, model.num_epochs)
         self.assertEqual(n_subepochs*n_epochs, model.num_subepochs)
         self.assertEqual(total_gradsteps, model.num_gradsteps)
         self.assertEqual(1.0, model.progress)
Code Example #13: checks that the advantages computed by run() match a legacy GAE reference implementation.
 def test_ppo_gae(self):
     n_envs = 2
     gamma = 0.99
     lam = 0.95
     envs = [FakeImageEnv(max_steps=10) for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     n_samples = 20
     model = ppo_model.PPO(env, gamma=gamma, gae_lambda=lam)
     model.collect(n_samples)
     exp_gae = legacy_gae(
         rew   = model.buffer.data['rew'], 
         val   = model.buffer.data['val'], 
         done  = model.buffer.data['done'], 
         gamma = gamma, 
         lam   = lam
     )
     env.seed(1)
     model.run(n_samples)
     gae = model.buffer.data['adv']
     self.assertAllClose(exp_gae, gae)
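legacy_gae above is presumably a plain NumPy reference for generalized advantage estimation over the (n_samples, n_envs) buffer arrays. A minimal sketch under that assumption (the signature mirrors the call site, but this is not the project's actual reference; the bootstrapping of the final step in particular may differ):

 import numpy as np

 def legacy_gae(rew, val, done, gamma, lam):
     # rew, val, done: arrays of shape (n_samples, n_envs).
     # Backward recursion:
     #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
     #   A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
     adv = np.zeros_like(val, dtype=np.float64)
     last_adv = np.zeros(val.shape[1], dtype=np.float64)
     # No value estimate beyond the buffer is passed in, so the final step is
     # bootstrapped with zeros here; the actual reference may handle this differently.
     next_val = np.zeros(val.shape[1], dtype=np.float64)
     for t in reversed(range(len(val))):
         non_terminal = 1.0 - np.asarray(done[t], dtype=np.float64)
         delta = rew[t] + gamma * next_val * non_terminal - val[t]
         last_adv = delta + gamma * lam * non_terminal * last_adv
         adv[t] = last_adv
         next_val = val[t]
     return adv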
Code Example #14: checks that constructing PPO with a non-vectorized env raises RuntimeError.
 def test_ppo_not_vec_env(self):
     env = FakeContinuousEnv()
     with self.assertRaises(RuntimeError):
         ppo_model.PPO(env)
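All of the examples above are methods of a single test-case class and rely on a shared module header. A minimal sketch of the assumed scaffolding follows; the class name and the commented import paths are placeholders, not the project's actual layout:

 import tempfile
 import unittest

 import numpy as np

 # Module aliases as used throughout the tests; the real import paths depend on
 # the project layout, e.g. something along the lines of:
 #   from <package>.lib import utils as ub_utils
 #   from <package>.lib.envs import vec as ub_vec
 #   from <package>.algo.ppo import model as ppo_model
 # FakeContinuousEnv, FakeImageEnv and legacy_gae are helpers defined in the same
 # test module, and assertVariables / assertArrayEqual / assertAllClose are custom
 # assertions provided by the test base class.

 class TestPPOModel(unittest.TestCase):
     ...  # the test methods shown above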