def test_ppo_reset_spaces_conflict(self):
     n_envs = 4
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     with self.assertRaises(RuntimeError):
         # spaces conflict: the new env's observation/action spaces
         # differ from the ones the model was built with
         model.set_env(env)
 def test_ppo_param_order_non_delayed_vs_delayed(self):
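     # a model built directly from an env and one set up later from raw
     # spaces (delayed setup) must create their variables in the same order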
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     obs_space = env.observation_space
     act_space = env.action_space
     model = ppo_model.PPO(env)
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
     # take one training step on a sampled minibatch
     batch = next(iter(model.sampler(batch_size)))
     ub_utils.set_seed(1)
     model._train_model(batch)
     # delayed setup: build an identical model from raw spaces
     model2 = ppo_model.PPO(None, observation_space=obs_space,
                                  action_space=act_space)
     model2.setup()
     ub_utils.set_seed(1)
     model2._train_model(batch)
     # check trainable variables order
     self.assertVariables(model.trainable_variables,
                         model2.trainable_variables)
     # check optimizer variables order
     self.assertVariables(model.optimizer.variables(),
                         model2.optimizer.variables())
 def test_dqn_run(self):
     n_envs = 3
     warmup_steps = 50
     buffer_size = 90
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env,
                           buffer_size=buffer_size,
                           warmup_steps=warmup_steps)
     obs_shape = env.observation_space.shape
     act_shape = env.action_space.shape
     n_samples = 100
     n_slots = buffer_size // n_envs
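     # the replay buffer is laid out as (n_slots, n_envs, ...); 100 steps
     # overflow the 30 slots per env, so the circular buffer wraps and fills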
     model.run(n_samples)
     buf = model.buffer
     self.assertEqual(n_slots * n_envs, len(buf))
     self.assertTrue(buf.ready_for_sample)
     self.assertTrue(buf.isfull)
     # check shapes of the stored transitions
     self.assertArrayEqual((n_slots, n_envs, *obs_shape),
                           buf.data['obs'].shape)
     self.assertArrayEqual((n_slots, n_envs, *act_shape),
                           buf.data['act'].shape)
     self.assertArrayEqual((n_slots, n_envs), buf.data['rew'].shape)
     self.assertArrayEqual((n_slots, n_envs), buf.data['done'].shape)
 def test_dqn_call_predict(self):
     envs = [FakeImageEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env)
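     # __call__ returns (action, value) pairs; predict returns only actions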
     batch_size = 3
     obs_space = env.observation_space
     act_space = env.action_space
     act_dims = act_space.n
     obs = np.asarray([obs_space.sample() for _ in range(batch_size)])
     # test call
     act, val = model(obs, proc_obs=True)
     self.assertArrayEqual((batch_size, ), act.shape)
     self.assertArrayEqual((batch_size, ), val.shape)
     # test predict
     act = model.predict(obs_space.sample())
     self.assertArrayEqual([], act.shape)
     # test dueling
     model = dqn_model.DQN(env, dueling=True)
     act, val = model(obs, proc_obs=True)
     # test call
     self.assertArrayEqual((batch_size, ), act.shape)
     self.assertArrayEqual((batch_size, ), val.shape)
     # test predict
     act = model.predict(obs_space.sample())
     self.assertArrayEqual([], act.shape)
 def test_ppo_run(self):
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     obs_shape = env.observation_space.shape
     act_shape = env.action_space.shape
     n_samples = 100
     model.run(n_samples)
     buf = model.buffer
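     # the on-policy rollout buffer holds exactly n_samples * n_envs
     # entries and never wraps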
     self.assertEqual(n_samples*n_envs, len(buf))
     self.assertTrue(buf.ready_for_sample)
     self.assertFalse(buf.isfull)
     # check shapes of the stored transitions
     self.assertArrayEqual((n_samples, n_envs, *obs_shape), 
                           buf.data['obs'].shape)
     self.assertArrayEqual((n_samples, n_envs, *act_shape),
                           buf.data['act'].shape)
     self.assertArrayEqual((n_samples, n_envs),
                           buf.data['done'].shape)
     self.assertArrayEqual((n_samples, n_envs),
                           buf.data['rew'].shape)
     self.assertArrayEqual((n_samples, n_envs),
                           buf.data['val'].shape)
     self.assertArrayEqual((n_samples, n_envs),
                           buf.data['logp'].shape)
 def test_dqn_reg_loss(self):
     n_envs = 3
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env)
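     # reg_loss should reduce the agent's variables to a finite scalar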
     loss = model.reg_loss(model.agent.trainable_variables)
     self.assertArrayEqual([], loss.shape)
     self.assertFalse(np.all(np.isnan(loss)))
 def test_dqn_save_load(self):
     n_envs = 3
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     model = dqn_model.DQN(env, warmup_steps=5)
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
     # take one training step on a sampled batch
     ub_utils.set_seed(2)
     batch = model.sampler(batch_size)
     batch['next_obs'] = model.sampler.rel[1]['obs']
     model._train_model(batch)
     with tempfile.TemporaryDirectory() as tempdir:
         save_path = tempdir
         # save & load model
         model.save(save_path)
         loaded_model = dqn_model.DQN.load(save_path)
     # check model setup
     self.assertTrue(loaded_model.agent is not None)
     self.assertTrue(loaded_model.buffer is not None)
     self.assertTrue(loaded_model.optimizer is not None)
     # check if config is correctly restored
     model_config = model.get_config()
     loaded_config = loaded_model.get_config()
     self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
     for key in model_config:
         self.assertEqual(model_config[key], loaded_config[key], key)
     # check if all network variables are correctly restored
     self.assertVariables(model.trainable_variables,
                          loaded_model.trainable_variables)
     # test optimizers: replay the same batches through both models;
     # identical losses imply the optimizer state was restored correctly
     batches = []
     for i in range(3):
         batch = model.sampler(batch_size)
         batch['next_obs'] = model.sampler.rel[1]['obs']
         batches.append(batch)
     ub_utils.set_seed(1)
     for batch in batches:
         losses1, td1 = model._train_model(batch)
     # re-seed so the loaded model sees identical randomness
     ub_utils.set_seed(1)
     for batch in batches:
         losses2, td2 = loaded_model._train_model(batch)
     # check if losses match
     self.assertEqual(set(losses1.keys()), set(losses2.keys()))
     for key in losses1.keys():
         self.assertEqual(losses1[key], losses2[key])
     self.assertAllClose(td1, td2)
     # check if vars are same
     self.assertVariables(model.trainable_variables,
                          loaded_model.trainable_variables)
     # check if params of the optimizers are same
     self.assertVariables(model.optimizer.variables(),
                          loaded_model.optimizer.variables())
 def test_ppo_dual_clip_value_clip(self):
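     # PPO with dual clipping and value-function clipping enabled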
     n_envs = 4
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env, value_clip=0.1, dual_clip=0.1)
     n_samples = 10
     batch_size = 10
     n_subepochs = 4
     exp_gradsteps = n_samples * n_envs * n_subepochs / batch_size
     model.run(n_samples)
     model.train(batch_size, n_subepochs)
     self.assertEqual(exp_gradsteps, model.num_gradsteps)
 def test_ppo_delayed_setup(self):
     model = ppo_model.PPO(None)
     self.assertTrue(model.observation_space is None)
     self.assertTrue(model.action_space is None)
     self.assertTrue(model.agent is None)
     envs = [FakeContinuousEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     model.set_env(env)
     model.setup()
     self.assertTrue(model.observation_space is not None)
     self.assertTrue(model.action_space is not None)
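     # mlp(2) + mlp(2) + policy + value (default two-layer mlps)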
     self.assertEqual(4+4+3+2, len(model.trainable_variables))
 def test_dqn_delayed_setup(self):
     model = dqn_model.DQN(None)
     self.assertTrue(model.observation_space is None)
     self.assertTrue(model.action_space is None)
     self.assertTrue(model.agent is None)
     envs = [FakeImageEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     model.set_env(env)
     model.setup()
     self.assertTrue(model.observation_space is not None)
     self.assertTrue(model.action_space is not None)
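     # (nature_cnn + value) x (online + target networks)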
     self.assertEqual((8 + 2) * 2, len(model.trainable_variables))
 def test_ppo_save_load(self):
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     model = ppo_model.PPO(env)
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
     # take one training step on a sampled minibatch
     ub_utils.set_seed(2)
     batch = next(iter(model.sampler(batch_size)))
     model._train_model(batch)
     with tempfile.TemporaryDirectory() as tempdir:
         save_path = tempdir
         # save & load model
         model.save(save_path)
         loaded_model = ppo_model.PPO.load(save_path)
     # check model setup
     self.assertTrue(loaded_model.agent is not None)
     self.assertTrue(loaded_model.buffer is not None)
     self.assertTrue(loaded_model.optimizer is not None)
     # check if config is correctly restored
     model_config = model.get_config()
     loaded_config = loaded_model.get_config()
     self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
     for key in model_config:
         self.assertEqual(model_config[key], loaded_config[key])
     # check if all network variables are correctly restored
     self.assertVariables(model.trainable_variables,
                          loaded_model.trainable_variables)
     # test optimizers: replay the same batches through both models;
     # identical losses imply the optimizer state was restored correctly
     batches = list(model.sampler(batch_size))
     ub_utils.set_seed(1)
     for batch in batches:
         losses1, kl1 = model._train_step(batch)
     ub_utils.set_seed(1)
     for batch in batches:
         losses2, kl2 = loaded_model._train_step(batch)
     # check if losses match
     self.assertEqual(set(losses1.keys()), set(losses2.keys()))
     for key in losses1.keys():
         self.assertEqual(losses1[key], losses2[key])
     self.assertAllClose(kl1, kl2)
     # check if vars are same
     self.assertVariables(model.trainable_variables,
                          loaded_model.trainable_variables)
     # check if params of the optimizer are same
     self.assertVariables(model.optimizer.variables(),
                          loaded_model.optimizer.variables())
 def test_ppo_predict_batch(self):
     envs = [FakeContinuousEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     batch_size = 3
     obs_space = env.observation_space
     act_space = env.action_space
     act_dim = act_space.shape[0]
     obs = np.asarray([obs_space.sample() for _ in range(batch_size)])
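     # det=True requests deterministic actions; det=False samples from the policy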
     act = model.predict(obs, det=True)
     self.assertArrayEqual((batch_size, act_dim), act.shape)
     act = model.predict(obs, det=False)
     self.assertArrayEqual((batch_size, act_dim), act.shape)
 def test_ppo_train(self):
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     n_samples = 10
     batch_size = 10
     n_subepochs = 4
     exp_gradsteps = n_samples * n_envs * n_subepochs / batch_size
     model.run(n_samples)
     model.train(batch_size, n_subepochs)
     self.assertEqual(exp_gradsteps, model.num_gradsteps)
     self.assertEqual(n_subepochs, model.num_subepochs)
 def test_dqn_setup_image_obs(self):
     envs = [FakeImageEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env)
     self.assertEqual(3, model.n_envs)
     self.assertTrue(model.observation_space is not None)
     self.assertTrue(model.action_space is not None)
     # (nature_cnn + value) x (online + target networks)
     self.assertEqual((8 + 2) * 2, len(model.trainable_variables))
     # test dueling
     model = dqn_model.DQN(env, dueling=True)
     # (nature_cnn + dueling value) x (online + target networks)
     self.assertEqual((8 + 4) * 2, len(model.trainable_variables))
 # assumed: parameterized over the huber flag (original decorator not shown here)
 @parameterized.expand([(True,), (False,)])
 def test_dqn_train(self, huber):
     n_envs = 3
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env, huber=huber)
     n_samples = 10
     batch_size = 10
     n_gradsteps = 4
     n_subepochs = 1
     target_update = 2
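     # DQN performs n_gradsteps updates per subepoch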
     exp_gradsteps = n_subepochs * n_gradsteps
     model.run(n_samples)
     model.train(batch_size, n_subepochs, n_gradsteps, target_update)
     self.assertEqual(exp_gradsteps, model.num_gradsteps)
     self.assertEqual(n_subepochs, model.num_subepochs)
 def test_ppo_setup_non_image_obs(self):
     envs = [FakeContinuousEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     # no share net, mlp
     model = ppo_model.PPO(env, mlp_units=[64, 64, 64])
     self.assertEqual(3, model.n_envs)
     self.assertTrue(model.observation_space is not None)
     self.assertTrue(model.action_space is not None)
     # mlp(3) + mlp(3) + policy + value
     self.assertEqual(6+6+3+2, len(model.trainable_variables))
     # share net
     model = ppo_model.PPO(env, share_net=True,
                         force_mlp=False, mlp_units=[64, 64, 64])
     # shared mlp(3) + policy + value
     self.assertEqual(6+3+2, len(model.trainable_variables))
 def test_ppo_train_with_target_kl(self):
     n_envs = 3
     target_kl = 0.1
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(0)
     ub_utils.set_seed(0)
     model = ppo_model.PPO(env, target_kl=target_kl)
     n_samples = 10
     batch_size = 10
     n_subepochs = 4
     exp_gradsteps = (n_samples * n_envs * n_subepochs) // batch_size
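     # early stopping on target_kl should end training before the full budget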
     model.run(n_samples)
     model.train(batch_size, n_subepochs) 
     self.assertTrue(exp_gradsteps > model.num_gradsteps, model.num_gradsteps)
     self.assertTrue(n_subepochs > model.num_subepochs, model.num_subepochs)
 # assumed: parameterized over the huber flag (original decorator not shown here)
 @parameterized.expand([(True,), (False,)])
 def test_dqn_train_model(self, huber):
     n_envs = 3
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env, huber=huber)
     n_samples = 10
     batch_size = 2
     model.run(n_samples)
     samp = model.sampler
     batch = samp.sample(batch_size)
     batch['next_obs'] = samp.rel[1]['obs']
     losses, td = model._train_model(batch)
     for key, loss in losses.items():
         self.assertArrayEqual([], loss.shape)
         self.assertFalse(np.all(np.isnan(loss)))
     self.assertArrayEqual((batch_size, ), td.shape)
     self.assertFalse(np.all(np.isnan(td)))
     self.assertTrue(np.all(td >= 0))
 # assumed: parameterized over the huber flag (original decorator not shown here)
 @parameterized.expand([(True,), (False,)])
 def test_dqn_prioritized(self, huber):
     n_envs = 3
     ub_utils.set_seed(1)
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env, prioritized=True, warmup_steps=5)
     self.assertTrue(isinstance(model.prio_beta, ub_sche.Scheduler))
     self.assertTrue(isinstance(model.sampler, ub_data.PriorSampler))
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
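     # before any training, every transition should hold the initial max priority of 1.0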
     res = model.sampler._weight_tree[:n_samples * n_envs]
     exp = np.ones_like(res, dtype=np.float32)
     self.assertArrayEqual(exp, res)
     model._train_step(batch_size)
     res = model.sampler._weight_tree[:n_samples * n_envs]
     self.assertArrayNotEqual(exp, res)
     self.assertTrue(np.all(res >= 0.0))
 def test_ppo_setup_image_obs(self):
     envs = [FakeImageEnv() for _ in range(3)]
     env = ub_vec.VecEnv(envs)
     model = ppo_model.PPO(env)
     self.assertEqual(3, model.n_envs)
     self.assertTrue(model.observation_space is not None)
     self.assertTrue(model.action_space is not None)
     # nature_cnn + nature_cnn + policy + value
     self.assertEqual(8+8+2+2, len(model.trainable_variables))
     # test share net
     model = ppo_model.PPO(env, share_net=True)
     # nature_cnn + policy + value
     self.assertEqual(8+2+2, len(model.trainable_variables))
     # test force mlp
     model = ppo_model.PPO(env, share_net=False,
                         force_mlp=True, mlp_units=[64, 64, 64])
     # mlp(3) + mlp(3) + policy + value
     self.assertEqual(6+6+2+2, len(model.trainable_variables))
 def test_ppo_learn(self):
     n_envs = 4
     n_steps = 125
     n_subepochs = 2
     n_epochs = 2
     batch_size = 50
     total_steps = n_envs * n_steps * n_epochs
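     # grad steps: round(500 / 50) = 10 minibatches x 2 subepochs x 2 epochs = 40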
     total_gradsteps = (int((n_envs * n_steps)/batch_size+0.5)
                         * n_subepochs * n_epochs)
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     eval_env = FakeContinuousEnv()
     env.seed(1)
     ub_utils.set_seed(1)
     model = ppo_model.PPO(
         env, 
         batch_size=batch_size,
         n_steps=n_steps,
         n_subepochs=n_subepochs,
     )
     with tempfile.TemporaryDirectory() as tempdir:
         save_path = tempdir
         model.learn(
             total_steps,
             log_interval=1,
             eval_env=eval_env,
             eval_interval=1,
             eval_episodes=1,
             eval_max_steps=10,
             save_path=save_path,
             save_interval=1,
             tb_logdir=save_path,
             reset_timesteps=True,
             verbose=3
         )
         # test load weights
         ppo_model.PPO.load(save_path)
         # test model state
         self.assertEqual(total_steps, model.num_timesteps)
         self.assertEqual(n_epochs, model.num_epochs)
         self.assertEqual(n_subepochs*n_epochs, model.num_subepochs)
         self.assertEqual(total_gradsteps, model.num_gradsteps)
         self.assertEqual(1.0, model.progress)
 # assumed: parameterized over the huber flag (original decorator not shown here)
 @parameterized.expand([(True,), (False,)])
 def test_dqn_td_error(self, huber):
     n_envs = 3
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env, huber=huber)
     obs = env.observation_space.sample()
     obs = obs.reshape(1, *obs.shape)
     next_obs = env.observation_space.sample()
     next_obs = next_obs.reshape(1, *next_obs.shape)
     act = env.action_space.sample()
     act = np.asarray([act], dtype=np.int64)
     done = np.asarray([False], dtype=np.bool_)
     rew = np.asarray([1.0], dtype=np.float32)
     # test td error
     td = model.td_error(obs, act, done, rew, next_obs)
     self.assertArrayEqual((1, ), td.shape)
     self.assertFalse(np.all(np.isnan(td)))
     # test td loss
     loss = model.td_loss(td)
     self.assertArrayEqual([], loss.shape)
     self.assertFalse(np.all(np.isnan(loss)))
 def test_ppo_gae(self):
     n_envs = 2
     gamma = 0.99
     lam = 0.95
     envs = [FakeImageEnv(max_steps=10) for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     n_samples = 20
     model = ppo_model.PPO(env, gamma=gamma, gae_lambda=lam)
     model.collect(n_samples)
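     # reference advantages computed by the legacy GAE implementation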
     exp_gae = legacy_gae(
         rew   = model.buffer.data['rew'], 
         val   = model.buffer.data['val'], 
         done  = model.buffer.data['done'], 
         gamma = gamma, 
         lam   = lam
     )
     env.seed(1)
     model.run(n_samples)
     gae = model.buffer.data['adv']
     self.assertAllClose(exp_gae, gae)
 def test_dqn_sample_nstep_batch(self):
     ub_utils.set_seed(1)
     n_envs = 3
     gamma = 0.99
     multi_step = 2
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env, multi_step=multi_step, gamma=gamma)
     n_samples = 10
     batch_size = 1
     model.run(n_samples)
     samp = model.sampler
     batch = model._sample_nstep_batch(batch_size)
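     # samp.rel[k] views the last-sampled transitions shifted k steps ahead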
     orig_batch = samp.rel[0]
     self.assertArrayEqual(orig_batch['obs'], batch['obs'])
     self.assertArrayEqual(orig_batch['act'], batch['act'])
     self.assertArrayNotEqual(orig_batch['rew'], batch['rew'])
     next_batch = samp.rel[1]
     # 2-step reward: rew_t + gamma * rew_{t+1}; valid because no episode
     # terminates between these steps for this seed
     self.assertAllClose(orig_batch['rew'] + gamma * next_batch['rew'],
                         batch['rew'],
                         atol=1e-6)
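     # with multi_step=2, next_obs should be the observation two steps ahead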
     nnext_batch = samp.rel[2]
     self.assertArrayEqual(nnext_batch['obs'], batch['next_obs'])