def test_ppo_reset_spaces_conflict(self):
    """Setting an env whose spaces differ from the model's must raise."""
    num_envs = 4
    cont_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    model = ppo_model.PPO(cont_env)
    # image observations conflict with the continuous spaces above
    image_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    with self.assertRaises(RuntimeError):  # space conflict
        model.set_env(image_env)
def test_ppo_param_order_non_delayed_vs_delayed(self):
    """Variable creation order must match between eager and delayed setup."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    vec_env.seed(1)
    ub_utils.set_seed(1)
    obs_space = vec_env.observation_space
    act_space = vec_env.action_space
    model = ppo_model.PPO(vec_env)
    n_samples = 10
    batch_size = 10
    # collect a rollout and take one training step
    model.run(n_samples)
    batch = next(iter(model.sampler(batch_size)))
    ub_utils.set_seed(1)
    model._train_model(batch)
    # delayed construction: spaces passed explicitly, setup() called manually
    model2 = ppo_model.PPO(None, observation_space=obs_space, action_space=act_space)
    model2.setup()
    ub_utils.set_seed(1)
    model2._train_model(batch)
    # trainable variables must be created in the same order
    self.assertVariables(model.trainable_variables, model2.trainable_variables)
    # optimizer slot variables must follow the same order too
    self.assertVariables(model.optimizer.variables(), model2.optimizer.variables())
def test_dqn_run(self):
    """run() should fill the circular replay buffer completely."""
    num_envs = 3
    warmup_steps = 50
    buffer_size = 90
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    model = dqn_model.DQN(vec_env, buffer_size=buffer_size, warmup_steps=warmup_steps)
    obs_shape = vec_env.observation_space.shape
    act_shape = vec_env.action_space.shape
    n_samples = 100
    # the buffer is laid out as (slots, envs); 100 steps overflow 90/3 slots
    n_slots = buffer_size // num_envs
    model.run(n_samples)
    replay = model.buffer
    self.assertEqual(n_slots * num_envs, len(replay))
    self.assertTrue(replay.ready_for_sample)
    self.assertTrue(replay.isfull)
    # every stored field keeps the (slot, env, ...) layout
    self.assertArrayEqual((n_slots, num_envs, *obs_shape), replay.data['obs'].shape)
    self.assertArrayEqual((n_slots, num_envs, *act_shape), replay.data['act'].shape)
    self.assertArrayEqual((n_slots, num_envs), replay.data['rew'].shape)
    self.assertArrayEqual((n_slots, num_envs), replay.data['done'].shape)
def test_dqn_call_predict(self):
    """__call__ and predict return correctly shaped actions and values.

    Covers both the plain and the dueling Q-network variants.
    """
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(3)])
    model = dqn_model.DQN(vec_env)
    batch_size = 3
    obs_space = vec_env.observation_space
    # NOTE: the original also read action_space.n into an unused local;
    # dropped, since nothing here depends on the action dimensionality.
    batch_obs = np.asarray([obs_space.sample() for _ in range(batch_size)])
    # test call: batched input -> batched action/value
    act, val = model(batch_obs, proc_obs=True)
    self.assertArrayEqual((batch_size, ), act.shape)
    self.assertArrayEqual((batch_size, ), val.shape)
    # test predict: a single un-batched observation yields a scalar action
    act = model.predict(obs_space.sample())
    self.assertArrayEqual([], act.shape)
    # repeat both checks with a dueling head
    model = dqn_model.DQN(vec_env, dueling=True)
    act, val = model(batch_obs, proc_obs=True)  # test call
    self.assertArrayEqual((batch_size, ), act.shape)
    self.assertArrayEqual((batch_size, ), val.shape)
    # test predict
    act = model.predict(obs_space.sample())
    self.assertArrayEqual([], act.shape)
def test_ppo_run(self):
    """run() stores n_samples steps per env in the rollout buffer."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    model = ppo_model.PPO(vec_env)
    obs_shape = vec_env.observation_space.shape
    act_shape = vec_env.action_space.shape
    n_samples = 100
    model.run(n_samples)
    rollout = model.buffer
    self.assertEqual(n_samples * num_envs, len(rollout))
    self.assertTrue(rollout.ready_for_sample)
    self.assertFalse(rollout.isfull)
    # every stored field keeps the (step, env, ...) layout
    self.assertArrayEqual((n_samples, num_envs, *obs_shape), rollout.data['obs'].shape)
    self.assertArrayEqual((n_samples, num_envs, *act_shape), rollout.data['act'].shape)
    self.assertArrayEqual((n_samples, num_envs), rollout.data['done'].shape)
    self.assertArrayEqual((n_samples, num_envs), rollout.data['rew'].shape)
    self.assertArrayEqual((n_samples, num_envs), rollout.data['val'].shape)
    self.assertArrayEqual((n_samples, num_envs), rollout.data['logp'].shape)
def test_dqn_reg_loss(self):
    """Regularization loss over the agent's variables is a finite scalar."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    model = dqn_model.DQN(vec_env)
    reg = model.reg_loss(model.agent.trainable_variables)
    self.assertArrayEqual([], reg.shape)
    self.assertFalse(np.all(np.isnan(reg)))
def test_dqn_save_load(self):
    """Saving then loading a DQN restores config, weights and optimizer.

    After the round-trip, training both models on identical batches must
    produce identical losses, TD errors, weights and optimizer state.
    """
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    vec_env.seed(1)
    ub_utils.set_seed(1)
    model = dqn_model.DQN(vec_env, warmup_steps=5)
    n_samples = 10
    batch_size = 10
    # collect some transitions and take one training step
    model.run(n_samples)
    ub_utils.set_seed(2)
    batch = model.sampler(batch_size)
    batch['next_obs'] = model.sampler.rel[1]['obs']
    model._train_model(batch)
    with tempfile.TemporaryDirectory() as tempdir:
        save_path = tempdir
        # save & load model
        model.save(save_path)
        loaded_model = dqn_model.DQN.load(save_path)
        # model components must be re-created on load
        self.assertTrue(loaded_model.agent is not None)
        self.assertTrue(loaded_model.buffer is not None)
        self.assertTrue(loaded_model.optimizer is not None)
        # config must round-trip exactly
        model_config = model.get_config()
        loaded_config = loaded_model.get_config()
        self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
        for key in model_config:
            self.assertEqual(model_config[key], loaded_config[key], key)
        # network weights must round-trip exactly
        self.assertVariables(model.trainable_variables, loaded_model.trainable_variables)
        # train both models on identical batches
        batches = []
        for _ in range(3):
            batch = model.sampler(batch_size)
            batch['next_obs'] = model.sampler.rel[1]['obs']
            batches.append(batch)
        ub_utils.set_seed(1)
        for batch in batches:
            losses1, td1 = model._train_model(batch)
        # FIX: reseed before the loaded model's loop as well, so both loops
        # consume the same RNG stream (mirrors test_ppo_save_load)
        ub_utils.set_seed(1)
        for batch in batches:
            losses2, td2 = loaded_model._train_model(batch)
        # losses of the final step must match exactly
        self.assertEqual(set(losses1.keys()), set(losses2.keys()))
        for key in losses1.keys():
            self.assertEqual(losses1[key], losses2[key])
        self.assertAllClose(td1, td2)
        # weights must still be identical after training
        self.assertVariables(model.trainable_variables, loaded_model.trainable_variables)
        # optimizer state must still be identical after training
        self.assertVariables(model.optimizer.variables(), loaded_model.optimizer.variables())
def test_ppo_dual_clip_valu_clip(self):
    """Training with both value clipping and dual clipping enabled runs.

    This is a smoke test: it only verifies that both clipping code paths
    execute without error, not the resulting gradient-step counts.
    """
    num_envs = 4
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    # small clip ranges just to exercise both clipping branches
    model = ppo_model.PPO(vec_env, value_clip=0.1, dual_clip=0.1)
    n_samples = 10
    batch_size = 10
    n_subepochs = 4
    # (the original computed an expected grad-step count but never
    # asserted on it — the unused local has been removed)
    model.run(n_samples)
    model.train(batch_size, n_subepochs)
def test_ppo_delayed_setup(self):
    """PPO built without an env defers network construction to setup()."""
    model = ppo_model.PPO(None)
    # nothing is built before an env is attached
    self.assertTrue(model.observation_space is None)
    self.assertTrue(model.action_space is None)
    self.assertTrue(model.agent is None)
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(3)])
    model.set_env(vec_env)
    model.setup()
    self.assertTrue(model.observation_space is not None)
    self.assertTrue(model.action_space is not None)
    # expected variable count for the default two-network architecture
    self.assertEqual(4 + 4 + 3 + 2, len(model.trainable_variables))
def test_dqn_delayed_setup(self):
    """DQN built without an env defers network construction to setup()."""
    model = dqn_model.DQN(None)
    # nothing is built before an env is attached
    self.assertTrue(model.observation_space is None)
    self.assertTrue(model.action_space is None)
    self.assertTrue(model.agent is None)
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(3)])
    model.set_env(vec_env)
    model.setup()
    self.assertTrue(model.observation_space is not None)
    self.assertTrue(model.action_space is not None)
    # (nature_cnn + value) x 2 — presumably online + target networks
    self.assertEqual((8 + 2) * 2, len(model.trainable_variables))
def test_ppo_save_load(self):
    """Saving then loading a PPO restores config, weights and optimizer."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    vec_env.seed(1)
    ub_utils.set_seed(1)
    model = ppo_model.PPO(vec_env)
    n_samples = 10
    batch_size = 10
    # collect a rollout and take one training step
    model.run(n_samples)
    ub_utils.set_seed(2)
    batch = next(iter(model.sampler(batch_size)))
    model._train_model(batch)
    with tempfile.TemporaryDirectory() as tempdir:
        save_path = tempdir
        # save & load model
        model.save(save_path)
        loaded_model = ppo_model.PPO.load(save_path)
        # model components must be re-created on load
        self.assertTrue(loaded_model.agent is not None)
        self.assertTrue(loaded_model.buffer is not None)
        self.assertTrue(loaded_model.optimizer is not None)
        # config must round-trip exactly
        model_config = model.get_config()
        loaded_config = loaded_model.get_config()
        self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
        for key in model_config:
            self.assertEqual(model_config[key], loaded_config[key])
        # network weights must round-trip exactly
        self.assertVariables(model.trainable_variables, loaded_model.trainable_variables)
        # train both models on identical batches from the same seed:
        # losses, weights and optimizer state must stay in lockstep
        batches = list(model.sampler(batch_size))
        ub_utils.set_seed(1)
        for batch in batches:
            losses1, kl1 = model._train_step(batch)
        ub_utils.set_seed(1)
        for batch in batches:
            losses2, kl2 = loaded_model._train_step(batch)
        # losses of the final step must match exactly
        self.assertEqual(set(losses1.keys()), set(losses2.keys()))
        for key in losses1.keys():
            self.assertEqual(losses1[key], losses2[key])
        self.assertAllClose(kl1, kl2)
        # weights must still be identical after training
        self.assertVariables(model.trainable_variables, loaded_model.trainable_variables)
        # optimizer state must still be identical after training
        self.assertVariables(model.optimizer.variables(), loaded_model.optimizer.variables())
def test_ppo_predict_batch(self):
    """predict() preserves the batch dimension for batched observations."""
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(3)])
    model = ppo_model.PPO(vec_env)
    batch_size = 3
    obs_space = vec_env.observation_space
    act_dim = vec_env.action_space.shape[0]
    batch_obs = np.asarray([obs_space.sample() for _ in range(batch_size)])
    # deterministic and stochastic predictions share the same output shape
    for det in (True, False):
        act = model.predict(batch_obs, det=det)
        self.assertArrayEqual((batch_size, act_dim), act.shape)
def test_ppo_train(self):
    """train() performs the expected number of gradient steps/subepochs."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    model = ppo_model.PPO(vec_env)
    n_samples = 10
    batch_size = 10
    n_subepochs = 4
    # 10 steps * 3 envs * 4 subepochs / batch 10 -> 12 gradient steps
    # (integer division, consistent with test_ppo_train_with_target_kl)
    exp_gradsteps = n_samples * num_envs * n_subepochs // batch_size
    model.run(n_samples)
    model.train(batch_size, n_subepochs)
    self.assertEqual(exp_gradsteps, model.num_gradsteps)
    self.assertEqual(n_subepochs, model.num_subepochs)
def test_dqn_setup_image_obs(self):
    """DQN builds CNN-based Q-networks for image observations."""
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(3)])
    model = dqn_model.DQN(vec_env)
    self.assertEqual(3, model.n_envs)
    self.assertTrue(model.observation_space is not None)
    self.assertTrue(model.action_space is not None)
    # (nature_cnn + value) x 2 networks
    self.assertEqual((8 + 2) * 2, len(model.trainable_variables))
    # the dueling head adds extra value-stream variables
    model = dqn_model.DQN(vec_env, dueling=True)
    # (nature_cnn + value(dueling)) x 2 networks
    self.assertEqual((8 + 4) * 2, len(model.trainable_variables))
def test_dqn_train(self, huber):
    """train() runs the configured number of gradient steps (huber-parametrized)."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    model = dqn_model.DQN(vec_env, huber=huber)
    n_samples = 10
    batch_size = 10
    n_gradsteps = 4
    n_subepochs = 1
    target_update = 2
    # DQN does a fixed number of gradient steps per subepoch
    exp_gradsteps = n_subepochs * n_gradsteps
    model.run(n_samples)
    model.train(batch_size, n_subepochs, n_gradsteps, target_update)
    self.assertEqual(exp_gradsteps, model.num_gradsteps)
    self.assertEqual(n_subepochs, model.num_subepochs)
def test_ppo_setup_non_image_obs(self):
    """MLP feature extractors are used for non-image observations."""
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(3)])
    # separate policy/value nets, 3-layer MLP each
    model = ppo_model.PPO(vec_env, mlp_units=[64, 64, 64])
    self.assertEqual(3, model.n_envs)
    self.assertTrue(model.observation_space is not None)
    self.assertTrue(model.action_space is not None)
    # mlp(3) + mlp(3) + policy + value
    self.assertEqual(6 + 6 + 3 + 2, len(model.trainable_variables))
    # shared feature extractor
    model = ppo_model.PPO(vec_env, share_net=True, force_mlp=False, mlp_units=[64, 64, 64])
    # mlp(3) + policy + value
    self.assertEqual(6 + 3 + 2, len(model.trainable_variables))
def test_ppo_train_with_target_kl(self):
    """Training stops early once the KL divergence exceeds target_kl."""
    num_envs = 3
    target_kl = 0.1
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    vec_env.seed(0)
    ub_utils.set_seed(0)
    model = ppo_model.PPO(vec_env, target_kl=target_kl)
    n_samples = 10
    batch_size = 10
    n_subepochs = 4
    # full schedule without early stopping
    exp_gradsteps = (n_samples * num_envs * n_subepochs) // batch_size
    model.run(n_samples)
    model.train(batch_size, n_subepochs)
    # early stopping must have cut training short of the full schedule
    self.assertTrue(exp_gradsteps > model.num_gradsteps, model.num_gradsteps)
    self.assertTrue(n_subepochs > model.num_subepochs, model.num_subepochs)
def test_dqn_train_model(self, huber):
    """_train_model returns scalar losses and non-negative TD errors."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    model = dqn_model.DQN(vec_env, huber=huber)
    n_samples = 10
    batch_size = 2
    model.run(n_samples)
    samp = model.sampler
    batch = samp.sample(batch_size)
    # next_obs comes from one step ahead of the sampled indices
    batch['next_obs'] = samp.rel[1]['obs']
    losses, td = model._train_model(batch)
    # every reported loss is a finite scalar
    for loss in losses.values():
        self.assertArrayEqual([], loss.shape)
        self.assertFalse(np.all(np.isnan(loss)))
    # per-sample TD errors are finite and non-negative
    self.assertArrayEqual((batch_size, ), td.shape)
    self.assertFalse(np.all(np.isnan(td)))
    self.assertTrue(np.all(td >= 0))
def test_dqn_prioritized(self, huber):
    """Prioritized replay re-weights samples after a training step."""
    num_envs = 3
    ub_utils.set_seed(1)
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    model = dqn_model.DQN(vec_env, prioritized=True, warmup_steps=5)
    # prioritized mode swaps in a beta schedule and a prioritized sampler
    self.assertTrue(isinstance(model.prio_beta, ub_sche.Scheduler))
    self.assertTrue(isinstance(model.sampler, ub_data.PriorSampler))
    n_samples = 10
    batch_size = 10
    model.run(n_samples)
    # all priorities start out at 1
    weights = model.sampler._weight_tree[:n_samples * num_envs]
    exp = np.ones_like(weights, dtype=np.float32)
    self.assertArrayEqual(exp, weights)
    model._train_step(batch_size)
    # training must have changed at least some weights; all stay >= 0
    weights = model.sampler._weight_tree[:n_samples * num_envs]
    self.assertArrayNotEqual(exp, weights)
    self.assertTrue(np.all(weights >= 0.0))
def test_ppo_setup_image_obs(self):
    """Image observations get nature-CNN extractors unless force_mlp is set."""
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(3)])
    model = ppo_model.PPO(vec_env)
    self.assertEqual(3, model.n_envs)
    self.assertTrue(model.observation_space is not None)
    self.assertTrue(model.action_space is not None)
    # nature_cnn + nature_cnn + policy + value
    self.assertEqual(8 + 8 + 2 + 2, len(model.trainable_variables))
    # shared feature extractor: nature_cnn + policy + value
    model = ppo_model.PPO(vec_env, share_net=True)
    self.assertEqual(8 + 2 + 2, len(model.trainable_variables))
    # force MLP even for image input: mlp(3) + mlp(3) + policy + value
    model = ppo_model.PPO(vec_env, share_net=False, force_mlp=True, mlp_units=[64, 64, 64])
    self.assertEqual(6 + 6 + 2 + 2, len(model.trainable_variables))
def test_ppo_learn(self):
    """End-to-end learn() with evaluation, checkpointing and tb logging."""
    num_envs = 4
    n_steps = 125
    n_subepochs = 2
    n_epochs = 2
    batch_size = 50
    total_steps = num_envs * n_steps * n_epochs
    # batches per epoch (rounded) * subepochs * epochs
    total_gradsteps = (int((num_envs * n_steps) / batch_size + 0.5)
                       * n_subepochs * n_epochs)
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    eval_env = FakeContinuousEnv()
    vec_env.seed(1)
    ub_utils.set_seed(1)
    model = ppo_model.PPO(
        vec_env,
        batch_size=batch_size,
        n_steps=n_steps,
        n_subepochs=n_subepochs,
    )
    with tempfile.TemporaryDirectory() as tempdir:
        save_path = tempdir
        model.learn(
            total_steps,
            log_interval=1,
            eval_env=eval_env,
            eval_interval=1,
            eval_episodes=1,
            eval_max_steps=10,
            save_path=save_path,
            save_interval=1,
            tb_logdir=save_path,
            reset_timesteps=True,
            verbose=3
        )
        # checkpoints written during learn() must be loadable
        ppo_model.PPO.load(save_path)
        # counters must reflect the full training schedule
        self.assertEqual(total_steps, model.num_timesteps)
        self.assertEqual(n_epochs, model.num_epochs)
        self.assertEqual(n_subepochs * n_epochs, model.num_subepochs)
        self.assertEqual(total_gradsteps, model.num_gradsteps)
        self.assertEqual(1.0, model.progress)
def test_dqn_td_error(self, huber):
    """td_error and td_loss produce finite values of the right shape."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    model = dqn_model.DQN(vec_env, huber=huber)
    # build a single-transition batch by hand (batch dimension of 1)
    obs = vec_env.observation_space.sample()
    obs = obs.reshape(1, *obs.shape)
    next_obs = vec_env.observation_space.sample()
    next_obs = next_obs.reshape(1, *next_obs.shape)
    act = np.asarray([vec_env.action_space.sample()], dtype=np.int64)
    done = np.asarray([False], dtype=np.bool_)
    rew = np.asarray([1.0], dtype=np.float32)
    # per-sample TD error
    td = model.td_error(obs, act, done, rew, next_obs)
    self.assertArrayEqual((1, ), td.shape)
    self.assertFalse(np.all(np.isnan(td)))
    # scalar TD loss
    loss = model.td_loss(td)
    self.assertArrayEqual([], loss.shape)
    self.assertFalse(np.all(np.isnan(loss)))
def test_ppo_gae(self):
    """Model-computed GAE matches the legacy reference implementation."""
    num_envs = 2
    gamma = 0.99
    lam = 0.95
    vec_env = ub_vec.VecEnv([FakeImageEnv(max_steps=10) for _ in range(num_envs)])
    vec_env.seed(1)
    ub_utils.set_seed(1)
    n_samples = 20
    model = ppo_model.PPO(vec_env, gamma=gamma, gae_lambda=lam)
    model.collect(n_samples)
    # reference advantages computed from the rollout just collected
    exp_gae = legacy_gae(
        rew=model.buffer.data['rew'],
        val=model.buffer.data['val'],
        done=model.buffer.data['done'],
        gamma=gamma,
        lam=lam
    )
    # NOTE(review): only the env is reseeded before the second rollout;
    # the global RNG is not — presumably the rollout is still reproducible
    # without it. Confirm if this test ever turns flaky.
    vec_env.seed(1)
    model.run(n_samples)
    gae = model.buffer.data['adv']
    self.assertAllClose(exp_gae, gae)
def test_dqn_sample_nstep_batch(self):
    """Multi-step sampling accumulates rewards and shifts next_obs ahead."""
    ub_utils.set_seed(1)
    num_envs = 3
    gamma = 0.99
    multi_step = 2
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    model = dqn_model.DQN(vec_env, multi_step=multi_step, gamma=gamma)
    n_samples = 10
    batch_size = 1
    model.run(n_samples)
    samp = model.sampler
    batch = model._sample_nstep_batch(batch_size)
    base = samp.rel[0]
    # obs/act come from the sampled step unchanged
    self.assertArrayEqual(base['obs'], batch['obs'])
    self.assertArrayEqual(base['act'], batch['act'])
    # rewards are accumulated, so they no longer equal the raw step reward
    self.assertArrayNotEqual(base['rew'], batch['rew'])
    step1 = samp.rel[1]
    # r0 + gamma * r1 — exact value depends on the random seed above
    self.assertAllClose(base['rew'] + gamma * step1['rew'], batch['rew'], atol=1e-6)
    # next_obs points two steps ahead (multi_step == 2)
    step2 = samp.rel[2]
    self.assertArrayEqual(step2['obs'], batch['next_obs'])