def test_prioritized_sampler_with_replay_buffer(self):
    """Exercise PrioritizedSampler backed by a ReplayBuffer.

    Covers: lazy creation of the internal weight tree on first add,
    un-batched and batched sampling, priority updates via ``update``,
    empirical sampling frequencies after a priority change, and
    sequence (seq_len) sampling with per-step importance weights.
    """
    capacity = 10
    batch = 1  # NOTE(review): overwritten below by sample results; initial value unused
    alpha = 1.0
    # test create space
    buf = ub_data.ReplayBuffer(capacity)
    self.assertTrue(buf.isnull)
    samp = ub_data.PrioritizedSampler(buf, alpha)
    # weight tree is created lazily, only after the first add
    self.assertTrue(samp._weight_tree is None)
    # add one sample to create space
    samp.add(i=[1])
    self.assertTrue(samp._weight_tree is not None)
    self.assertEqual(buf.capacity, samp._weight_tree._size)
    self.assertTrue(samp._weight_tree._base > buf.capacity)
    # new samples enter with weight 1, so the tree sum tracks the count
    self.assertEqual(1, samp._weight_tree.sum())
    samp.add(i=[2])
    samp.add(i=[3])
    self.assertEqual(3, samp._weight_tree.sum())
    # test sample (batch=None)
    batch = samp.sample(beta=-1.0)
    self.assertEqual((3, ), batch['i'].shape)
    self.assertArrayEqual(np.ones((3, ), dtype=np.float32), batch['w'])
    # test sample (batch=2)
    ub_utils.set_seed(1) # i=[2, 3]
    batch_size = 2
    batch = samp.sample(batch_size=batch_size, beta=-1.0)
    self.assertArrayEqual([1, 2], samp.indices[0])
    self.assertEqual((2, ), batch['i'].shape)
    self.assertArrayEqual(np.ones((2, ), dtype=np.float32), batch['w'])
    # halve the priority of the two sampled entries (indices 1 and 2)
    samp.update(w=[0.5, 0.5])
    self.assertAllClose(0.5, samp._min_w) # exponent
    self.assertAllClose(1.0, samp._max_w) # exponent
    self.assertAllClose(2, samp._weight_tree.sum())
    # empirically verify sampling frequencies follow the new priorities
    # (weights 1.0, 0.5, 0.5 -> probabilities 0.5, 0.25, 0.25)
    batches = []
    for n in range(10000):
        batch = samp.sample(batch_size=batch_size, beta=-1.0) # i=[1, 1]
        batches.append(batch['i'])
    samples = np.asarray(batches).flatten()
    # NOTE(review): after the loop n == 9999, so the divisor n*2 is 19998
    # rather than the 20000 samples drawn; the error is absorbed by
    # atol=1e-2 — consider using len(samples) instead.
    self.assertAllClose(0.5, np.sum(samples == 1) / (n * 2), atol=1e-2)
    self.assertAllClose(0.25, np.sum(samples == 2) / (n * 2), atol=1e-2)
    self.assertAllClose(0.25, np.sum(samples == 3) / (n * 2), atol=1e-2)
    # test sample (batch=3, seq=2)
    ub_utils.set_seed(2) # i=[[1, 2], [1, 2], [2, 3]]
    batch_size = 3
    seq_len = 2
    batch = samp.sample(batch_size=batch_size, seq_len=seq_len, beta=-1.0)
    self.assertArrayEqual([[0, 1], [0, 1], [1, 2]], samp.indices[0])
    self.assertEqual((3, 2), batch['i'].shape)
    self.assertAllClose([[1, 0.5], [1, 0.5], [0.5, 0.5]], batch['w'], atol=1e-6)
    # per-step priority updates for sequence samples
    samp.update(w=[[0.1, 0.2], [1, 0.5], [0.4, 0.5]])
    self.assertAllClose(0.1, samp._min_w)
    self.assertAllClose(1.0, samp._max_w)
    self.assertAllClose(1.9, samp._weight_tree.sum())
def test_max_and_skip_env(self):
    """Smoke test for MaxAndSkipEnv: with skip=4 over a 20-step
    TimeLimit env, the episode should terminate on loop index 4
    (i.e. the fifth skipped step)."""
    frame_skip = 4
    env = atari.MaxAndSkipEnv(
        TimeLimit(gym.make(TEST_ENV_ID), 20), skip=frame_skip)
    env.seed(1)
    ub_utils.set_seed(1)
    env.reset()
    for step_idx in range(20):
        _, _, done, _ = env.step(env.action_space.sample())
        if done:
            break
    self.assertEqual(4, step_idx)
def test_ppo_train_with_target_kl(self):
    """PPO with a target_kl must stop sub-epochs early, so the final
    gradient-step and sub-epoch counters stay below their maxima."""
    n_envs = 3
    target_kl = 0.1
    envs = [FakeContinuousEnv() for _ in range(n_envs)]
    env = ub_vec.VecEnv(envs)
    env.seed(0)
    ub_utils.set_seed(0)
    model = ppo_model.PPO(env, target_kl=target_kl)
    n_samples = 10
    batch_size = 10
    n_subepochs = 4
    # gradient steps that would run if no early stopping happened
    exp_gradsteps = (n_samples * n_envs * n_subepochs) // batch_size
    model.run(n_samples)
    model.train(batch_size, n_subepochs)
    # strict '<' in both: early stop must actually have triggered
    # (second argument is the failure message)
    self.assertTrue(exp_gradsteps > model.num_gradsteps, model.num_gradsteps)
    self.assertTrue(n_subepochs > model.num_subepochs, model.num_subepochs)
def _init():
    """Build a fully wrapped Atari training env (closure over outer `a`).

    Wrapper order matters: NoopReset/MaxAndSkip operate on raw frames
    before EpisodicLife/FireReset, and WarpFrame/ClipReward/FrameStack
    shape the final observation/reward.
    """
    logger.Config.use(filename=a.logging, level=a.log_level, colored=True, reset=True)
    set_seed(a.seed)
    env = gym.make(a.env_id)
    env = ub.envs.NoopResetEnv(env, noop_max=30)   # random no-ops at reset
    env = ub.envs.MaxAndSkipEnv(env, skip=4)       # frame skip + max-pool
    #env = ub.envs.Monitor(env, root_dir=a.model_dir, prefix=f'{rank}.train', video=True)
    env = ub.envs.EpisodicLifeEnv(env)             # life loss ends episode
    env = ub.envs.FireResetEnv(env)                # press FIRE on reset
    env = ub.envs.WarpFrame(env)                   # resize/grayscale frames
    env = ub.envs.ClipRewardEnv(env)               # clip rewards
    env = ub.envs.FrameStack(env, 4)               # stack last 4 frames
    return env
def _init():
    """Build a seeded PyBullet training env (closure over outer `a`, `rank`).

    Optionally records video; always wraps with a per-rank Monitor.
    """
    logger.Config.use(filename=a.logging, level=a.log_level, colored=True, reset=True)
    set_seed(a.seed)
    # importing pybullet_envs registers the Bullet env ids with gym
    import pybullet_envs
    env = gym.make(a.env_id)
    env = TimeFeatureWrapper(env)
    # distinct seed per worker rank
    env = SeedEnv(env, seed=a.seed + rank)
    if a.record_video:
        env = VideoRecorder(env, os.path.join(a.monitor_dir, 'video/'),
                            prefix='train.{}'.format(rank), force=True)
    env = Monitor(env, a.monitor_dir, prefix=str(rank), force=True)
    return env
def test_dqn_prioritized(self, huber):
    """Prioritized DQN: weight tree starts uniform (all ones) after
    collection and is modified (to non-negative values) by a train step.

    `huber` is a parameterized argument (presumably selects Huber loss
    in the surrounding decorator) — unused in this body; TODO confirm.
    """
    n_envs = 3
    ub_utils.set_seed(1)
    envs = [FakeImageEnv() for _ in range(n_envs)]
    env = ub_vec.VecEnv(envs)
    model = dqn_model.DQN(env, prioritized=True, warmup_steps=5)
    self.assertTrue(isinstance(model.prio_beta, ub_sche.Scheduler))
    self.assertTrue(isinstance(model.sampler, ub_data.PriorSampler))
    n_samples = 10
    batch_size = 10
    model.run(n_samples)
    # before training, all collected transitions have priority 1
    res = model.sampler._weight_tree[:n_samples * n_envs]
    exp = np.ones_like(res, dtype=np.float32)
    self.assertArrayEqual(exp, res)
    model._train_step(batch_size)
    # a train step must update priorities, keeping them non-negative
    res = model.sampler._weight_tree[:n_samples * n_envs]
    self.assertArrayNotEqual(exp, res)
    self.assertTrue(np.all(res >= 0.0))
def test_ppo_learn(self):
    """End-to-end PPO `learn`: runs 2 epochs with eval/save/tensorboard
    enabled, reloads the checkpoint, and checks the step counters."""
    n_envs = 4
    n_steps = 125
    n_subepochs = 2
    n_epochs = 2
    batch_size = 50
    total_steps = n_envs * n_steps * n_epochs
    # gradsteps per epoch = ceil-ish rounding of (n_envs*n_steps)/batch_size
    total_gradsteps = (int((n_envs * n_steps)/batch_size+0.5)
                       * n_subepochs * n_epochs)
    envs = [FakeContinuousEnv() for _ in range(n_envs)]
    env = ub_vec.VecEnv(envs)
    eval_env = FakeContinuousEnv()
    env.seed(1)
    ub_utils.set_seed(1)
    model = ppo_model.PPO(
        env,
        batch_size=batch_size,
        n_steps=n_steps,
        n_subepochs=n_subepochs,
    )
    with tempfile.TemporaryDirectory() as tempdir:
        save_path = tempdir
        model.learn(
            total_steps,
            log_interval=1,
            eval_env=eval_env,
            eval_interval=1,
            eval_episodes=1,
            eval_max_steps=10,
            save_path=save_path,
            save_interval=1,
            tb_logdir=save_path,
            reset_timesteps=True,
            verbose=3
        )
        # test load weights
        ppo_model.PPO.load(save_path)
        # test model state
        self.assertEqual(total_steps, model.num_timesteps)
        self.assertEqual(n_epochs, model.num_epochs)
        self.assertEqual(n_subepochs*n_epochs, model.num_subepochs)
        self.assertEqual(total_gradsteps, model.num_gradsteps)
        self.assertEqual(1.0, model.progress)
def test_ppo_save_load(self):
    """Save/load round-trip for PPO: config, network variables and
    optimizer state must be restored, and training after the reload
    must produce identical losses/KL to the original model."""
    n_envs = 3
    envs = [FakeContinuousEnv() for _ in range(n_envs)]
    env = ub_vec.VecEnv(envs)
    env.seed(1)
    ub_utils.set_seed(1)
    model = ppo_model.PPO(env)
    n_samples = 10
    batch_size = 10
    model.run(n_samples)
    # train for some steps
    ub_utils.set_seed(2)
    batch = next(iter(model.sampler(batch_size)))
    model._train_model(batch)
    with tempfile.TemporaryDirectory() as tempdir:
        save_path = tempdir
        # save & load model
        model.save(save_path)
        loaded_model = ppo_model.PPO.load(save_path)
        # check model setup
        self.assertTrue(loaded_model.agent is not None)
        self.assertTrue(loaded_model.buffer is not None)
        self.assertTrue(loaded_model.optimizer is not None)
        # check if config is correctly restored
        model_config = model.get_config()
        loaded_config = loaded_model.get_config()
        self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
        for key in model_config:
            self.assertEqual(model_config[key], loaded_config[key])
        # check if all network variables are correctly restored
        self.assertVariables(model.trainable_variables,
                             loaded_model.trainable_variables)
        # test optimizers
        # load optimizer params
        batches = [batch for batch in model.sampler(batch_size)]
        # reseed before each training run so both models see the same RNG stream
        ub_utils.set_seed(1)
        for batch in batches:
            losses1, kl1 = model._train_step(batch)
        ub_utils.set_seed(1)
        for batch in batches:
            losses2, kl2 = loaded_model._train_step(batch)
        # check if losses are matched
        self.assertEqual(set(losses1.keys()), set(losses2.keys()))
        for key in losses1.keys():
            self.assertEqual(losses1[key], losses2[key])
        self.assertAllClose(kl1, kl2)
        # check if vars are same
        self.assertVariables(model.trainable_variables,
                             loaded_model.trainable_variables)
        # check if params of the optimizer are same
        self.assertVariables(model.optimizer.variables(),
                             loaded_model.optimizer.variables())
def test_ppo_param_order_non_delayed_vs_delayed(self):
    """A PPO built directly from an env and one built delayed (spaces
    passed explicitly, then setup()) must expose trainable and optimizer
    variables in the same order."""
    n_envs = 3
    envs = [FakeContinuousEnv() for _ in range(n_envs)]
    env = ub_vec.VecEnv(envs)
    env.seed(1)
    ub_utils.set_seed(1)
    obs_space = env.observation_space
    act_space = env.action_space
    model = ppo_model.PPO(env)
    n_samples = 10
    batch_size = 10
    model.run(n_samples)
    # train for some steps
    batch = next(iter(model.sampler(batch_size)))
    ub_utils.set_seed(1)
    model._train_model(batch)
    # delayed
    model2 = ppo_model.PPO(None, observation_space=obs_space,
                           action_space=act_space)
    model2.setup()
    ub_utils.set_seed(1)
    model2._train_model(batch)
    # check trainable variables order
    self.assertVariables(model.trainable_variables, model2.trainable_variables)
    # check optimizer variables order
    self.assertVariables(model.optimizer.variables(), model2.optimizer.variables())
def test_ppo_gae(self):
    """PPO's GAE advantages must match the reference `legacy_gae`
    computed from the same collected rewards/values/dones."""
    n_envs = 2
    gamma = 0.99
    lam = 0.95
    envs = [FakeImageEnv(max_steps=10) for _ in range(n_envs)]
    env = ub_vec.VecEnv(envs)
    env.seed(1)
    ub_utils.set_seed(1)
    n_samples = 20
    model = ppo_model.PPO(env, gamma=gamma, gae_lambda=lam)
    model.collect(n_samples)
    # reference advantage estimate from the collected rollout
    exp_gae = legacy_gae(
        rew = model.buffer.data['rew'],
        val = model.buffer.data['val'],
        done = model.buffer.data['done'],
        gamma = gamma,
        lam = lam
    )
    # reseed the env so `run` replays the same trajectory
    env.seed(1)
    model.run(n_samples)
    gae = model.buffer.data['adv']
    self.assertAllClose(exp_gae, gae)
def test_dqn_save_load(self):
    """Save/load round-trip for DQN: config, network variables and
    optimizer state must be restored, and training after the reload
    must produce identical losses/TD errors to the original model."""
    n_envs = 3
    envs = [FakeImageEnv() for _ in range(n_envs)]
    env = ub_vec.VecEnv(envs)
    env.seed(1)
    ub_utils.set_seed(1)
    model = dqn_model.DQN(env, warmup_steps=5)
    n_samples = 10
    batch_size = 10
    model.run(n_samples)
    # train for some steps
    ub_utils.set_seed(2)
    batch = model.sampler(batch_size)
    # next_obs comes from the relative (t+1) view of the sampler
    batch['next_obs'] = model.sampler.rel[1]['obs']
    model._train_model(batch)
    with tempfile.TemporaryDirectory() as tempdir:
        save_path = tempdir
        # save & load model
        model.save(save_path)
        loaded_model = dqn_model.DQN.load(save_path)
        # check model setup
        self.assertTrue(loaded_model.agent is not None)
        self.assertTrue(loaded_model.buffer is not None)
        self.assertTrue(loaded_model.optimizer is not None)
        # check if config is correctly restored
        model_config = model.get_config()
        loaded_config = loaded_model.get_config()
        self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
        for key in model_config:
            self.assertEqual(model_config[key], loaded_config[key], key)
        # check if all network variables are correctly restored
        self.assertVariables(model.trainable_variables,
                             loaded_model.trainable_variables)
        # test optimizers
        # load optimizer params
        batches = []
        for i in range(3):
            batch = model.sampler(batch_size)
            batch['next_obs'] = model.sampler.rel[1]['obs']
            batches.append(batch)
        ub_utils.set_seed(1)
        for batch in batches:
            losses1, td1 = model._train_model(batch)
        # NOTE(review): unlike the PPO save/load test, there is no
        # set_seed(1) before this second loop — presumably _train_model
        # is deterministic given a fixed batch; verify.
        for batch in batches:
            losses2, td2 = loaded_model._train_model(batch)
        # check if losses match
        self.assertEqual(set(losses1.keys()), set(losses2.keys()))
        for key in losses1.keys():
            self.assertEqual(losses1[key], losses2[key])
        self.assertAllClose(td1, td2)
        # check if vars are same
        self.assertVariables(model.trainable_variables,
                             loaded_model.trainable_variables)
        # check if params of the optimizers are same
        self.assertVariables(model.optimizer.variables(),
                             loaded_model.optimizer.variables())
def test_dqn_sample_nstep_batch(self):
    """Multi-step (n=2) DQN batch: obs/act come from step t, the reward
    is the 2-step discounted sum r_t + gamma*r_{t+1}, and next_obs is
    the observation at t+2."""
    ub_utils.set_seed(1)
    n_envs = 3
    gamma = 0.99
    multi_step = 2
    envs = [FakeImageEnv() for _ in range(n_envs)]
    env = ub_vec.VecEnv(envs)
    model = dqn_model.DQN(env, multi_step=multi_step, gamma=gamma)
    n_samples = 10
    batch_size = 1
    model.run(n_samples)
    samp = model.sampler
    batch = model._sample_nstep_batch(batch_size)
    # rel[0] is the sampled timestep t
    orig_batch = samp.rel[0]
    self.assertArrayEqual(orig_batch['obs'], batch['obs'])
    self.assertArrayEqual(orig_batch['act'], batch['act'])
    # reward is accumulated, so it must differ from the raw 1-step reward
    self.assertArrayNotEqual(orig_batch['rew'], batch['rew'])
    next_batch = samp.rel[1] # depends on random seed
    self.assertAllClose(orig_batch['rew'] + gamma * next_batch['rew'],
                        batch['rew'], atol=1e-6)
    # rel[2] is t+2: the n-step next observation
    nnext_batch = samp.rel[2]
    self.assertArrayEqual(nnext_batch['obs'], batch['next_obs'])
def set_test_seed():
    """Reset all random number generators to the module-wide TEST_SEED
    so every test starts from a reproducible RNG state."""
    utils.set_seed(TEST_SEED)
def test_permute_sampler_with_dynamic_buffer(self):
    """PermuteSampler over a DynamicBuffer: sampling before `make()`
    raises, then full / mini-batch / sequence iteration each cover the
    whole buffer with bounded per-element sample counts.

    Fix: the coverage checks used ``assertTrue(len(buf), len(unique))``,
    which always passes because the second argument is only the failure
    *message* — per the adjacent "contains all elements" comments the
    intent is equality, so they are now ``assertEqual``.
    """
    n_samples = 10
    batch_ = 2
    buf = ub_data.DynamicBuffer(batch_)
    samp = ub_data.PermuteSampler(buf)
    for i in range(n_samples):
        buf.add(a=[i, i + n_samples])
    self.assertEqual(n_samples * 2, len(buf))
    self.assertFalse(buf.ready_for_sample)
    self.assertFalse(buf.isfull)
    # test sample (batch=None)
    with self.assertRaises(RuntimeError):
        # buffer is not ready for sampling
        samp()
    buf.make()
    # test sample (batch=None): one permutation pass over the whole buffer
    batch_size = len(buf)
    batches = []
    indices = []
    ub_utils.set_seed(2)
    for batch in samp():
        self.assertArrayEqual((batch_size, ), batch['a'].shape)
        self.assertArrayEqual(buf[samp.indices]['a'], batch['a'])
        batches.append(batch)
        indices.append(
            np.ravel_multi_index(samp.indices, (buf.len_slots(), buf.batch)))
    self.assertEqual(1, len(batches))
    unique, counts = np.unique(indices, return_counts=True)
    # check if contains all elements (was: always-true assertTrue)
    self.assertEqual(len(buf), len(unique))
    # check if all elements are sampled exactly once
    self.assertTrue(np.all(counts == 1))
    # test sample (batch=3)
    batch_size = 3
    batches = []
    indices = []
    for batch in samp(batch_size=batch_size):
        self.assertArrayEqual((batch_size, ), batch['a'].shape)
        self.assertArrayEqual(buf[samp.indices]['a'], batch['a'])
        batches.append(batch)
        indices.append(
            np.ravel_multi_index(samp.indices, (buf.len_slots(), buf.batch)))
    self.assertEqual(7, len(batches)) # total samples == capacity
    unique, counts = np.unique(indices, return_counts=True)
    # check if contains all elements (was: always-true assertTrue)
    self.assertEqual(len(buf), len(unique))
    # check if all elements are sampled at least once but less than 2
    self.assertTrue(np.all(counts >= 1))
    self.assertTrue(np.all(counts <= 2))
    # test sample (batch=3, seq_len=2)
    batch_size = 3
    seq_len = 2
    batches = []
    indices = []
    ub_utils.set_seed(2)
    for batch in samp(batch_size=batch_size, seq_len=seq_len):
        self.assertArrayEqual((batch_size, seq_len), batch['a'].shape)
        self.assertArrayEqual(buf[samp.indices]['a'], batch['a'])
        batches.append(batch)
        indices.append(
            np.ravel_multi_index(samp.indices, (buf.len_slots(), buf.batch)))
    self.assertEqual(6, len(batches)) # total samples == capacity
    unique, counts = np.unique(indices, return_counts=True)
    # check if contains all elements (was: always-true assertTrue)
    self.assertEqual(len(buf), len(unique))
    # check if all elements are sampled at least once but less than 3
    self.assertTrue(np.all(counts >= 1))
    self.assertTrue(np.all(counts <= 3))
# Dump the run configuration to the log table, then build train/eval envs.
LOG.add_row('Verbose', a.verbose)
LOG.add_line()
LOG.add_row('Shared network', a.shared_net)
LOG.add_row('Force MLP', a.force_mlp)
LOG.add_row('Learning rate', a.lr)
LOG.add_row('Gamma', a.gamma)
LOG.add_row('Lambda', a.gae_lambda)
LOG.add_row('Clip range', a.clip_range)
LOG.add_row('Value clip range', a.clip_range_vf)
LOG.add_row('Entropy coef', a.ent_coef)
LOG.add_row('Value coef', a.vf_coef)
LOG.add_row('Max gradient norm', a.max_grad_norm)
LOG.add_row('Target KL', a.target_kl)
LOG.flush('WARNING')
set_seed(a.seed)
# === Make envs ===
# 'NoFrameskip' in the env id is used as the Atari-vs-PyBullet switch
if 'NoFrameskip' in a.env_id:
    # Atari env
    env = make_atari(a, eval=False)
    eval_env = make_atari(a, eval=True)
else:
    # Pybullet env
    env = make_env(a, eval=False)
    eval_env = make_env(a, eval=True)
# train and eval envs are seeded independently
env.seed(a.seed)
eval_env.seed(a.eval_seed)
LOG.debug('Action space: {}'.format(env.action_space))
def test_set_seed(self):
    """Seeding twice with the same value must reproduce the same
    numpy random draws."""
    utils.set_seed(1)
    first_draw = np.random.normal(size=(3,))
    utils.set_seed(1)
    second_draw = np.random.normal(size=(3,))
    self.assertArrayEqual(first_draw, second_draw)