def test_prioritized_sampler_with_replay_buffer(self):
     """PrioritizedSampler over a ReplayBuffer: lazy weight-tree creation,
     priority-proportional sampling frequencies, and priority updates.

     Fix: the empirical-frequency checks previously divided by ``n * 2``
     where ``n`` was the loop variable after the loop (9999), undercounting
     the 10000 * batch_size draws by one iteration. Use the true sample
     count instead.
     """
     capacity = 10
     alpha = 1.0
     # test create space -- the weight tree is created lazily on first add()
     buf = ub_data.ReplayBuffer(capacity)
     self.assertTrue(buf.isnull)
     samp = ub_data.PrioritizedSampler(buf, alpha)
     self.assertTrue(samp._weight_tree is None)
     # add one sample to create space
     samp.add(i=[1])
     self.assertTrue(samp._weight_tree is not None)
     self.assertEqual(buf.capacity, samp._weight_tree._size)
     self.assertTrue(samp._weight_tree._base > buf.capacity)
     self.assertEqual(1, samp._weight_tree.sum())
     samp.add(i=[2])
     samp.add(i=[3])
     self.assertEqual(3, samp._weight_tree.sum())
     # test sample (batch=None)
     batch = samp.sample(beta=-1.0)
     self.assertEqual((3, ), batch['i'].shape)
     self.assertArrayEqual(np.ones((3, ), dtype=np.float32), batch['w'])
     # test sample (batch=2)
     ub_utils.set_seed(1)  # i=[2, 3]
     batch_size = 2
     batch = samp.sample(batch_size=batch_size, beta=-1.0)
     self.assertArrayEqual([1, 2], samp.indices[0])
     self.assertEqual((2, ), batch['i'].shape)
     self.assertArrayEqual(np.ones((2, ), dtype=np.float32), batch['w'])
     samp.update(w=[0.5, 0.5])
     self.assertAllClose(0.5, samp._min_w)  # exponent
     self.assertAllClose(1.0, samp._max_w)  # exponent
     self.assertAllClose(2, samp._weight_tree.sum())
     # empirical frequencies: weights are [1.0, 0.5, 0.5], so expected
     # sampling proportions are [0.5, 0.25, 0.25]
     n_iters = 10000
     batches = []
     for _ in range(n_iters):
         batch = samp.sample(batch_size=batch_size, beta=-1.0)  # i=[1, 1]
         batches.append(batch['i'])
     samples = np.asarray(batches).flatten()
     # divide by the actual number of draws (n_iters * batch_size),
     # not by (loop-variable * 2) which was off by one iteration
     n_draws = samples.size
     self.assertAllClose(0.5, np.sum(samples == 1) / n_draws, atol=1e-2)
     self.assertAllClose(0.25, np.sum(samples == 2) / n_draws, atol=1e-2)
     self.assertAllClose(0.25, np.sum(samples == 3) / n_draws, atol=1e-2)
     # test sample (batch=3, seq=2)
     ub_utils.set_seed(2)  # i=[[1, 2], [1, 2], [2, 3]]
     batch_size = 3
     seq_len = 2
     batch = samp.sample(batch_size=batch_size, seq_len=seq_len, beta=-1.0)
     self.assertArrayEqual([[0, 1], [0, 1], [1, 2]], samp.indices[0])
     self.assertEqual((3, 2), batch['i'].shape)
     self.assertAllClose([[1, 0.5], [1, 0.5], [0.5, 0.5]],
                         batch['w'],
                         atol=1e-6)
     samp.update(w=[[0.1, 0.2], [1, 0.5], [0.4, 0.5]])
     self.assertAllClose(0.1, samp._min_w)
     self.assertAllClose(1.0, samp._max_w)
     self.assertAllClose(1.9, samp._weight_tree.sum())
Example no. 2
0
 def test_max_and_skip_env(self):
     """Smoke test: MaxAndSkipEnv(skip=4) under a 20-step TimeLimit should
     report done after 4 (skipped) steps."""
     frame_skip = 4
     env = gym.make(TEST_ENV_ID)
     env = TimeLimit(env, 20)
     env = atari.MaxAndSkipEnv(env, skip=frame_skip)
     env.seed(1)
     ub_utils.set_seed(1)
     env.reset()
     step_idx = None
     for step_idx in range(20):
         _obs, _rew, done, _info = env.step(env.action_space.sample())
         if done:
             break
     # 20-step limit / 4-frame skip -> episode ends on the 5th call (index 4)
     self.assertEqual(4, step_idx)
Example no. 3
0
 def test_ppo_train_with_target_kl(self):
     """PPO early stopping: with target_kl set, training must stop before
     consuming the full subepoch / gradient-step budget."""
     n_envs = 3
     target_kl = 0.1
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(0)
     ub_utils.set_seed(0)
     model = ppo_model.PPO(env, target_kl=target_kl)
     n_samples = 10
     batch_size = 10
     n_subepochs = 4
     # gradient steps that would be taken WITHOUT early stopping
     exp_gradsteps = (n_samples * n_envs * n_subepochs) // batch_size
     model.run(n_samples)
     model.train(batch_size, n_subepochs)
     # assertLess reports both values on failure, unlike assertTrue(a > b, msg)
     self.assertLess(model.num_gradsteps, exp_gradsteps)
     self.assertLess(model.num_subepochs, n_subepochs)
Example no. 4
0
 def _init():
     """Build a single wrapped Atari training environment.

     NOTE(review): relies on `a`, `logger`, `ub`, `set_seed` from the
     enclosing scope.
     """
     # Configure logging and seed the global RNGs before creating the env.
     logger.Config.use(
         filename=a.logging,
         level=a.log_level,
         colored=True,
         reset=True,
     )
     set_seed(a.seed)
     # Atari preprocessing stack -- wrapper order matters.
     env = gym.make(a.env_id)
     env = ub.envs.NoopResetEnv(env, noop_max=30)
     env = ub.envs.MaxAndSkipEnv(env, skip=4)
     #env = ub.envs.Monitor(env, root_dir=a.model_dir, prefix=f'{rank}.train', video=True)
     env = ub.envs.EpisodicLifeEnv(env)
     env = ub.envs.FireResetEnv(env)
     env = ub.envs.WarpFrame(env)
     env = ub.envs.ClipRewardEnv(env)
     env = ub.envs.FrameStack(env, 4)
     return env
Example no. 5
0
 def _init():
     """Build a single wrapped PyBullet training environment, optionally
     with video recording.

     NOTE(review): relies on `a`, `rank`, `logger`, `set_seed` and the
     wrapper classes from the enclosing scope.
     """
     logger.Config.use(
         filename=a.logging,
         level=a.log_level,
         colored=True,
         reset=True,
     )
     set_seed(a.seed)
     # side-effect import: presumably registers Bullet env ids with gym
     import pybullet_envs
     env = gym.make(a.env_id)
     env = TimeFeatureWrapper(env)
     env = SeedEnv(env, seed=a.seed + rank)
     if a.record_video:
         video_dir = os.path.join(a.monitor_dir, 'video/')
         env = VideoRecorder(env,
                             video_dir,
                             prefix='train.{}'.format(rank),
                             force=True)
     env = Monitor(env, a.monitor_dir, prefix=str(rank), force=True)
     return env
Example no. 6
0
 def test_dqn_prioritized(self, huber):
     """Prioritized DQN: priorities start uniform (all ones) and are
     updated to non-negative values by a training step."""
     n_envs = 3
     ub_utils.set_seed(1)
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     model = dqn_model.DQN(env, prioritized=True, warmup_steps=5)
     # assertIsInstance reports the actual type on failure,
     # unlike assertTrue(isinstance(...))
     self.assertIsInstance(model.prio_beta, ub_sche.Scheduler)
     self.assertIsInstance(model.sampler, ub_data.PriorSampler)
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
     # before any training every stored transition has weight 1
     res = model.sampler._weight_tree[:n_samples * n_envs]
     exp = np.ones_like(res, dtype=np.float32)
     self.assertArrayEqual(exp, res)
     model._train_step(batch_size)
     # after one step the sampled priorities must have changed
     # and remain non-negative
     res = model.sampler._weight_tree[:n_samples * n_envs]
     self.assertArrayNotEqual(exp, res)
     self.assertTrue(np.all(res >= 0.0))
Example no. 7
0
 def test_ppo_learn(self):
     """End-to-end PPO.learn(): runs training with evaluation, checkpoint
     saving and tensorboard logging, then verifies the model's step/epoch
     counters reach the expected final values."""
     n_envs = 4
     n_steps = 125
     n_subepochs = 2
     n_epochs = 2
     batch_size = 50
     # total environment steps over the whole run
     total_steps = n_envs * n_steps * n_epochs
     # expected gradient steps: (rounded) batches per epoch * subepochs * epochs
     total_gradsteps = (int((n_envs * n_steps)/batch_size+0.5)
                         * n_subepochs * n_epochs)
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     eval_env = FakeContinuousEnv()
     env.seed(1)
     ub_utils.set_seed(1)
     model = ppo_model.PPO(
         env, 
         batch_size=batch_size,
         n_steps=n_steps,
         n_subepochs=n_subepochs,
     )
     with tempfile.TemporaryDirectory() as tempdir:
         save_path = tempdir
         model.learn(
             total_steps,
             log_interval=1,
             eval_env=eval_env,
             eval_interval=1,
             eval_episodes=1,
             eval_max_steps=10,
             save_path=save_path,
             save_interval=1,
             tb_logdir=save_path,
             reset_timesteps=True,
             verbose=3
         )
         # test load weights
         ppo_model.PPO.load(save_path)
         # test model state
         self.assertEqual(total_steps, model.num_timesteps)
         self.assertEqual(n_epochs, model.num_epochs)
         self.assertEqual(n_subepochs*n_epochs, model.num_subepochs)
         self.assertEqual(total_gradsteps, model.num_gradsteps)
         self.assertEqual(1.0, model.progress)
Example no. 8
0
 def test_ppo_save_load(self):
     """Saving then loading a PPO model must restore its config, network
     weights, and optimizer state exactly; both models must then produce
     identical losses when trained on identical batches."""
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     model = ppo_model.PPO(env)
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
     # train for some steps (creates the optimizer state compared below)
     ub_utils.set_seed(2)
     batch = next(iter(model.sampler(batch_size)))
     model._train_model(batch)
     with tempfile.TemporaryDirectory() as tempdir:
         save_path = tempdir
         # save & load model
         model.save(save_path)
         loaded_model = ppo_model.PPO.load(save_path)
     # check model setup
     self.assertTrue(loaded_model.agent is not None)
     self.assertTrue(loaded_model.buffer is not None)
     self.assertTrue(loaded_model.optimizer is not None)
     # check if config is correctly restored
     model_config = model.get_config()
     loaded_config = loaded_model.get_config()
     self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
     for key in model_config:
         self.assertEqual(model_config[key], loaded_config[key])
     # check if all network variables are correctly restored
     self.assertVariables(model.trainable_variables,
                          loaded_model.trainable_variables)
     # test optimizers: replay the same batches through both models with
     # the same seed; losses and resulting parameters must match
     batches = list(model.sampler(batch_size))  # list(...) over [x for x in ...]
     ub_utils.set_seed(1)
     for batch in batches:
         losses1, kl1 = model._train_step(batch)
     ub_utils.set_seed(1)
     for batch in batches:
         losses2, kl2 = loaded_model._train_step(batch)
     # check if losses are matched
     self.assertEqual(set(losses1.keys()), set(losses2.keys()))
     for key in losses1.keys():
         self.assertEqual(losses1[key], losses2[key])
     self.assertAllClose(kl1, kl2)
     # check if vars are same
     self.assertVariables(model.trainable_variables,
                          loaded_model.trainable_variables)
     # check if params of the optimizer are same
     self.assertVariables(model.optimizer.variables(),
                          loaded_model.optimizer.variables())
Example no. 9
0
 def test_ppo_param_order_non_delayed_vs_delayed(self):
     """Variable creation order must be identical between a PPO model
     constructed directly from an env and one constructed without an env
     (delayed setup via explicit spaces + setup())."""
     n_envs = 3
     envs = [FakeContinuousEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     obs_space = env.observation_space
     act_space = env.action_space
     model = ppo_model.PPO(env)
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
     # train for some steps
     batch = next(iter(model.sampler(batch_size)))
     ub_utils.set_seed(1)
     model._train_model(batch)
     # delayed: build from spaces only, then set up networks explicitly
     model2 = ppo_model.PPO(None, observation_space=obs_space,
                                  action_space=act_space)
     model2.setup()
     ub_utils.set_seed(1)
     model2._train_model(batch)
     # check trainable variables order
     self.assertVariables(model.trainable_variables,
                         model2.trainable_variables)
     # check optimizer variables order
     self.assertVariables(model.optimizer.variables(),
                         model2.optimizer.variables())
Example no. 10
0
 def test_ppo_gae(self):
     """GAE advantages computed by the model must match the legacy
     reference implementation on the same trajectories."""
     n_envs = 2
     gamma = 0.99
     lam = 0.95
     envs = [FakeImageEnv(max_steps=10) for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     n_samples = 20
     model = ppo_model.PPO(env, gamma=gamma, gae_lambda=lam)
     model.collect(n_samples)
     # reference advantages from the buffered rollout
     data = model.buffer.data
     exp_gae = legacy_gae(
         rew=data['rew'],
         val=data['val'],
         done=data['done'],
         gamma=gamma,
         lam=lam,
     )
     # re-seed the env so run() sees the same trajectories
     env.seed(1)
     model.run(n_samples)
     gae = model.buffer.data['adv']
     self.assertAllClose(exp_gae, gae)
Example no. 11
0
 def test_dqn_save_load(self):
     """Saving then loading a DQN model must restore its config, network
     weights, and optimizer state exactly; both models must then produce
     identical losses when trained on identical batches.

     Fix: re-seed before replaying the batches on the loaded model, as the
     PPO save/load test does; otherwise any stochastic op during training
     would desynchronize the two runs being compared.
     """
     n_envs = 3
     envs = [FakeImageEnv() for _ in range(n_envs)]
     env = ub_vec.VecEnv(envs)
     env.seed(1)
     ub_utils.set_seed(1)
     model = dqn_model.DQN(env, warmup_steps=5)
     n_samples = 10
     batch_size = 10
     model.run(n_samples)
     # train for some steps (creates the optimizer state compared below)
     ub_utils.set_seed(2)
     batch = model.sampler(batch_size)
     batch['next_obs'] = model.sampler.rel[1]['obs']
     model._train_model(batch)
     with tempfile.TemporaryDirectory() as tempdir:
         save_path = tempdir
         # save & load model
         model.save(save_path)
         loaded_model = dqn_model.DQN.load(save_path)
     # check model setup
     self.assertTrue(loaded_model.agent is not None)
     self.assertTrue(loaded_model.buffer is not None)
     self.assertTrue(loaded_model.optimizer is not None)
     # check if config is correctly restored
     model_config = model.get_config()
     loaded_config = loaded_model.get_config()
     self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
     for key in model_config:
         self.assertEqual(model_config[key], loaded_config[key], key)
     # check if all network variables are correctly restored
     self.assertVariables(model.trainable_variables,
                          loaded_model.trainable_variables)
     # test optimizers: replay identical batches through both models
     batches = []
     for i in range(3):
         batch = model.sampler(batch_size)
         batch['next_obs'] = model.sampler.rel[1]['obs']
         batches.append(batch)
     ub_utils.set_seed(1)
     for batch in batches:
         losses1, td1 = model._train_model(batch)
     # NOTE(fix): reset the seed before the loaded model's replay, matching
     # test_ppo_save_load, so both training loops consume identical RNG state
     ub_utils.set_seed(1)
     for batch in batches:
         losses2, td2 = loaded_model._train_model(batch)
     # check if losses are matched
     self.assertEqual(set(losses1.keys()), set(losses2.keys()))
     for key in losses1.keys():
         self.assertEqual(losses1[key], losses2[key])
     self.assertAllClose(td1, td2)
     # check if vars are same
     self.assertVariables(model.trainable_variables,
                          loaded_model.trainable_variables)
     # check if params of the optimizers are same
     self.assertVariables(model.optimizer.variables(),
                          loaded_model.optimizer.variables())
Example no. 12
0
 def test_dqn_sample_nstep_batch(self):
     """Multi-step (n=2) sampling: the batch reward folds in the next
     step's discounted reward, and next_obs comes from two steps ahead."""
     ub_utils.set_seed(1)
     n_envs = 3
     gamma = 0.99
     multi_step = 2
     env = ub_vec.VecEnv([FakeImageEnv() for _ in range(n_envs)])
     model = dqn_model.DQN(env, multi_step=multi_step, gamma=gamma)
     n_samples = 10
     batch_size = 1
     model.run(n_samples)
     sampler = model.sampler
     batch = model._sample_nstep_batch(batch_size)
     # obs/act come from the sampled timestep itself
     base = sampler.rel[0]
     self.assertArrayEqual(base['obs'], batch['obs'])
     self.assertArrayEqual(base['act'], batch['act'])
     # reward is accumulated across steps, so it must differ from the raw one
     self.assertArrayNotEqual(base['rew'], batch['rew'])
     step1 = sampler.rel[1]
     # depends on random seed
     self.assertAllClose(base['rew'] + gamma * step1['rew'],
                         batch['rew'],
                         atol=1e-6)
     # next_obs is taken multi_step (=2) steps ahead
     step2 = sampler.rel[2]
     self.assertArrayEqual(step2['obs'], batch['next_obs'])
Example no. 13
0
def set_test_seed():
    """Reset the global random seed to the module-level TEST_SEED."""
    utils.set_seed(TEST_SEED)
Example no. 14
0
 def test_permute_sampler_with_dynamic_buffer(self):
     """PermuteSampler over a DynamicBuffer: sampling before make() raises,
     and permutation sampling covers every element with bounded repeats.

     Fix: three occurrences of ``self.assertTrue(len(buf), len(unique))``
     always passed, because assertTrue treats the second argument as the
     failure *message* and a non-empty buffer length is truthy. The
     surrounding comments ("check if contains all elements") show the
     intent was an equality check -- use assertEqual.
     """
     n_samples = 10
     batch_ = 2
     buf = ub_data.DynamicBuffer(batch_)
     samp = ub_data.PermuteSampler(buf)
     for i in range(n_samples):
         buf.add(a=[i, i + n_samples])
     self.assertEqual(n_samples * 2, len(buf))
     self.assertFalse(buf.ready_for_sample)
     self.assertFalse(buf.isfull)
     # test sample (batch=None)
     with self.assertRaises(RuntimeError):
         # buffer is not ready for sampling
         samp()
     buf.make()
     # test sample (batch=None)
     batch_size = len(buf)
     batches = []
     indices = []
     ub_utils.set_seed(2)
     for batch in samp():
         self.assertArrayEqual((batch_size, ), batch['a'].shape)
         self.assertArrayEqual(buf[samp.indices]['a'], batch['a'])
         batches.append(batch)
         indices.append(
             np.ravel_multi_index(samp.indices,
                                  (buf.len_slots(), buf.batch)))
     self.assertEqual(1, len(batches))
     unique, counts = np.unique(indices, return_counts=True)
     # check if contains all elements (was assertTrue -> always passed)
     self.assertEqual(len(buf), len(unique))
     # check if all elements are sampled at least once
     self.assertTrue(np.all(counts == 1))
     # test sample (batch=3)
     batch_size = 3
     batches = []
     indices = []
     for batch in samp(batch_size=batch_size):
         self.assertArrayEqual((batch_size, ), batch['a'].shape)
         self.assertArrayEqual(buf[samp.indices]['a'], batch['a'])
         batches.append(batch)
         indices.append(
             np.ravel_multi_index(samp.indices,
                                  (buf.len_slots(), buf.batch)))
     self.assertEqual(7, len(batches))  # total samples == capacity
     unique, counts = np.unique(indices, return_counts=True)
     # check if contains all elements (was assertTrue -> always passed)
     self.assertEqual(len(buf), len(unique))
     # check if all elements are sampled at least once but less than 2
     self.assertTrue(np.all(counts >= 1))
     self.assertTrue(np.all(counts <= 2))
     # test sample (batch=3, seq_len=2)
     batch_size = 3
     seq_len = 2
     batches = []
     indices = []
     ub_utils.set_seed(2)
     for batch in samp(batch_size=batch_size, seq_len=seq_len):
         self.assertArrayEqual((batch_size, seq_len), batch['a'].shape)
         self.assertArrayEqual(buf[samp.indices]['a'], batch['a'])
         batches.append(batch)
         indices.append(
             np.ravel_multi_index(samp.indices,
                                  (buf.len_slots(), buf.batch)))
     self.assertEqual(6, len(batches))  # total samples == capacity
     unique, counts = np.unique(indices, return_counts=True)
     # check if contains all elements (was assertTrue -> always passed)
     self.assertEqual(len(buf), len(unique))
     # check if all elements are sampled at least once but less than 3
     self.assertTrue(np.all(counts >= 1))
     self.assertTrue(np.all(counts <= 3))
Example no. 15
0
    LOG.add_row('Verbose', a.verbose)
    LOG.add_line()
    LOG.add_row('Shared network', a.shared_net)
    LOG.add_row('Force MLP', a.force_mlp)
    LOG.add_row('Learning rate', a.lr)
    LOG.add_row('Gamma', a.gamma)
    LOG.add_row('Lambda', a.gae_lambda)
    LOG.add_row('Clip range', a.clip_range)
    LOG.add_row('Value clip range', a.clip_range_vf)
    LOG.add_row('Entropy coef', a.ent_coef)
    LOG.add_row('Value coef', a.vf_coef)
    LOG.add_row('Max gradient norm', a.max_grad_norm)
    LOG.add_row('Target KL', a.target_kl)
    LOG.flush('WARNING')

    set_seed(a.seed)

    # === Make envs ===

    if 'NoFrameskip' in a.env_id:
        # Atari env
        env = make_atari(a, eval=False)
        eval_env = make_atari(a, eval=True)
    else:
        # Pybullet env
        env = make_env(a, eval=False)
        eval_env = make_env(a, eval=True)

    env.seed(a.seed)
    eval_env.seed(a.eval_seed)
    LOG.debug('Action space: {}'.format(env.action_space))
Example no. 16
0
 def test_set_seed(self):
     """Seeding twice with the same value must reproduce identical draws."""
     utils.set_seed(1)
     first_draw = np.random.normal(size=(3,))
     utils.set_seed(1)
     second_draw = np.random.normal(size=(3,))
     self.assertArrayEqual(first_draw, second_draw)