def test_basic(self):
    """Saving then loading transitions reproduces the stored data.

    One buffer is saved to disk; two fresh buffers load the file (the
    second through the filename helper ``v``) and must hold identical
    transitions.
    """
    size = 4
    spec = {"a": {}}
    source, loaded, loaded_v = (ReplayBuffer(size, spec) for _ in range(3))

    source.add(a=[1, 2, 3, 4])

    fname = "basic.npz"
    source.save_transitions(fname)
    loaded.load_transitions(fname)
    loaded_v.load_transitions(v(1, fname))

    expected = source.get_all_transitions()["a"]
    for rb in (loaded, loaded_v):
        np.testing.assert_allclose(expected, rb.get_all_transitions()["a"])
class ReplayBuffer:
    """Thin wrapper around cpprb's buffer (``CPPRB``).

    Adds per-transition episode-end bookkeeping on :meth:`add` and refuses
    to sample until a minimum number of transitions has been stored.
    """

    def __init__(self, size, env_dict, n_step_dict=None, min_storage=10000,
                 done_string="done"):
        """
        Args:
            size: capacity of the underlying buffer.
            env_dict: per-key environment specification passed to CPPRB.
            n_step_dict: optional N-step configuration (CPPRB ``Nstep``).
            min_storage: minimum stored transitions before sampling is allowed.
            done_string: key in each transition dict that flags episode end.
        """
        super().__init__()
        self.done_string = done_string
        self.min_storage = min_storage
        cpprb_args = {
            "size": size,
            "env_dict": env_dict,
            "Nstep": n_step_dict,
        }
        self.buffer = CPPRB(**cpprb_args)

    def add(self, data: Sequence[Dict[str, np.ndarray]]) -> None:
        """Add transitions one by one, flushing the buffer at episode ends."""
        for d in data:
            self.buffer.add(**d)
            if d[self.done_string]:
                self.buffer.on_episode_end()

    def sample(self, size: int) -> Dict[str, np.ndarray]:
        """Sample ``size`` transitions, or return None if storage is too small."""
        if self.buffer.get_stored_size() < self.min_storage:
            # FIX: the original message concatenated two f-strings without a
            # separating space ("...storagesize 10000...") and misspelled
            # "minimum".
            print(
                f"stored sample {self.buffer.get_stored_size()} is smaller "
                f"than minimum storage size {self.min_storage}. Returning None."
            )
            return None
        return self.buffer.sample(size)
def explorer(global_rb, env_dict, is_training_done, queue):
    """Collect CartPole transitions locally and push batches with priorities.

    Runs until ``is_training_done`` is set.  Pulls fresh model weights from
    ``queue`` whenever available; each time the local buffer fills, its
    contents are moved into ``global_rb`` with absolute-TD-error priorities.
    """
    local_capacity = int(1e+2)
    local_rb = ReplayBuffer(local_capacity, env_dict)

    model = MyModel()
    env = gym.make("CartPole-v1")
    obs = env.reset()

    while not is_training_done.is_set():
        if not queue.empty():
            model.weights = queue.get()

        action = model.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        local_rb.add(obs=obs, act=action, rew=reward,
                     next_obs=next_obs, done=done)

        if done:
            local_rb.on_episode_end()
            obs = env.reset()
        else:
            obs = next_obs

        if local_rb.get_stored_size() == local_capacity:
            batch = local_rb.get_all_transitions()
            local_rb.clear()
            priorities = model.abs_TD_error(batch)
            global_rb.add(**batch, priorities=priorities)
def test_buffer(self):
    """Fill a buffer past capacity, encode and sample, then clear it."""
    capacity = 256
    obs_shape = (15, 15)
    act_dim = 5
    steps = 512

    erb = ReplayBuffer(capacity,
                       {"obs": {"shape": obs_shape},
                        "act": {"shape": act_dim},
                        "rew": {},
                        "next_obs": {"shape": obs_shape},
                        "done": {}})

    # Write twice the capacity so the ring buffer wraps around.
    for i in range(steps):
        obs = np.full(obs_shape, i, dtype=np.double)
        erb.add(obs=obs,
                act=np.full(act_dim, i, dtype=np.double),
                rew=i,
                next_obs=obs + 1,
                done=0)

    es = erb._encode_sample(range(capacity))
    erb.sample(32)

    erb.clear()
    self.assertEqual(erb.get_next_index(), 0)
    self.assertEqual(erb.get_stored_size(), 0)
class buffer_class:
    """Replay-memory facade backed by cpprb's ReplayBuffer.

    The observation entries are overridden to flat 17-dim float32 vectors
    regardless of what ``create_env_dict`` infers from the environment.
    """

    def __init__(self, max_length, seed_number, env):
        """
        Args:
            max_length: buffer capacity.
            seed_number: kept for interface compatibility (unused here).
            env: gym-style environment used to derive the buffer layout.
        """
        env_dict = create_env_dict(env)
        # Override the observation length in the replay memory.
        # NOTE(review): 17 is hard-coded for this particular environment;
        # confirm it matches env.observation_space.
        env_dict['obs'] = {"dtype": numpy.float32, "shape": (17, )}
        env_dict['next_obs'] = {"dtype": numpy.float32, "shape": (17, )}
        # FIX: removed leftover debug output print('!!!!', env_dict['obs']).
        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)

    def append(self, s, a, r, done, sp):
        """Store one transition (state, action, reward, done, next state)."""
        self.storage.add(
            **self.before_add(obs=s, act=a, rew=r, done=done, next_obs=sp))

    def sample(self, batch_size):
        """Return a uniformly sampled batch as (s, a, r, done, sp) matrices."""
        batch = self.storage.sample(batch_size)
        return (batch['obs'], batch['act'], batch['rew'],
                batch['done'], batch['next_obs'])

    def __len__(self):
        return self.storage.get_stored_size()
def test_stack_compress(self):
    """Stacked frames stored via stack_compress round-trip correctly,
    both before and after an episode-end cache flush."""
    bsize = 10
    odim = 2
    ssize = 2

    rb = ReplayBuffer(bsize, {"a": {"shape": (odim, ssize)}},
                      stack_compress="a")
    frames = np.random.rand(odim, bsize + ssize - 1)

    def fill():
        for j in range(bsize):
            rb.add(a=frames[:, j:j + ssize])

    def verify():
        stored = rb.get_all_transitions()["a"]
        for j in range(bsize):
            with self.subTest(i=j, label="without cache"):
                np.testing.assert_allclose(stored[j], frames[:, j:j + ssize])
        for j in range(bsize):
            rb._encode_sample([j])

    fill()
    verify()

    # Refill after clearing, this time flushing the cache at episode end.
    rb.clear()
    fill()
    rb.on_episode_end()
    verify()
def test_with_one(self):
    """A single added transition comes back with a leading batch axis of 1."""
    rb = ReplayBuffer(32,
                      {"obs": {"shape": 3},
                       "act": {"shape": 4},
                       "done": {}})

    transition = {"obs": np.ones(shape=3),
                  "act": np.zeros(shape=4),
                  "done": 0}
    rb.add(**transition)

    tx = rb.get_all_transitions()
    for key, value in transition.items():
        with self.subTest(key=key):
            np.testing.assert_allclose(tx[key],
                                       np.asarray(value).reshape((1, -1)))
class Agent:
    """DQN-style agent combining a (noisy) Q-network with a cpprb replay buffer."""

    def __init__(self, learn_rate, state_shape, num_actions, batch_size):
        """
        Args:
            learn_rate: optimizer learning rate passed to Network.
            state_shape: observation shape used by buffer and network.
            num_actions: size of the discrete action space.
            batch_size: minibatch size drawn in :meth:`learn`.
        """
        self.mem_size=100000
        self.gamma = 0.99  # discount factor
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size
        # Linear schedule 1.0 -> 0.01 over 2000 steps.
        # NOTE(review): epsilon is never consulted in choose_action below;
        # exploration presumably comes from the network's noise
        # (see reset_noise in learn) - confirm.
        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)
        self.memory = ReplayBuffer(
            self.mem_size, {
                "obs": {
                    "shape": state_shape
                },
                "act": {
                    "shape": 1
                },
                "rew": {
                },
                "next_obs": {
                    "shape": state_shape
                },
                "done": {
                    "shape": 1
                }})
        self.net = Network(learn_rate, state_shape, num_actions)

    def choose_action(self, observation):
        """Return the greedy action (argmax Q) for a single observation."""
        state = torch.tensor(observation).float().detach()
        state = state.to(self.net.device)
        state = state.unsqueeze(0)  # add batch dimension
        q_values = self.net(state)
        action = torch.argmax(q_values).item()
        return action

    def store_memory(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.add(obs=state, act=action, rew=reward,
                        next_obs=next_state, done=done)

    def learn(self):
        """One DQN update on a sampled minibatch; no-op until enough data."""
        if self.memory.get_stored_size() < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)

        # cpprb returns (batch, 1)-shaped columns; .T[0] flattens to (batch,).
        states = torch.tensor( batch["obs"] ).to(self.net.device)
        actions = torch.tensor( batch["act"],
                               dtype=torch.int64).to(self.net.device).T[0]
        rewards = torch.tensor( batch["rew"] ).to(self.net.device).T[0]
        states_ = torch.tensor( batch["next_obs"] ).to(self.net.device)
        dones = torch.tensor( batch["done"],
                             dtype=torch.bool ).to(self.net.device).T[0]

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        # Q(s, a) for the actions actually taken.
        q_values = self.net(states)[batch_index, actions]
        q_values_ = self.net(states_)

        # Max over next-state actions; zero out terminal states.
        # NOTE(review): the target uses self.net (no separate target
        # network) - confirm this is intended.
        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = (td ** 2.0).mean()  # MSE TD loss
        loss.backward()
        self.net.optimizer.step()

        self.net.reset_noise()
def test_memmap(self):
    """Constructing with mmap_prefix creates the backing .dat file on disk."""
    buffer = ReplayBuffer(32, {"done": {}}, mmap_prefix="mmap")
    step = 0
    while step < 1000:
        buffer.add(done=0.0)
        step += 1
    self.assertTrue(os.path.exists("mmap_done.dat"))
def test_nstep_with_memory_compress(self):
    """With a 4-step Nstep buffer, ``add`` returns None until the N-step
    queue is full, then reports the first stored index (0)."""
    rb = ReplayBuffer(32,
                      {"obs": {"shape": (16, 16)}, 'rew': {}, 'done': {}},
                      next_of="obs",
                      stack_compress="obs",
                      Nstep={"size": 4, "rew": "rew"})

    def step():
        return rb.add(obs=(np.ones((16, 16))),
                      next_obs=(np.ones((16, 16))),
                      rew=1,
                      done=0)

    # The first three additions are absorbed by the N-step queue.
    for _ in range(3):
        self.assertIs(step(), None)
    # The fourth completes an N-step transition stored at index 0.
    self.assertEqual(step(), 0)
def test_Nstep_incompatibility(self):
    """Loading transitions saved from an Nstep buffer into a plain buffer
    must raise ValueError."""
    size = 10
    spec = {"done": {}}

    src = ReplayBuffer(size, spec, Nstep={"size": 3, "gamma": 0.99})
    plain1 = ReplayBuffer(size, spec)
    plain2 = ReplayBuffer(size, spec)

    src.add(done=[0, 0, 0, 0, 1])
    src.on_episode_end()

    fname = "Nstep_raise.npz"
    src.save_transitions(fname)

    with self.assertRaises(ValueError):
        plain1.load_transitions(fname)
    with self.assertRaises(ValueError):
        plain2.load_transitions(v(1, fname))
def test_unsafe_next_of_stack_compress(self):
    """Unsafe-saved next_of + stack_compress transitions load back intact."""
    size = 10
    spec = {"a": {"shape": 3}}

    src = ReplayBuffer(size, spec, next_of="a", stack_compress="a")
    dst1 = ReplayBuffer(size, spec, next_of="a", stack_compress="a")
    dst2 = ReplayBuffer(size, spec, next_of="a", stack_compress="a")

    # Sliding window [i, i+1, i+2] for i = 1..6.
    seq = [[i, i + 1, i + 2] for i in range(1, 7)]
    src.add(a=seq[:-1], next_a=seq[1:])

    fname = "unsafe_next_of_stack_compress.npz"
    src.save_transitions(fname, safe=False)
    dst1.load_transitions(fname)
    dst2.load_transitions(v(1, fname))

    t1 = src.get_all_transitions()
    for rb in (dst1, dst2):
        t = rb.get_all_transitions()
        np.testing.assert_allclose(t1["a"], t["a"])
        np.testing.assert_allclose(t1["next_a"], t["next_a"])
def test_stack_compress(self):
    """Transitions saved from a stack_compress buffer load back intact."""
    size = 10
    spec = {"a": {"shape": 3}}

    src = ReplayBuffer(size, spec, stack_compress="a")
    dst1 = ReplayBuffer(size, spec, stack_compress="a")
    dst2 = ReplayBuffer(size, spec, stack_compress="a")

    src.add(a=[[i, i + 1, i + 2] for i in range(1, 5)])

    fname = "stack_compress.npz"
    src.save_transitions(fname)
    dst1.load_transitions(fname)
    dst2.load_transitions(v(1, fname))

    expected = src.get_all_transitions()["a"]
    np.testing.assert_allclose(expected, dst1.get_all_transitions()["a"])
    np.testing.assert_allclose(expected, dst2.get_all_transitions()["a"])
def test_incompatible_unsafe_stack_compress(self):
    """Unsafe-saved stack_compress transitions load into plain buffers."""
    size = 10
    spec = {"a": {"shape": 3}}

    src = ReplayBuffer(size, spec, stack_compress="a")
    plain1 = ReplayBuffer(size, spec)
    plain2 = ReplayBuffer(size, spec)

    src.add(a=[[i, i + 1, i + 2] for i in range(1, 5)])

    fname = "incompatible_unsafe_stack_compress.npz"
    src.save_transitions(fname, safe=False)
    # Both destinations load the plain file (no versioned-name variant here).
    plain1.load_transitions(fname)
    plain2.load_transitions(fname)

    expected = src.get_all_transitions()["a"]
    np.testing.assert_allclose(expected, plain1.get_all_transitions()["a"])
    np.testing.assert_allclose(expected, plain2.get_all_transitions()["a"])
def test_incompatible_unsafe_next_of(self):
    """Unsafe-saved next_of transitions load into buffers that keep
    next_a as an explicit key instead of using next_of."""
    size = 10
    src = ReplayBuffer(size, {"a": {}}, next_of="a")
    dst1 = ReplayBuffer(size, {"a": {}, "next_a": {}})
    dst2 = ReplayBuffer(size, {"a": {}, "next_a": {}})

    seq = list(range(1, 7))
    src.add(a=seq[:-1], next_a=seq[1:])

    fname = "unsafe_incompatible_next_of.npz"
    src.save_transitions(fname, safe=False)
    dst1.load_transitions(fname)
    dst2.load_transitions(v(1, fname))

    t1 = src.get_all_transitions()
    for rb in (dst1, dst2):
        t = rb.get_all_transitions()
        np.testing.assert_allclose(t1["a"], t["a"])
        np.testing.assert_allclose(t1["next_a"], t["next_a"])
def test_fulled_unsafe_next_of(self):
    """Unsafe-saved transitions from an already-full ring buffer load back."""
    size = 10
    spec = {"a": {}}

    src = ReplayBuffer(size, spec, next_of="a")
    dst1 = ReplayBuffer(size, spec, next_of="a")
    dst2 = ReplayBuffer(size, spec, next_of="a")

    # 12 transitions into a size-10 buffer: the ring wraps around.
    seq = list(range(1, 14))
    src.add(a=seq[:-1], next_a=seq[1:])

    fname = "fulled_unsafe_next_of.npz"
    src.save_transitions(fname, safe=False)
    dst1.load_transitions(fname)
    dst2.load_transitions(v(1, fname))

    t1 = src.get_all_transitions()
    for rb in (dst1, dst2):
        t = rb.get_all_transitions()
        np.testing.assert_allclose(t1["a"], t["a"])
        np.testing.assert_allclose(t1["next_a"], t["next_a"])
def test_train(self):
    """Smoke-test DQN training on CPU.

    Fills a replay buffer with 100 environment steps, then runs 100
    training iterations on sampled minibatches.
    """
    agent = DQN(
        state_shape=self.env.observation_space.shape,
        action_dim=self.env.action_space.n,
        memory_capacity=100,
        gpu=-1)  # gpu=-1: force CPU
    from cpprb import ReplayBuffer
    # NOTE(review): modern cpprb.ReplayBuffer takes (size, env_dict); the
    # obs_dim/act_dim/size keyword signature used here matches an older
    # cpprb API - confirm against the pinned cpprb version.
    replay_buffer = ReplayBuffer(
        obs_dim=self.env.observation_space.shape,
        act_dim=1,
        size=agent.memory_capacity)

    obs = self.env.reset()
    # Collect 100 transitions with the (untrained) agent's policy.
    for _ in range(100):
        action = agent.get_action(obs)
        next_obs, reward, done, _ = self.env.step(action)
        replay_buffer.add(obs=obs, act=action,
                          next_obs=next_obs, rew=reward, done=done)
        if done:
            next_obs = self.env.reset()
        obs = next_obs

    # Train on 100 sampled batches; done flags are cast to float64 for
    # the agent's loss computation.
    for _ in range(100):
        samples = replay_buffer.sample(agent.batch_size)
        agent.train(samples["obs"], samples["act"], samples["next_obs"],
                    samples["rew"],
                    np.array(samples["done"], dtype=np.float64))
def test_next_of(self):
    """Safe-mode save of a next_of buffer.

    In safe mode the loading buffers do not need next_of; they can store
    next_a as a plain key.
    """
    size = 10
    src = ReplayBuffer(size, {"a": {}}, next_of="a")
    dst1 = ReplayBuffer(size, {"a": {}, "next_a": {}})
    dst2 = ReplayBuffer(size, {"a": {}, "next_a": {}})

    seq = list(range(1, 7))
    src.add(a=seq[:-1], next_a=seq[1:])

    fname = "next_of.npz"
    src.save_transitions(fname)
    dst1.load_transitions(fname)
    dst2.load_transitions(v(1, fname))

    t1 = src.get_all_transitions()
    for rb in (dst1, dst2):
        t = rb.get_all_transitions()
        np.testing.assert_allclose(t1["a"], t["a"])
        np.testing.assert_allclose(t1["next_a"], t["next_a"])
def test_load_Nstep(self):
    """Transitions saved from an Nstep buffer load into matching Nstep buffers."""
    size = 10
    spec = {"done": {}}
    nstep = {"size": 3, "gamma": 0.99}

    src = ReplayBuffer(size, spec, Nstep=nstep)
    dst1 = ReplayBuffer(size, spec, Nstep=nstep)
    dst2 = ReplayBuffer(size, spec, Nstep=nstep)

    src.add(done=[0, 0, 0, 0, 1])
    src.on_episode_end()

    fname = "Nstep.npz"
    src.save_transitions(fname)
    dst1.load_transitions(fname)
    dst2.load_transitions(v(1, fname))

    expected = src.get_all_transitions()["done"]
    np.testing.assert_allclose(expected, dst1.get_all_transitions()["done"])
    np.testing.assert_allclose(expected, dst2.get_all_transitions()["done"])
def test(self):
    """Adding more transitions than capacity still allows sampling."""
    obs_dim = 3
    act_dim = 1
    rb = ReplayBuffer(
        256,
        {"obs": {"shape": obs_dim},
         "act": {"shape": act_dim},
         "rew": {},
         "next_obs": {"shape": obs_dim},
         "done": {}})

    transition = dict(obs=np.ones(shape=(obs_dim)),
                      act=np.ones(shape=(act_dim)),
                      rew=0,
                      next_obs=np.ones(shape=(obs_dim)),
                      done=0)
    # 500 adds into a 256-slot buffer: the ring wraps.
    for _ in range(500):
        rb.add(**transition)

    sample = rb.sample(32)
def test_has_next_of(self):
    """next_of="a" exposes a next_a series shifted one step ahead,
    both before and after an episode-end cache flush."""
    bsize = 10
    rb = ReplayBuffer(bsize, {"a": {}}, next_of="a")
    series = np.random.rand(bsize + 1)

    def fill():
        for j in range(bsize):
            rb.add(a=series[j], next_a=series[j + 1])

    def verify():
        got = np.ravel(rb.get_all_transitions()["next_a"])
        np.testing.assert_allclose(got, series[1:bsize + 1])
        for j in range(bsize):
            rb._encode_sample([j])

    fill()
    verify()

    rb.clear()
    fill()
    rb.on_episode_end()
    verify()
def test_nstep(self):
    """A 4-step Nstep buffer returns None for the first three adds, then
    the stored index 0 on the fourth."""
    rb = ReplayBuffer(32, {'rew': {}, 'done': {}},
                      Nstep={"size": 4, "rew": "rew"})
    for _ in range(3):
        self.assertIs(rb.add(rew=1, done=0), None)
    self.assertEqual(rb.add(rew=1, done=0), 0)
def test_multistep_add(self):
    """A jax array holding several transitions can be added repeatedly."""
    rb = ReplayBuffer(4, {"done": {}})
    batch = jnp.asarray([1, 1, 1])
    for attempt in range(2):
        with self.subTest(i=attempt):
            rb.add(done=batch)
def test_python_type(self):
    """Builtin Python types are accepted as dtype and preserved on read."""
    for dtype in (bool, int, float):
        with self.subTest(type=dtype):
            rb = ReplayBuffer(10, {"a": {"dtype": dtype}})
            rb.add(a=dtype(1))
            self.assertEqual(rb.get_all_transitions()["a"].dtype, dtype)
def test_add(self):
    """A scalar jax array can be added repeatedly, past buffer capacity."""
    rb = ReplayBuffer(4, {"done": {}})
    flag = jnp.asarray(1)
    for step in range(5):
        with self.subTest(i=step):
            rb.add(done=flag)
def set_replay_buffer(self, env, get_from_file):
    """Populate ``self.replay_buffer`` with expert trajectories.

    Either loads previously pickled cpprb samples from disk
    (``get_from_file=True``) or replays a saved policy in ``env`` to
    generate fresh expert data.

    Args:
        env: gym-style environment providing observation/action spaces.
        get_from_file: if True, load pickled demos; otherwise run the
            stored policy to collect them.
    """
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape
    if get_from_file:
        print(colorize("Pulling saved expert %s trajectories from file over %d episodes" % (self.config_name, self.expert_episodes), 'blue', bold=True))
        # NOTE(review): pickle.load is unsafe on untrusted files; acceptable
        # only because these demos are generated locally.
        f = open(self._demo_dir + 'sim_data_' + str(self.expert_episodes) + '_buffer.pkl', "rb")
        buffer_file = pickle.load(f)
        f.close()
        data = samples_from_cpprb(npsamples=buffer_file)

        # Reconstruct the data, then pass it to replay buffer
        np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

        # Create environment
        before_add = create_before_add_func(env)
        replay_buffer = ReplayBuffer(size= self.replay_buffer_size,
                                     env_dict={
                                         "obs": {"shape": obs_dim},
                                         "act": {"shape": act_dim},
                                         "rew": {},
                                         "next_obs": {"shape": obs_dim},
                                         "done": {}})
        # ``~np_dones`` masks out terminal rows so every stored transition
        # has a valid successor state.
        replay_buffer.add(**before_add(obs=np_states[~np_dones],
                                       act=np_actions[~np_dones],
                                       rew=np_rewards[~np_dones],
                                       next_obs=np_next_states[~np_dones],
                                       done=np_next_dones[~np_dones]))
        self.replay_buffer = replay_buffer
    else:
        # Generate expert data
        print(colorize(
            "Generating expert %s trajectories from file over %d episodes" % (self.config_name, self.expert_episodes),
            'blue', bold=True))

        # Load trained policy
        _, get_action = load_policy_and_env(osp.join(self._root_data_path, self.file_name, self.file_name + '_s0/'), 'last', False)

        # run_policy both records the rollouts and returns a filled buffer.
        expert_rb = run_policy(env,
                               get_action,
                               0,
                               self.expert_episodes,
                               False,
                               record=not get_from_file,
                               record_name='expert_' + self.file_name + '_' + str(self.expert_episodes) + '_runs',
                               record_project='clone_benchmarking_' + self.config_name,
                               data_path= self._expert_path,
                               config_name= self.config_name,
                               max_len_rb=self.replay_buffer_size)
        self.replay_buffer = expert_rb
def test_cache_next_of(self):
    """next_of + stack_compress caching survives episode boundaries.

    Verifies stored obs/next_obs after a first episode, after the
    episode-end cache flush, and after a second episode is appended.
    """
    stack_size = 3
    episode_len = 5
    # FIX: ``np.int`` was deprecated in NumPy 1.20 and removed in 1.24; it
    # was always just an alias for the builtin ``int``.
    rb = ReplayBuffer(32, {"obs": {"shape": (stack_size), "dtype": int}},
                      next_of="obs", stack_compress="obs")

    obs = np.arange(episode_len + stack_size + 2, dtype=int)
    # [0, 1, ..., episode_len + stack_size + 1]
    obs2 = obs + 3 * episode_len
    # [3*episode_len, ..., 4*episode_len + stack_size + 1]

    def verify_episode(s, base, row0):
        # Rows [row0, row0+episode_len) must equal sliding windows of base.
        for i in range(episode_len):
            with self.subTest(i=i + row0):
                np.testing.assert_equal(s["obs"][i + row0],
                                        base[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i + row0],
                                        base[i + 1:i + 1 + stack_size])

    # Add 1st episode
    for i in range(episode_len):
        rb.add(obs=obs[i:i + stack_size],
               next_obs=obs[i + 1:i + 1 + stack_size])
    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len)
    verify_episode(s, obs, 0)

    # Reset environment: flushing the cache must not corrupt stored data.
    rb.on_episode_end()
    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len)
    verify_episode(s, obs, 0)

    # Add 2nd episode: both episodes must read back correctly.
    for i in range(episode_len):
        rb.add(obs=obs2[i:i + stack_size],
               next_obs=obs2[i + 1:i + 1 + stack_size])
    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), 2 * episode_len)
    verify_episode(s, obs, 0)
    verify_episode(s, obs2, episode_len)
def test_nstep_multistep_add(self):
    """Multi-step batches can be added repeatedly to an Nstep buffer.

    FIX: the method was named ``tet_nstep_multistep_add``; without the
    ``test_`` prefix, unittest discovery never ran it.  Renamed so the
    test actually executes.
    """
    rb = ReplayBuffer(6,
                      {"obs": {}, "rew": {}, "done": {}, "next_obs": {}},
                      Nstep={"size": 4, "rew": "rew", "next": "next_obs"})
    obs = jnp.asarray([1, 1, 1, 1])
    rew = jnp.asarray([1, 1, 1, 1])
    done = jnp.asarray([1, 1, 1, 1])
    next_obs = jnp.asarray([1, 1, 1, 1])
    # 7 batches of 4 into a size-6 buffer: exercises wrap-around too.
    for i in range(7):
        with self.subTest(i=i):
            rb.add(obs=obs, rew=rew, done=done, next_obs=next_obs)
def set_multiple_replay_buffers(self, env):
    """Build one replay buffer per expert in ``self.config_name_list``.

    Each expert's pickled demos are loaded, observations are extended with
    a one-hot expert-id vector, and the filled buffers are stored in
    ``self.rb_list`` (one per expert, in list order).

    Args:
        env: gym-style environment providing observation/action spaces.
    """
    print(self.config_name_list)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape
    print(colorize("Pulling saved trajectories from two experts ( %s and %s) from files over %d episodes" % (self.config_name_list[0], self.config_name_list[1], self.expert_episodes), 'blue', bold=True))

    rb_list = []
    v = 0  # expert index; becomes the hot position in the one-hot id vector
    for x in self.config_name_list:
        _expert_demo_dir = os.path.join(self._expert_path, x + '_episodes/')
        # NOTE(review): pickle.load is unsafe on untrusted files; acceptable
        # only for locally generated demo data.
        f = open(_expert_demo_dir + 'sim_data_' + str(self.expert_episodes) + '_buffer.pkl', "rb")
        buffer_file = pickle.load(f)
        f.close()
        data = samples_from_cpprb(npsamples=buffer_file)

        # Reconstruct the data, then pass it to replay buffer
        np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

        # Create environment
        before_add = create_before_add_func(env)
        # Observation length grows by 2 for the one-hot expert id
        # (assumes self.n_experts == 2 - TODO confirm).
        replay_buffer = ReplayBuffer(size=self.replay_buffer_size,
                                     env_dict={
                                         "obs": {"shape": tuple([obs_dim[0]+2,])},
                                         "act": {"shape": act_dim},
                                         "rew": {},
                                         "next_obs": {"shape": tuple([obs_dim[0]+2,])},
                                         "done": {}})

        # Concatenate the states with one hot vectors depending on class
        extend1 = [one_hot(np.array([v]), self.n_experts)] * np_states[~np_dones].shape[0]
        appended_states = np.append(np_states[~np_dones], np.c_[extend1], 1)
        appended_next_states = np.append(np_next_states[~np_dones], np.c_[extend1], 1)

        # ``~np_dones`` masks out terminal rows so every stored transition
        # has a valid successor state.
        replay_buffer.add(**before_add(obs=appended_states,
                                       act=np_actions[~np_dones],
                                       rew=np_rewards[~np_dones],
                                       next_obs=appended_next_states,
                                       done=np_next_dones[~np_dones]))
        rb_list.append(replay_buffer)
        v += 1
    self.rb_list = rb_list
def test_shuffle_transitions(self):
    """shuffle=True returns a permutation of the stored transitions."""
    rb = ReplayBuffer(64, {"a": {}})
    values = np.arange(64)
    rb.add(a=values)

    ordered = rb.get_all_transitions()["a"]
    shuffled = rb.get_all_transitions(shuffle=True)["a"]

    # NOTE(review): with 64 distinct values, an identity permutation is
    # astronomically unlikely, but this assertion is still probabilistic.
    self.assertFalse((ordered == shuffled).all())

    # Same multiset of elements: the intersection covers everything.
    common = np.intersect1d(ordered, shuffled, assume_unique=True)
    np.testing.assert_allclose(np.ravel(common), np.ravel(ordered))