def test_memmap(self):
    # Back the buffer storage with memory-mapped files ("mmap_<key>.dat")
    rb = ReplayBuffer(32, {"done": {}}, mmap_prefix="mmap")

    for _ in range(1000):
        rb.add(done=0.0)

    self.assertTrue(os.path.exists("mmap_done.dat"))

def test(self):
    buffer_size = 256
    obs_dim = 3
    act_dim = 1

    rb = ReplayBuffer(buffer_size,
                      {"obs": {"shape": obs_dim},
                       "act": {"shape": act_dim},
                       "rew": {},
                       "next_obs": {"shape": obs_dim},
                       "done": {}})

    obs = np.ones(shape=(obs_dim,))
    act = np.ones(shape=(act_dim,))
    rew = 0
    next_obs = np.ones(shape=(obs_dim,))
    done = 0

    for i in range(500):
        rb.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)

    batch_size = 32
    sample = rb.sample(batch_size)

def explorer(global_rb, env_dict, is_training_done, queue):
    local_buffer_size = int(1e+2)
    local_rb = ReplayBuffer(local_buffer_size, env_dict)

    model = MyModel()
    env = gym.make("CartPole-v1")

    obs = env.reset()
    while not is_training_done.is_set():
        # Sync weights from the learner when available
        if not queue.empty():
            w = queue.get()
            model.weights = w

        action = model.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        local_rb.add(obs=obs, act=action, rew=reward,
                     next_obs=next_obs, done=done)

        if done:
            local_rb.on_episode_end()
            obs = env.reset()
        else:
            obs = next_obs

        # Flush the local buffer into the shared global buffer with priorities
        if local_rb.get_stored_size() == local_buffer_size:
            local_sample = local_rb.get_all_transitions()
            local_rb.clear()

            absTD = model.abs_TD_error(local_sample)
            global_rb.add(**local_sample, priorities=absTD)

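# For context, a matching learner process for the explorer above might look
# like the following sketch. This is an assumption, not code from the source:
# MyModel and its train()/abs_TD_error() methods are hypothetical, and
# global_rb is taken to be a prioritized buffer whose sample() returns
# "indexes" usable with update_priorities().
def learner(global_rb, is_training_done, queues, batch_size=32, n_iter=10000):
    model = MyModel()
    for i in range(n_iter):
        sample = global_rb.sample(batch_size)
        model.train(sample)

        # Refresh priorities of the transitions just sampled
        absTD = model.abs_TD_error(sample)
        global_rb.update_priorities(sample["indexes"], absTD)

        # Periodically broadcast fresh weights to the explorers
        if i % 100 == 0:
            for q in queues:
                q.put(model.weights)
    is_training_done.set()
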
def test_train(self):
    agent = DQN(state_shape=self.env.observation_space.shape,
                action_dim=self.env.action_space.n,
                memory_capacity=100,
                gpu=-1)

    from cpprb import ReplayBuffer
    # Current cpprb API takes a buffer size and an env_dict describing each key
    replay_buffer = ReplayBuffer(
        agent.memory_capacity,
        {"obs": {"shape": self.env.observation_space.shape},
         "act": {"shape": 1},
         "rew": {},
         "next_obs": {"shape": self.env.observation_space.shape},
         "done": {}})

    obs = self.env.reset()
    for _ in range(100):
        action = agent.get_action(obs)
        next_obs, reward, done, _ = self.env.step(action)
        replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                          rew=reward, done=done)
        if done:
            next_obs = self.env.reset()
        obs = next_obs

    for _ in range(100):
        samples = replay_buffer.sample(agent.batch_size)
        agent.train(samples["obs"], samples["act"], samples["next_obs"],
                    samples["rew"],
                    np.array(samples["done"], dtype=np.float64))

class buffer_class:
    def __init__(self, max_length, seed_number, env):
        env_dict = create_env_dict(env)
        # Override the observation length in the replay memory
        env_dict['obs'] = {"dtype": numpy.float32, "shape": (17,)}
        env_dict['next_obs'] = {"dtype": numpy.float32, "shape": (17,)}
        print('!!!!', env_dict['obs'])

        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)

    def append(self, s, a, r, done, sp):
        self.storage.add(
            **self.before_add(obs=s, act=a, rew=r, done=done, next_obs=sp))

    def sample(self, batch_size):
        batch = self.storage.sample(batch_size)
        s_matrix = batch['obs']
        a_matrix = batch['act']
        r_matrix = batch['rew']
        done_matrix = batch['done']
        sp_matrix = batch['next_obs']
        return s_matrix, a_matrix, r_matrix, done_matrix, sp_matrix

    def __len__(self):
        return self.storage.get_stored_size()

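# A minimal usage sketch for buffer_class, assuming a MuJoCo-style env with
# 17-dimensional observations (e.g. "Walker2d-v2", old 4-tuple step API);
# the env name and sizes are illustrative, and seed_number is unused by the
# class itself.
import gym

env = gym.make("Walker2d-v2")
buf = buffer_class(max_length=100000, seed_number=0, env=env)

s = env.reset()
a = env.action_space.sample()
sp, r, done, _ = env.step(a)
buf.append(s, a, r, done, sp)

if len(buf) >= 32:
    states, actions, rewards, dones, next_states = buf.sample(32)
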
class Agent:
    def __init__(self, learn_rate, state_shape, num_actions, batch_size):
        self.mem_size = 100000
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size
        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

        self.memory = ReplayBuffer(self.mem_size,
                                   {"obs": {"shape": state_shape},
                                    "act": {"shape": 1},
                                    "rew": {},
                                    "next_obs": {"shape": state_shape},
                                    "done": {"shape": 1}})

        self.net = Network(learn_rate, state_shape, num_actions)

    def choose_action(self, observation):
        state = torch.tensor(observation).float().detach()
        state = state.to(self.net.device)
        state = state.unsqueeze(0)

        q_values = self.net(state)
        action = torch.argmax(q_values).item()
        return action

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state, act=action, rew=reward,
                        next_obs=next_state, done=done)

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)

        states = torch.tensor(batch["obs"]).to(self.net.device)
        actions = torch.tensor(batch["act"],
                               dtype=torch.int64).to(self.net.device).T[0]
        rewards = torch.tensor(batch["rew"]).to(self.net.device).T[0]
        states_ = torch.tensor(batch["next_obs"]).to(self.net.device)
        dones = torch.tensor(batch["done"],
                             dtype=torch.bool).to(self.net.device).T[0]

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        q_values = self.net(states)[batch_index, actions]
        q_values_ = self.net(states_)

        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = (td ** 2.0).mean()
        loss.backward()
        self.net.optimizer.step()

        self.net.reset_noise()

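# A minimal training-loop sketch for the Agent above, assuming a classic
# gym env (old 4-tuple step API) and that Network/Lerper are defined
# elsewhere; the env name and hyperparameters are illustrative only.
import gym

env = gym.make("CartPole-v1")
agent = Agent(learn_rate=1e-3,
              state_shape=env.observation_space.shape,
              num_actions=env.action_space.n,
              batch_size=64)

for episode in range(200):
    obs = env.reset()
    done = False
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, done, _ = env.step(action)
        agent.store_memory(obs, action, reward, next_obs, done)
        agent.learn()
        obs = next_obs
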
def __init__(self, *args, n_eval_episodes_per_model=5, **kwargs):
    kwargs["n_dynamics_model"] = 5
    super().__init__(*args, **kwargs)
    self._n_eval_episodes_per_model = n_eval_episodes_per_model

    # Replay buffer to train policy
    self.replay_buffer = get_replay_buffer(self._policy, self._env)

    # Replay buffer to compute GAE
    rb_dict = {
        "size": self._episode_max_steps,
        "default_dtype": np.float32,
        "env_dict": {
            "obs": {"shape": self._env.observation_space.shape},
            "act": {"shape": self._env.action_space.shape},
            "next_obs": {"shape": self._env.observation_space.shape},
            "rew": {},
            "done": {},
            "logp": {},
            "val": {}
        }
    }
    self.local_buffer = ReplayBuffer(**rb_dict)

def test_with_one(self):
    buffer_size = 32
    obs_shape = 3
    act_shape = 4

    rb = ReplayBuffer(buffer_size,
                      {"obs": {"shape": obs_shape},
                       "act": {"shape": act_shape},
                       "done": {}})

    v = {"obs": np.ones(shape=obs_shape),
         "act": np.zeros(shape=act_shape),
         "done": 0}

    rb.add(**v)

    tx = rb.get_all_transitions()

    for key in ["obs", "act", "done"]:
        with self.subTest(key=key):
            np.testing.assert_allclose(tx[key],
                                       np.asarray(v[key]).reshape((1, -1)))

def __call__(self):
    total_steps = 0
    n_episode = 0

    # TODO: clean codes

    # Prepare buffer
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        n_episode, total_rewards = self._collect_sample(n_episode,
                                                        total_steps)
        total_steps += self._policy.horizon
        tf.summary.experimental.set_step(total_steps)
        if len(total_rewards) > 0:
            avg_training_return = sum(total_rewards) / len(total_rewards)
            tf.summary.scalar(name="Common/training_return",
                              data=avg_training_return)

        # Train actor critic
        for _ in range(self._policy.n_epoch):
            samples = self.replay_buffer.sample(self._policy.horizon)
            if self._policy.normalize_adv:
                adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                    samples["adv"])
            else:
                adv = samples["adv"]
            for idx in range(
                    int(self._policy.horizon / self._policy.batch_size)):
                target = slice(idx * self._policy.batch_size,
                               (idx + 1) * self._policy.batch_size)
                self._policy.train(states=samples["obs"][target],
                                   actions=samples["act"][target],
                                   advantages=adv[target],
                                   logp_olds=samples["logp"][target],
                                   returns=samples["ret"][target])

        if total_steps % self._test_interval == 0:
            avg_test_return = self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                .format(total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            self.writer.flush()

        if total_steps % self._model_save_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()

def test_nstep(self):
    rb = ReplayBuffer(32, {'rew': {}, 'done': {}},
                      Nstep={"size": 4, "rew": "rew"})

    # The first Nstep-1 adds are held back internally and return None;
    # the 4th add actually stores a transition and returns its index.
    self.assertIs(rb.add(rew=1, done=0), None)
    self.assertIs(rb.add(rew=1, done=0), None)
    self.assertIs(rb.add(rew=1, done=0), None)
    self.assertEqual(rb.add(rew=1, done=0), 0)

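# To see what the Nstep wrapper stores, a small standalone sketch following
# cpprb's documented Nstep semantics. The "gamma" entry and the exact
# discounted sum are illustrative assumptions, not part of the test above.
from cpprb import ReplayBuffer

rb = ReplayBuffer(32, {"rew": {}, "done": {}},
                  Nstep={"size": 4, "gamma": 0.99, "rew": "rew"})

for _ in range(4):
    rb.add(rew=1.0, done=0)

# The first stored reward should be the 4-step discounted sum:
# 1 + 0.99 + 0.99**2 + 0.99**3
print(rb.get_all_transitions()["rew"])
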
def test_multistep_add(self):
    rb = ReplayBuffer(4, {"done": {}})
    done = jnp.asarray([1, 1, 1])

    for i in range(2):
        with self.subTest(i=i):
            rb.add(done=done)

def test_add(self):
    rb = ReplayBuffer(4, {"done": {}})
    done = jnp.asarray(1)

    for i in range(5):
        with self.subTest(i=i):
            rb.add(done=done)

def test_python_type(self):
    types = [bool, int, float]

    for d in types:
        with self.subTest(type=d):
            b = ReplayBuffer(10, {"a": {"dtype": d}})
            b.add(a=d(1))
            self.assertEqual(b.get_all_transitions()["a"].dtype, d)

def set_replay_buffer(self, env, get_from_file):
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    if get_from_file:
        print(colorize(
            "Pulling saved expert %s trajectories from file over %d episodes"
            % (self.config_name, self.expert_episodes),
            'blue', bold=True))

        f = open(self._demo_dir + 'sim_data_'
                 + str(self.expert_episodes) + '_buffer.pkl', "rb")
        buffer_file = pickle.load(f)
        f.close()

        data = samples_from_cpprb(npsamples=buffer_file)

        # Reconstruct the data, then pass it to replay buffer
        np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

        # Create environment
        before_add = create_before_add_func(env)

        replay_buffer = ReplayBuffer(size=self.replay_buffer_size,
                                     env_dict={
                                         "obs": {"shape": obs_dim},
                                         "act": {"shape": act_dim},
                                         "rew": {},
                                         "next_obs": {"shape": obs_dim},
                                         "done": {}})

        replay_buffer.add(**before_add(obs=np_states[~np_dones],
                                       act=np_actions[~np_dones],
                                       rew=np_rewards[~np_dones],
                                       next_obs=np_next_states[~np_dones],
                                       done=np_next_dones[~np_dones]))
        self.replay_buffer = replay_buffer
    else:
        # Generate expert data
        print(colorize(
            "Generating expert %s trajectories from file over %d episodes"
            % (self.config_name, self.expert_episodes),
            'blue', bold=True))

        # Load trained policy
        _, get_action = load_policy_and_env(
            osp.join(self._root_data_path, self.file_name,
                     self.file_name + '_s0/'),
            'last', False)

        expert_rb = run_policy(env, get_action, 0, self.expert_episodes, False,
                               record=not get_from_file,
                               record_name='expert_' + self.file_name + '_'
                                           + str(self.expert_episodes) + '_runs',
                               record_project='clone_benchmarking_'
                                              + self.config_name,
                               data_path=self._expert_path,
                               config_name=self.config_name,
                               max_len_rb=self.replay_buffer_size)

        self.replay_buffer = expert_rb

def get_replay_buffer(policy, env, use_prioritized_rb=False,
                      use_nstep_rb=False, n_step=1, size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        # TODO: Remove done. Currently cannot remove because of cpprb implementation
        # kwargs["env_dict"].pop("done")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["Nstep"] = {"size": n_step,
                           "gamma": policy.discount,
                           "rew": "rew",
                           "next": "next_obs"}
        return PrioritizedReplayBuffer(**kwargs)

    if len(obs_shape) == 3:
        kwargs["env_dict"]["obs"]["dtype"] = np.ubyte
        kwargs["env_dict"]["next_obs"]["dtype"] = np.ubyte

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["Nstep"] = {"size": n_step,
                           "gamma": policy.discount,
                           "rew": "rew",
                           "next": "next_obs"}
        return ReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)

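# Typical calls to get_replay_buffer(), assuming a tf2rl-style off-policy
# policy object exposing memory_capacity and discount; `policy` and `env`
# here are placeholders, not defined in this section.
rb = get_replay_buffer(policy, env)                               # uniform replay
per_rb = get_replay_buffer(policy, env, use_prioritized_rb=True)  # prioritized
nstep_rb = get_replay_buffer(policy, env,                         # 3-step returns
                             use_nstep_rb=True, n_step=3)         # folded into "rew"
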
def test_nstep_multistep_add(self):
    rb = ReplayBuffer(6,
                      {"obs": {}, "rew": {}, "done": {}, "next_obs": {}},
                      Nstep={"size": 4, "rew": "rew", "next": "next_obs"})

    obs = jnp.asarray([1, 1, 1, 1])
    rew = jnp.asarray([1, 1, 1, 1])
    done = jnp.asarray([1, 1, 1, 1])
    next_obs = jnp.asarray([1, 1, 1, 1])

    for i in range(7):
        with self.subTest(i=i):
            rb.add(obs=obs, rew=rew, done=done, next_obs=next_obs)

def set_multiple_replay_buffers(self, env):
    print(self.config_name_list)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    print(colorize(
        "Pulling saved trajectories from two experts ( %s and %s) from files over %d episodes"
        % (self.config_name_list[0], self.config_name_list[1],
           self.expert_episodes),
        'blue', bold=True))

    rb_list = []
    v = 0
    for x in self.config_name_list:
        _expert_demo_dir = os.path.join(self._expert_path, x + '_episodes/')
        f = open(_expert_demo_dir + 'sim_data_'
                 + str(self.expert_episodes) + '_buffer.pkl', "rb")
        buffer_file = pickle.load(f)
        f.close()

        data = samples_from_cpprb(npsamples=buffer_file)

        # Reconstruct the data, then pass it to replay buffer
        np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

        # Create environment
        before_add = create_before_add_func(env)

        replay_buffer = ReplayBuffer(size=self.replay_buffer_size,
                                     env_dict={
                                         "obs": {"shape": tuple([obs_dim[0] + 2, ])},
                                         "act": {"shape": act_dim},
                                         "rew": {},
                                         "next_obs": {"shape": tuple([obs_dim[0] + 2, ])},
                                         "done": {}})

        # Concatenate the states with one-hot vectors depending on class
        extend1 = [one_hot(np.array([v]), self.n_experts)] * np_states[~np_dones].shape[0]
        appended_states = np.append(np_states[~np_dones], np.c_[extend1], 1)
        appended_next_states = np.append(np_next_states[~np_dones], np.c_[extend1], 1)

        replay_buffer.add(**before_add(obs=appended_states,
                                       act=np_actions[~np_dones],
                                       rew=np_rewards[~np_dones],
                                       next_obs=appended_next_states,
                                       done=np_next_dones[~np_dones]))
        rb_list.append(replay_buffer)
        v += 1

    self.rb_list = rb_list

def get_replay_buffer(policy, env, size=None):
    if policy is None or env is None:
        return None

    kwargs = get_default_rb_dict(policy.memory_capacity, env)
    if size is not None:
        kwargs["size"] = size
    return ReplayBuffer(**kwargs)

def get_replay_buffer(policy, env, use_prioritized_rb=False,
                      use_nstep_rb=False, n_step=1, size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs['size'] = size

    # TODO(sff1019): Add on-policy behaviour
    # TODO(sff1019): Add N-step prioritized

    if len(obs_shape) == 3:
        # Store images compactly as unsigned bytes
        kwargs['env_dict']['obs']['dtype'] = np.ubyte
        kwargs['env_dict']['next_obs']['dtype'] = np.ubyte

    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)

def test_buffer(self):
    buffer_size = 256
    obs_shape = (15, 15)
    act_dim = 5
    N = 512

    erb = ReplayBuffer(buffer_size,
                       {"obs": {"shape": obs_shape},
                        "act": {"shape": act_dim},
                        "rew": {},
                        "next_obs": {"shape": obs_shape},
                        "done": {}})

    for i in range(N):
        obs = np.full(obs_shape, i, dtype=np.double)
        act = np.full(act_dim, i, dtype=np.double)
        rew = i
        next_obs = obs + 1
        done = 0

        erb.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)

    erb._encode_sample(range(buffer_size))
    erb.sample(32)

    erb.clear()
    self.assertEqual(erb.get_next_index(), 0)
    self.assertEqual(erb.get_stored_size(), 0)

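# For reference, sample() returns a dict of NumPy arrays batched along the
# first axis; scalar keys come back with a trailing dimension of 1. A quick
# standalone sketch (indices are drawn with replacement, so a batch larger
# than the stored size is fine):
from cpprb import ReplayBuffer
import numpy as np

rb = ReplayBuffer(256, {"obs": {"shape": (15, 15)},
                        "act": {"shape": 5},
                        "rew": {}})
rb.add(obs=np.zeros((15, 15)), act=np.zeros(5), rew=0.0)

batch = rb.sample(32)
print(batch["obs"].shape)  # (32, 15, 15)
print(batch["act"].shape)  # (32, 5)
print(batch["rew"].shape)  # (32, 1)
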
def test_update_count(self):
    """
    Check step and episode

    step < max_steps
    episode <= step
    """
    rb = ReplayBuffer(32,
                      {"obs": {"shape": (3,)},
                       "act": {},
                       "rew": {},
                       "next_obs": {"shape": (3,)},
                       "done": {}})

    def update(kw, step, episode):
        self.assertLess(step, 10)
        self.assertLessEqual(episode, step)
        return 0.5

    train(rb, self.env,
          lambda obs, step, episode, is_warmup: 1.0,
          update,
          max_steps=10)

def test_too_big_max_steps(self):
    """
    Raise ValueError for too big max_steps
    """
    rb = ReplayBuffer(32,
                      {"obs": {"shape": (3,)},
                       "act": {},
                       "rew": {},
                       "next_obs": {"shape": (3,)},
                       "done": {}})

    def update(kw, step, episode):
        raise RuntimeError

    with self.assertRaises(ValueError):
        train(rb, self.env,
              lambda obs, step, episode, is_warmup: 1.0,
              update,
              max_steps=int(1e+32))

def test_episode_callback(self):
    """
    Pass custom episode_callback
    """
    rb = ReplayBuffer(32,
                      {"obs": {"shape": (3,)},
                       "act": {},
                       "rew": {},
                       "next_obs": {"shape": (3,)},
                       "done": {}})

    def callback(episode, episode_step, episode_reward):
        self.assertEqual(episode_step, int(episode_reward))

    train(rb, self.env,
          lambda obs, step, episode, is_warmup: 1.0,
          lambda tr, step, episode: 0.5,
          max_steps=10,
          rew_sum=lambda sum, tr: sum + 1.0,
          done_check=lambda tr: True,
          episode_callback=callback)

def test_done_check(self):
    """
    Pass custom done_check which always returns `True`

    Always step == episode
    """
    rb = ReplayBuffer(32,
                      {"obs": {"shape": (3,)},
                       "act": {},
                       "rew": {},
                       "next_obs": {"shape": (3,)},
                       "done": {}})

    def update(kw, step, episode):
        self.assertLess(step, 10)
        self.assertEqual(step, episode)
        return 0.5

    train(rb, self.env,
          lambda obs, step, episode, is_warmup: 1.0,
          update,
          max_steps=10,
          done_check=lambda kw: True)

def test_warmup(self):
    """
    Skip warmup steps

    n_warmups <= step
    """
    rb = ReplayBuffer(32,
                      {"obs": {"shape": (3,)},
                       "act": {},
                       "rew": {},
                       "next_obs": {"shape": (3,)},
                       "done": {}})

    def update(kw, step, episode):
        self.assertGreaterEqual(step, 5)
        self.assertLess(step, 10)
        self.assertLessEqual(episode, step)
        return 0.5

    train(rb, self.env,
          lambda obs, step, episode, is_warmup: 1.0,
          update,
          max_steps=10,
          n_warmups=5)

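# The train() helper exercised by the tests above is not shown in this
# section. Below is a minimal reconstruction consistent with those call
# sites; the signature, defaults, and loop internals are assumptions, not
# the real helper.
import numpy as np

def train(rb, env, policy, update,
          max_steps=1000, n_warmups=0,
          done_check=None, rew_sum=None, episode_callback=None):
    if max_steps > np.iinfo(np.int64).max:
        raise ValueError("max_steps is too big")
    done_check = done_check or (lambda tr: bool(tr["done"]))

    step, episode = 0, 0
    obs = env.reset()
    episode_step, episode_reward = 0, 0.0
    while step < max_steps:
        is_warmup = step < n_warmups
        act = policy(obs, step, episode, is_warmup)
        next_obs, rew, done, _ = env.step(act)

        tr = {"obs": obs, "act": act, "rew": rew,
              "next_obs": next_obs, "done": done}
        rb.add(**tr)
        episode_step += 1
        episode_reward = (rew_sum(episode_reward, tr)
                          if rew_sum else episode_reward + rew)

        # Only call the update function once warmup is over
        if not is_warmup:
            update(rb.sample(1), step, episode)

        if done_check(tr):
            if episode_callback:
                episode_callback(episode, episode_step, episode_reward)
            episode += 1
            obs = env.reset()
            episode_step, episode_reward = 0, 0.0
        else:
            obs = next_obs
        step += 1
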
def test_dtype_check(self):
    # NOTE: some aliases here (np.bool8, np.float_, np.longfloat,
    # np.complex_, np.clongfloat) were removed in NumPy 2; this list
    # targets NumPy 1.x.
    types = [np.bool_, np.bool8,
             np.byte, np.short, np.intc, np.int_, np.longlong,
             np.intp, np.int8, np.int16, np.int32, np.int64,
             np.ubyte, np.ushort, np.uintc, np.uint, np.ulonglong,
             np.uintp, np.uint8, np.uint16, np.uint32, np.uint64,
             np.half, np.single, np.double, np.float_, np.longfloat,
             np.float16, np.float32, np.float64,
             np.csingle, np.complex_, np.clongfloat,
             np.complex64, np.complex128]

    for d in types:
        with self.subTest(type=d):
            b = ReplayBuffer(10, {"a": {"dtype": d}})
            b.add(a=np.ones(1, dtype=d))
            self.assertEqual(b.get_all_transitions()["a"].dtype, d)

def test_ReplayBuffer_with_single_step(self):
    buffer_size = 256
    obs_shape = (3, 4)
    batch_size = 10

    rb = ReplayBuffer(buffer_size, {"obs": {"shape": obs_shape}})

    v = {"obs": np.ones(shape=obs_shape)}

    rb.add(**v)
    rb.sample(batch_size)

    for _ in range(100):
        rb.add(**v)

    rb.sample(batch_size)

def __init__(self, lr, state_shape, num_actions, batch_size,
             max_mem_size=1000):
    self.lr = lr
    self.gamma = 0.99
    self.action_space = list(range(num_actions))
    self.batch_size = batch_size
    self.target_update_interval = 200
    self.step_count = 0
    self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

    self.memory = ReplayBuffer(max_mem_size,
                               {"obs": {"shape": state_shape},
                                "act": {"shape": 1},
                                "rew": {},
                                "next_obs": {"shape": state_shape},
                                "done": {"shape": 1}})

    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    # self.device = torch.device("cpu")

    # Fixed support for the categorical (C51-style) value distribution
    self.V_MIN, self.V_MAX = 0, 200
    self.NUM_ATOMS = 4
    self.support = torch.linspace(self.V_MIN, self.V_MAX,
                                  self.NUM_ATOMS).to(self.device)

    self.net = Network(lr, state_shape, num_actions,
                       self.support, self.NUM_ATOMS).to(self.device)
    self.net_ = Network(lr, state_shape, num_actions,
                        self.support, self.NUM_ATOMS).to(self.device)
    self.net_.load_state_dict(self.net.state_dict())

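# The V_MIN/V_MAX/NUM_ATOMS/support setup above implies a categorical
# (C51-style) value distribution. Below is a sketch of the standard
# projection step such an agent typically needs; it is a generic
# reconstruction, not the author's code, and the function name is
# hypothetical.
import torch

def project_distribution(next_dist, rewards, dones, gamma,
                         support, v_min, v_max, num_atoms):
    # next_dist: (batch, num_atoms) probabilities for the greedy next action
    batch_size = next_dist.size(0)
    delta_z = (v_max - v_min) / (num_atoms - 1)

    # Bellman-update every atom, clipped to the support range
    tz = rewards.unsqueeze(1) + gamma * (1.0 - dones.float()).unsqueeze(1) * support.unsqueeze(0)
    tz = tz.clamp(v_min, v_max)

    b = (tz - v_min) / delta_z  # fractional atom positions
    lower = b.floor().long()
    upper = b.ceil().long()
    # Keep mass when b lands exactly on an atom (lower == upper)
    lower[(upper > 0) & (lower == upper)] -= 1
    upper[(lower < num_atoms - 1) & (lower == upper)] += 1

    # Split each atom's probability mass between its two neighbouring atoms
    proj = torch.zeros_like(next_dist)
    offset = (torch.arange(batch_size, device=next_dist.device)
              * num_atoms).unsqueeze(1)
    proj.view(-1).index_add_(0, (lower + offset).view(-1),
                             (next_dist * (upper.float() - b)).view(-1))
    proj.view(-1).index_add_(0, (upper + offset).view(-1),
                             (next_dist * (b - lower.float())).view(-1))
    return proj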