def set_replay_buffer(self, env, get_from_file):
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    if get_from_file:
        print(colorize("Pulling saved expert %s trajectories from file over %d episodes"
                       % (self.config_name, self.expert_episodes), 'blue', bold=True))

        # Load the pickled cpprb samples saved for this expert
        with open(self._demo_dir + 'sim_data_' + str(self.expert_episodes) + '_buffer.pkl', "rb") as f:
            buffer_file = pickle.load(f)

        data = samples_from_cpprb(npsamples=buffer_file)

        # Reconstruct the transitions as numpy arrays, then refill a replay buffer
        np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

        # Build the per-environment preprocessing function used by ReplayBuffer.add
        before_add = create_before_add_func(env)

        replay_buffer = ReplayBuffer(size=self.replay_buffer_size,
                                     env_dict={"obs": {"shape": obs_dim},
                                               "act": {"shape": act_dim},
                                               "rew": {},
                                               "next_obs": {"shape": obs_dim},
                                               "done": {}})

        # Keep only transitions starting from non-terminal states (~np_dones)
        # and add them in one batched call
        replay_buffer.add(**before_add(obs=np_states[~np_dones],
                                       act=np_actions[~np_dones],
                                       rew=np_rewards[~np_dones],
                                       next_obs=np_next_states[~np_dones],
                                       done=np_next_dones[~np_dones]))
        self.replay_buffer = replay_buffer

    else:
        # Generate expert data by rolling out a trained policy
        print(colorize("Generating expert %s trajectories over %d episodes"
                       % (self.config_name, self.expert_episodes), 'blue', bold=True))

        # Load trained policy
        _, get_action = load_policy_and_env(
            osp.join(self._root_data_path, self.file_name, self.file_name + '_s0/'),
            'last', False)

        expert_rb = run_policy(env, get_action, 0, self.expert_episodes, False,
                               record=not get_from_file,
                               record_name='expert_' + self.file_name + '_' + str(self.expert_episodes) + '_runs',
                               record_project='clone_benchmarking_' + self.config_name,
                               data_path=self._expert_path,
                               config_name=self.config_name,
                               max_len_rb=self.replay_buffer_size)
        self.replay_buffer = expert_rb
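
# --- Hedged usage sketch (not part of the original class) ---------------------
# A minimal, self-contained example of how a buffer built like the one above is
# typically consumed downstream: cpprb's sample() returns a dict of numpy arrays
# keyed by env_dict. The 17-dim obs / 6-dim act shapes and the batch sizes are
# placeholders, not values taken from the original code.
import numpy as np
from cpprb import ReplayBuffer

demo_rb = ReplayBuffer(size=1000,
                       env_dict={"obs": {"shape": (17,)},
                                 "act": {"shape": (6,)},
                                 "rew": {},
                                 "next_obs": {"shape": (17,)},
                                 "done": {}})

# cpprb accepts batched adds: every array carries a leading batch dimension
demo_rb.add(obs=np.zeros((8, 17)),
            act=np.zeros((8, 6)),
            rew=np.zeros(8),
            next_obs=np.zeros((8, 17)),
            done=np.zeros(8))

batch = demo_rb.sample(4)   # e.g. batch["obs"].shape == (4, 17)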
def __init__(self, max_length, seed_number, env):
    env_dict = create_env_dict(env)

    # Override the observation entries in the replay memory: force float32 and a
    # fixed 17-dimensional observation shape
    env_dict['obs'] = {"dtype": numpy.float32, "shape": (17,)}
    env_dict['next_obs'] = {"dtype": numpy.float32, "shape": (17,)}
    print('Overridden obs spec:', env_dict['obs'])

    self.before_add = create_before_add_func(env)
    self.storage = ReplayBuffer(max_length, env_dict)
def set_multiple_replay_buffers(self, env):
    print(self.config_name_list)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    print(colorize("Pulling saved trajectories from two experts (%s and %s) from files over %d episodes"
                   % (self.config_name_list[0], self.config_name_list[1], self.expert_episodes),
                   'blue', bold=True))

    rb_list = []
    for v, x in enumerate(self.config_name_list):
        _expert_demo_dir = os.path.join(self._expert_path, x + '_episodes/')

        # Load the pickled cpprb samples saved for expert x
        with open(_expert_demo_dir + 'sim_data_' + str(self.expert_episodes) + '_buffer.pkl', "rb") as f:
            buffer_file = pickle.load(f)

        data = samples_from_cpprb(npsamples=buffer_file)

        # Reconstruct the transitions as numpy arrays, then refill a replay buffer
        np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

        before_add = create_before_add_func(env)

        # Observations are extended by a one-hot expert id (two experts), hence obs_dim[0] + 2
        replay_buffer = ReplayBuffer(size=self.replay_buffer_size,
                                     env_dict={"obs": {"shape": (obs_dim[0] + 2,)},
                                               "act": {"shape": act_dim},
                                               "rew": {},
                                               "next_obs": {"shape": (obs_dim[0] + 2,)},
                                               "done": {}})

        # Concatenate the states with one-hot vectors identifying which expert produced them
        extend1 = [one_hot(np.array([v]), self.n_experts)] * np_states[~np_dones].shape[0]
        appended_states = np.append(np_states[~np_dones], np.c_[extend1], 1)
        appended_next_states = np.append(np_next_states[~np_dones], np.c_[extend1], 1)

        replay_buffer.add(**before_add(obs=appended_states,
                                       act=np_actions[~np_dones],
                                       rew=np_rewards[~np_dones],
                                       next_obs=appended_next_states,
                                       done=np_next_dones[~np_dones]))
        rb_list.append(replay_buffer)

    self.rb_list = rb_list
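
# --- Hedged illustration of the one-hot augmentation above --------------------
# Self-contained, with dummy numbers: each expert's states get that expert's
# one-hot id appended, so an obs_dim-dimensional observation grows by n_experts
# entries (17 -> 19 here; the 17 is a placeholder, not taken from the original).
import numpy as np

n_experts = 2
v = 1                                                # index of the current expert
states = np.zeros((5, 17), dtype=np.float32)         # dummy batch of raw observations
expert_id = np.eye(n_experts, dtype=np.float32)[v]   # one-hot id, here [0., 1.]
augmented = np.concatenate([states, np.tile(expert_id, (states.shape[0], 1))], axis=1)
assert augmented.shape == (5, 19)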
def test_add(self):
    env_dict = create_env_dict(self.env)
    before_add_func = create_before_add_func(self.env)
    rb = ReplayBuffer(256, env_dict)

    obs = self.env.reset()

    for i in range(100):
        act = self.env.action_space.sample()
        next_obs, rew, done, _ = self.env.step(act)
        rb.add(**before_add_func(obs=obs, act=act, next_obs=next_obs, rew=rew, done=done))

        if done:
            obs = self.env.reset()
        else:
            obs = next_obs
def __init__(self, max_length, seed_number, env):
    env_dict = create_env_dict(env)
    self.before_add = create_before_add_func(env)
    self.storage = ReplayBuffer(max_length, env_dict)
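
# Hedged usage sketch for the wrapper above. "ExpertStorage" is a hypothetical
# name for whatever class owns this __init__; the add/sample calls are cpprb's
# documented API.
#
#     buf = ExpertStorage(max_length=10000, seed_number=0, env=env)
#     buf.storage.add(**buf.before_add(obs=o, act=a, next_obs=o2, rew=r, done=d))
#     batch = buf.storage.sample(32)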
# 'done': {'add_shape': array([-1, 1]), 'dtype': numpy.float32, 'shape': 1},
# 'next_obs0': {'add_shape': array([-1, 1]), 'dtype': numpy.int32, 'shape': 1},
# 'next_obs1': {'add_shape': array([-1, 1]), 'dtype': numpy.int32, 'shape': 1},
# 'next_obs2': {'add_shape': array([-1, 1]), 'dtype': numpy.int32, 'shape': 1},
# 'obs0': {'add_shape': array([-1, 1]), 'dtype': numpy.int32, 'shape': 1},
# 'obs1': {'add_shape': array([-1, 1]), 'dtype': numpy.int32, 'shape': 1},
# 'obs2': {'add_shape': array([-1, 1]), 'dtype': numpy.int32, 'shape': 1},
# 'rew': {'add_shape': array([-1, 1]), 'dtype': numpy.float32, 'shape': 1}}

rb = ReplayBuffer(256, env_dict)

obs = env.reset()
before_add = create_before_add_func(env)

for i in range(400):
    act = env.action_space.sample()
    next_obs, rew, done, _ = env.step(act)

    # Create the `dict` expected by `ReplayBuffer.add` (Tuple observations are
    # split into obs0/obs1/obs2 by before_add, matching the env_dict above)
    rb.add(**before_add(obs=obs, act=act, next_obs=next_obs, rew=rew, done=done))

    if done:
        obs = env.reset()
        rb.on_episode_end()
    else:
        obs = next_obs
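
# A small follow-up sketch (the batch size of 32 is arbitrary): sampling returns
# a dict of numpy arrays keyed exactly like the env_dict above
# (obs0/obs1/obs2, act, rew, next_obs0/1/2, done).
sample = rb.sample(32)
print({k: v.shape for k, v in sample.items()})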