Example #1
    def set_replay_buffer(self, env, get_from_file):

        obs_dim = env.observation_space.shape
        act_dim = env.action_space.shape

        if get_from_file:
            print(colorize("Pulling saved expert %s trajectories from file over %d episodes" %
                           (self.config_name, self.expert_episodes), 'blue', bold=True))

            with open(self._demo_dir + 'sim_data_' + str(self.expert_episodes) + '_buffer.pkl', "rb") as f:
                buffer_file = pickle.load(f)

            data = samples_from_cpprb(npsamples=buffer_file)

            # Reconstruct the data, then pass it to replay buffer
            np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

            # Build the function that converts raw transitions into the dict format ReplayBuffer.add expects
            before_add = create_before_add_func(env)

            replay_buffer = ReplayBuffer(size=self.replay_buffer_size,
                                         env_dict={
                                             "obs": {"shape": obs_dim},
                                             "act": {"shape": act_dim},
                                             "rew": {},
                                             "next_obs": {"shape": obs_dim},
                                             "done": {}})

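            # Mask with ~np_dones so only transitions not flagged as done in np_dones are added;
            # the `done` value stored for each kept transition comes from np_next_dones.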
            replay_buffer.add(**before_add(obs=np_states[~np_dones],
                                           act=np_actions[~np_dones],
                                           rew=np_rewards[~np_dones],
                                           next_obs=np_next_states[~np_dones],
                                           done=np_next_dones[~np_dones]))
            self.replay_buffer = replay_buffer

        else:
            # Generate expert data
            print(colorize(
                "Generating expert %s trajectories from file over %d episodes" % (self.config_name, self.expert_episodes),
                'blue', bold=True))

            # Load trained policy
            _, get_action = load_policy_and_env(osp.join(self._root_data_path, self.file_name, self.file_name + '_s0/'),
                                                'last', False)
            expert_rb = run_policy(env,
                                   get_action,
                                   0,
                                   self.expert_episodes,
                                   False,
                                   record=not get_from_file,
                                   record_name='expert_' + self.file_name + '_' + str(self.expert_episodes) + '_runs',
                                   record_project='clone_benchmarking_' + self.config_name,
                                   data_path=self._expert_path,
                                   config_name=self.config_name,
                                   max_len_rb=self.replay_buffer_size)

            self.replay_buffer = expert_rb
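The first branch above expects a pickle file holding previously recorded expert transitions. The exact format consumed by samples_from_cpprb is not shown in this example; under the assumption that the file simply contains the dict of NumPy arrays returned by cpprb's get_all_transitions(), such a file could be produced with a sketch like this (save_expert_buffer is a hypothetical helper name):

import pickle

from cpprb import ReplayBuffer


def save_expert_buffer(rb: ReplayBuffer, path: str) -> None:
    # Dump every stored transition ('obs', 'act', 'rew', 'next_obs', 'done', ...)
    # as a dict of NumPy arrays, matching what the loading branch unpickles.
    transitions = rb.get_all_transitions()
    with open(path, "wb") as f:
        pickle.dump(transitions, f)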
Example #2
    def __init__(self, max_length, seed_number, env):
        env_dict = create_env_dict(env)

        # Override the observation spec in the replay buffer (fixed 17-dim float32 observations)
        env_dict['obs'] = {"dtype": numpy.float32, "shape": (17,)}
        env_dict['next_obs'] = {"dtype": numpy.float32, "shape": (17,)}
        print('Overridden observation spec:', env_dict['obs'])
        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)
Example #3
    def set_multiple_replay_buffers(self, env):
        print(self.config_name_list)

        obs_dim = env.observation_space.shape
        act_dim = env.action_space.shape

        print(colorize("Pulling saved trajectories from two experts ( %s and %s) from files over %d episodes" %
                       (self.config_name_list[0], self.config_name_list[1], self.expert_episodes), 'blue', bold=True))

        rb_list = []

        for v, x in enumerate(self.config_name_list):

            _expert_demo_dir = os.path.join(self._expert_path, x + '_episodes/')

            with open(_expert_demo_dir + 'sim_data_' + str(self.expert_episodes) + '_buffer.pkl', "rb") as f:
                buffer_file = pickle.load(f)

            data = samples_from_cpprb(npsamples=buffer_file)

            # Reconstruct the data, then pass it to replay buffer
            np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

            # Build the function that converts raw transitions into the dict format ReplayBuffer.add expects
            before_add = create_before_add_func(env)

            replay_buffer = ReplayBuffer(size=self.replay_buffer_size,
                                         env_dict={
                                             "obs": {"shape": (obs_dim[0] + 2,)},
                                             "act": {"shape": act_dim},
                                             "rew": {},
                                             "next_obs": {"shape": (obs_dim[0] + 2,)},
                                             "done": {}})

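            # The "+2" in the obs/next_obs shapes above reserves room for the one-hot
            # expert id (two experts here) that is appended to each state below.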
            # Concatenate the states with one hot vectors depending on class
            extend1 = [one_hot(np.array([v]), self.n_experts)] * np_states[~np_dones].shape[0]

            appended_states = np.append(np_states[~np_dones], np.c_[extend1], 1)
            appended_next_states = np.append(np_next_states[~np_dones], np.c_[extend1], 1)

            replay_buffer.add(**before_add(obs=appended_states,
                                           act=np_actions[~np_dones],
                                           rew=np_rewards[~np_dones],
                                           next_obs=appended_next_states,
                                           done=np_next_dones[~np_dones]))

            rb_list.append(replay_buffer)
        self.rb_list = rb_list
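The one_hot helper called in the loop above is not defined in this snippet. A minimal definition consistent with how it is used here (an index array in, one one-hot row per index out) could be:

import numpy as np


def one_hot(indices, depth):
    # Rows of an identity matrix selected by index,
    # e.g. one_hot(np.array([1]), 2) -> array([[0., 1.]], dtype=float32)
    return np.eye(depth, dtype=np.float32)[indices]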
Example #4
    def test_add(self):
        env_dict = create_env_dict(self.env)
        before_add_func = create_before_add_func(self.env)

        rb = ReplayBuffer(256, env_dict)

        obs = self.env.reset()

        for i in range(100):
            act = self.env.action_space.sample()
            next_obs, rew, done, _ = self.env.step(act)

            rb.add(**before_add_func(obs, act, next_obs, rew, done))

            if done:
                obs = self.env.reset()
            else:
                obs = next_obs
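Assuming this test method belongs to a unittest.TestCase and using cpprb's get_stored_size(), a natural follow-up assertion would check that all 100 added transitions were actually stored:

        # Hypothetical extra check after the loop above
        self.assertEqual(rb.get_stored_size(), 100)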
Example #5
    def __init__(self, max_length, seed_number, env):
        env_dict = create_env_dict(env)
        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)
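# Example env_dict contents produced by create_env_dict (dump truncated):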
# 'done': {'add_shape': array([-1,  1]), 'dtype': numpy.float32, 'shape': 1},
# 'next_obs0': {'add_shape': array([-1,  1]), 'dtype': numpy.int32, 'shape': 1},
# 'next_obs1': {'add_shape': array([-1,  1]), 'dtype': numpy.int32, 'shape': 1},
# 'next_obs2': {'add_shape': array([-1,  1]), 'dtype': numpy.int32, 'shape': 1},
# 'obs0': {'add_shape': array([-1,  1]), 'dtype': numpy.int32, 'shape': 1},
# 'obs1': {'add_shape': array([-1,  1]), 'dtype': numpy.int32, 'shape': 1},
# 'obs2': {'add_shape': array([-1,  1]), 'dtype': numpy.int32, 'shape': 1},
# 'rew': {'add_shape': array([-1,  1]), 'dtype': numpy.float32, 'shape': 1}}

rb = ReplayBuffer(256, env_dict)


obs = env.reset()
before_add = create_before_add_func(env)

for i in range(400):
    act = env.action_space.sample()
    next_obs, rew, done, _ = env.step(act)

    # Create `dict` for `ReplayBuffer.add`
    rb.add(**before_add(obs=obs, act=act, next_obs=next_obs, rew=rew, done=done))

    if done:
        obs = env.reset()
        rb.on_episode_end()
    else:
        obs = next_obs
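Once the buffer has been filled this way, minibatches are drawn with ReplayBuffer.sample, which returns a dict of NumPy arrays keyed like env_dict. A minimal sketch of the consuming side:

batch_size = 32
sample = rb.sample(batch_size)

obs_batch = sample["obs"]            # shape: (batch_size,) + observation shape
act_batch = sample["act"]
rew_batch = sample["rew"]
next_obs_batch = sample["next_obs"]
done_batch = sample["done"]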