Example #1
import joblib
import numpy as np

# SimpleReplayBuffer and gym_get_dim come from the surrounding project
# (an rlkit-style codebase); their exact import paths depend on its layout.


def generate_transitions(policy, env, num_timesteps_total,
                         max_steps_per_episode, save_path):
    """Roll `policy` out in `env` for a fixed number of timesteps and
    dump the collected transitions to `save_path`."""
    buff = SimpleReplayBuffer(num_timesteps_total,
                              env.observation_space.shape,
                              gym_get_dim(env.action_space),
                              discrete_action_dim=True)

    cur_total = 0
    steps_left_in_episode = 0
    while cur_total != num_timesteps_total:
        # Start a fresh episode whenever the previous one is exhausted.
        if steps_left_in_episode == 0:
            steps_left_in_episode = max_steps_per_episode
            obs = env.reset()

        act = policy.get_action(obs)
        next_obs, rew, done, _ = env.step(act)
        buff.add_sample(obs, act, rew, done, next_obs)

        obs = next_obs
        cur_total += 1
        steps_left_in_episode -= 1
        if done:
            # Force a reset if the env terminates before the fixed horizon.
            steps_left_in_episode = 0

    save_dict = dict(
        observations=buff._observations,
        actions=buff._actions,
        rewards=buff._rewards,
        terminals=buff._terminals,
        next_observations=buff._next_obs,
    )
    joblib.dump(save_dict, save_path)

    # debug: dump the first transitions as images for visual inspection.
    # scipy.misc.imsave was removed in SciPy 1.2; imageio.imwrite is the
    # usual drop-in replacement.
    from imageio import imwrite
    actions = buff._actions
    observations = buff._observations
    for i in range(min(1000, num_timesteps_total)):
        a = actions[i]
        obs = observations[i]
        print(a)
        # Observations are stored channel-first; transpose CHW -> HWC.
        imwrite('junk_vis/tiny/mem_grid_{}.png'.format(i),
                np.transpose(obs, (1, 2, 0)))
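
A minimal usage sketch, assuming an environment with channel-first image observations (the debug dump above transposes CHW to HWC) and a discrete action space. RandomPolicy and the env id are hypothetical stand-ins, not names from the original project:

import gym

class RandomPolicy:
    # Stand-in policy: samples uniformly from the action space.
    def __init__(self, action_space):
        self.action_space = action_space

    def get_action(self, obs):
        return self.action_space.sample()

env = gym.make('SomeImageGridEnv-v0')  # hypothetical env id
generate_transitions(RandomPolicy(env.action_space), env,
                     num_timesteps_total=5000,
                     max_steps_per_episode=200,
                     save_path='transitions.pkl')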
Example #2
def test_num_steps_can_sample(self):
    buffer = SimpleReplayBuffer(10000, 1, 1)
    # add_sample args: (observation, action, reward, terminal, next_observation)
    buffer.add_sample(1, 1, 1, False, 1)
    buffer.add_sample(1, 1, 1, True, 1)
    buffer.terminate_episode()
    buffer.add_sample(1, 1, 1, False, 1)
    # All three added samples count, regardless of terminal flags or
    # episode boundaries.
    self.assertEqual(buffer.num_steps_can_sample(), 3)
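
For context, a short sketch of how a filled buffer is typically consumed downstream. It assumes this fork keeps upstream rlkit's random_batch(batch_size) method, which returns a dict of arrays; only num_steps_can_sample appears in the test above, so treat the call as an assumption:

buffer = SimpleReplayBuffer(10000, 1, 1)
for _ in range(100):
    buffer.add_sample(1, 1, 1, False, 1)
buffer.terminate_episode()

# Assumed from upstream rlkit: returns arrays keyed by 'observations',
# 'actions', 'rewards', 'terminals', 'next_observations'.
batch = buffer.random_batch(32)
print(batch['rewards'].shape)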
Example #3
import os

import joblib

# `d` holds demonstration paths loaded from disk (e.g. with joblib.load),
# and `buffer` is a SimpleReplayBuffer sized for the demo transitions;
# both are set up earlier in the surrounding script. Replay every demo
# path into the buffer.
for path_num in range(len(d['obs'])):
    obs = d['obs'][path_num]
    acs = d['acs'][path_num]
    env_infos = d['info'][path_num]

    ep_len = len(obs)
    # Stop one step early: each transition needs obs[j + 1] as next_obs.
    for j in range(ep_len - 1):
        o = {
            'obs': obs[j]['observation'],
            'obs_task_params': obs[j]['desired_goal']
        }
        a = acs[j]
        r = 0.  # the demos don't come with rewards
        terminal = 0  # the gym robotics environments never emit terminal = 1
        next_o = {
            'obs': obs[j + 1]['observation'],
            'obs_task_params': obs[j + 1]['desired_goal']
        }
        env_info = env_infos[j]
        buffer.add_sample(o,
                          a,
                          r,
                          terminal,
                          next_o,
                          agent_info={},
                          env_info=env_info)
    buffer.terminate_episode()

# Save the populated buffer; rlkit_buffer_save_dir comes from the
# surrounding script's configuration.
file_name = os.path.join(rlkit_buffer_save_dir, 'extra_data.pkl')
joblib.dump({'replay_buffer': buffer}, file_name, compress=3)
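
A quick sketch of reloading the dump later, mirroring the save above:

import joblib

data = joblib.load(file_name)
buffer = data['replay_buffer']
print(buffer.num_steps_can_sample())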