Example #1
def reset(self, **kwargs):
    # reset the wrapped environment, then map every leaf of the nested
    # observation into [-1, 1] and cast it to float32
    observation = ProxyEnv.reset(self, **kwargs)
    observation = nested_apply(
        normalize, observation, self.original_observation_space)
    observation = nested_apply(
        lambda x: x.astype(np.float32), observation)
    return observation
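
Example #1 and the examples below lean on two helpers that are not shown on this page. The sketch below is an assumption inferred from the call sites, not the library's actual implementation: nested_apply maps a function over parallel nested dicts and lists of values, and normalize rescales a sample of a bounded Box space into [-1, 1].

import numpy as np
from gym.spaces import Box


def nested_apply(function, *structures):
    # assumed helper: recursively walk parallel nested dicts and lists and
    # apply function to the corresponding leaves of every structure at once
    if isinstance(structures[0], dict):
        return {key: nested_apply(function, *(s[key] for s in structures))
                for key in structures[0]}
    if isinstance(structures[0], (list, tuple)):
        return [nested_apply(function, *items) for items in zip(*structures)]
    return function(*structures)


def normalize(value, space):
    # assumed helper: linearly map a sample of a bounded Box into [-1, 1]
    if isinstance(space, Box) and np.all(np.isfinite(space.low)) \
            and np.all(np.isfinite(space.high)):
        return 2.0 * (value - space.low) / (space.high - space.low) - 1.0
    return value
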
Example #2
def step(self, action):
    # map the normalized action back into the original action space,
    # step the wrapped environment, then normalize the new observation
    if not isinstance(self.original_action_space, Discrete):
        action = denormalize(
            action, self.original_action_space)
    observation, reward, done, info = ProxyEnv.step(self, action)
    observation = nested_apply(
        normalize, observation, self.original_observation_space)
    observation = nested_apply(
        lambda x: x.astype(np.float32), observation)
    return observation, reward, done, info
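
Example #2 additionally calls denormalize on the incoming action. It is assumed to be the inverse of normalize, mapping a value from [-1, 1] back into the bounds of the original Box before the wrapped environment sees it; a minimal sketch under that assumption:

import numpy as np
from gym.spaces import Box


def denormalize(value, space):
    # assumed helper: map a value in [-1, 1] back into the Box bounds
    if isinstance(space, Box) and np.all(np.isfinite(space.low)) \
            and np.all(np.isfinite(space.high)):
        return space.low + 0.5 * (value + 1.0) * (space.high - space.low)
    return value
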
Example #3
def reset(self, **kwargs):
    # cast every leaf of the observation to float32 and wrap the result
    # in a dict so downstream code always sees a dict observation
    observation = nested_apply(lambda x: np.array(x, dtype=np.float32),
                               self.wrapped_env.reset(**kwargs))
    if not isinstance(observation, dict):
        observation = {"observation": observation}
    return observation
Example #4
def step(self, action):
    # step the wrapped environment, wrap the observation in a dict for
    # consistency, and cast every leaf to float32
    observation, reward, done, info = self.wrapped_env.step(action)
    if not isinstance(observation, dict):
        observation = {"observation": observation}
    observation = nested_apply(lambda x: np.array(x, dtype=np.float32),
                               observation)
    # apply an affine transformation to the reward: shift + scale * reward
    reward = self.reward_shift + self.reward_scale * np.array(
        reward, dtype=np.float32)
    return observation, reward, done, info
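
Examples #3 and #4 belong to the same wrapper: every observation is wrapped in a dict, its leaves are cast to float32, and the reward is transformed affinely with reward_scale and reward_shift. The stand-alone snippet below only illustrates that transformation; the scale and shift values are made up for the illustration.

import numpy as np

# stand-alone illustration of what the step() above does to a raw observation
# and reward; the reward_scale / reward_shift values here are made up
reward_scale, reward_shift = 0.1, 0.0
raw_observation, raw_reward = np.ones(3), -2.0

observation = {"observation": np.array(raw_observation, dtype=np.float32)}
reward = reward_shift + reward_scale * np.array(raw_reward, dtype=np.float32)
print(observation["observation"].dtype, reward)   # float32 -0.2
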
Example #5
    def insert_path(self, observations, actions, rewards):
        # insert a path into the replay buffer
        self.total_paths += 1
        observations = observations[:self.max_num_steps]
        actions = actions[:self.max_num_steps]
        rewards = rewards[:self.max_num_steps]

        # lazily inflate (pre-allocate) the buffer storage on the first insert
        if any([
                self.observations is None, self.actions is None,
                self.rewards is None, self.terminals is None
        ]):
            self.observations = nested_apply(self.inflate_backend,
                                             observations[0])
            self.actions = nested_apply(self.inflate_backend, actions[0])
            self.rewards = self.inflate_backend(np.squeeze(rewards[0]))
            self.terminals = self.inflate_backend(np.array([0, 0]))

        # insert all samples into the buffer
        for time_step, (o, a,
                        r) in enumerate(zip(observations, actions, rewards)):
            nested_apply(self.insert_backend, self.observations, o)
            nested_apply(self.insert_backend, self.actions, a)
            self.insert_backend(self.rewards, np.squeeze(r))
            self.insert_backend(self.terminals,
                                np.array([time_step, self.total_paths]))

            # increment the head and size
            self.head = (self.head + 1) % self.max_num_steps
            self.size = min(self.size + 1, self.max_num_steps)
            self.total_steps += 1
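
insert_path above stores each path step by step into a ring buffer, and terminals holds [time_step, path_index] pairs so that later sampling can tell which steps belong to the same path. The two storage helpers it calls are not shown on this page; the class below is a sketch of what they are assumed to do, inferred only from the call sites.

import numpy as np


class RingBufferStorageSketch(object):
    # assumed behavior of the inflate/insert helpers used by insert_path

    def __init__(self, max_num_steps):
        self.max_num_steps = max_num_steps
        self.head = 0
        self.size = 0
        self.total_steps = 0
        self.total_paths = 0
        self.observations = None
        self.actions = None
        self.rewards = None
        self.terminals = None

    def inflate_backend(self, x):
        # pre-allocate storage shaped [max_num_steps, *sample_shape]
        x = np.asarray(x, dtype=np.float32)
        return np.zeros([self.max_num_steps, *x.shape], dtype=np.float32)

    def insert_backend(self, storage, sample):
        # overwrite the slot at the current ring-buffer head in place
        storage[self.head, ...] = sample
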
Example #6
    def __init__(self, *args, **kwargs):
        # build normalized counterparts of the observation and action spaces
        # so that samples can be rescaled into [-1, 1]
        ProxyEnv.__init__(self, *args, **kwargs)
        self.original_observation_space = self.observation_space.spaces
        self.original_action_space = self.action_space
        self.observation_space = Dict(
            nested_apply(
                create_space, self.original_observation_space))

        if not isinstance(self.original_action_space, Discrete):
            self.action_space = create_space(self.original_action_space)
        else:
            self.action_space = self.original_action_space
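
create_space in Example #6 is assumed to build the normalized counterpart of a space: a bounded Box is replaced by a [-1, 1] Box of the same shape, and anything else (for example a Discrete space inside the observation dict) is passed through unchanged. A minimal sketch under that assumption:

import numpy as np
from gym.spaces import Box


def create_space(space):
    # assumed helper: swap a bounded Box for a [-1, 1] Box of equal shape
    if isinstance(space, Box) and np.all(np.isfinite(space.low)) \
            and np.all(np.isfinite(space.high)):
        return Box(low=-1.0, high=1.0, shape=space.shape, dtype=np.float32)
    return space
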
Example #7
    def sample(self,
               batch_size,
               time_skip=1,
               goal_skip=1,
               hierarchy_selector=(lambda x: x)):
        # handle cases when we want to sample everything
        batch_size = batch_size if batch_size > 0 else self.size

        # sample transition for a hierarchy of policies
        idx = np.random.choice(self.size,
                               size=batch_size,
                               replace=(self.size < batch_size))

        # force the samples to occur every time_skip
        idx = idx - self.terminals[idx, 0].astype(np.int32) % time_skip
        next_idx = np.minimum(idx + time_skip, self.max_num_steps)

        def sample_observations(data):
            return data[idx, ...]

        def sample_observations_last(data):
            return data[next_idx, ...]

        # sample current batch from a nested structure
        observations = nested_apply(sample_observations, self.observations)
        observations["goal"] = hierarchy_selector(observations["goal"])
        actions = hierarchy_selector(
            nested_apply(sample_observations, self.actions))

        # sum the rewards across the horizon where valid
        rewards = 0.0
        for j in [(idx + i) % self.max_num_steps for i in range(time_skip)]:
            rewards = rewards + (self.rewards[j, ...] * np.equal(
                self.terminals[j, 1], self.terminals[idx, 1]).astype(
                    np.float32))

        # sample current batch from a nested structure
        next_observations = nested_apply(sample_observations_last,
                                         self.observations)
        next_observations["goal"] = hierarchy_selector(
            next_observations["goal"])
        terminals = np.ones([batch_size])

        # force the achieved goals to occur every goal_skip
        goal_idx = np.minimum(
            idx - self.terminals[idx, 0].astype(np.int32) % goal_skip +
            goal_skip, self.max_num_steps)
        next_goal_idx = np.minimum(
            next_idx -
            self.terminals[next_idx, 0].astype(np.int32) % goal_skip +
            goal_skip, self.max_num_steps)

        # sample observation goals achieved by the agent
        def sample_goals(data):
            return data[goal_idx, ...]

        # sample observation goals achieved by the agent
        def sample_goals_last(data):
            return data[next_goal_idx, ...]

        # attach the observations reached at the goal boundary as achieved_goal
        achieved_goals = nested_apply(sample_goals, self.observations)
        observations["achieved_goal"] = achieved_goals

        # do the same relabeling for the next observations
        achieved_next_goals = nested_apply(sample_goals_last,
                                           self.observations)
        next_observations["achieved_goal"] = achieved_next_goals

        # return the samples in a batch
        return observations, actions, rewards, next_observations, terminals