Example 1
    def insert_path(
            self,
            observations,
            actions,
            rewards
    ):
        # insert a path into the replay buffer, truncating it
        # to at most max_num_steps time steps
        self.total_paths += 1
        observations = observations[:self.max_num_steps]
        actions = actions[:self.max_num_steps]
        rewards = rewards[:self.max_num_steps]

        # inflate the replay buffer if not inflated
        if self.observations is None:
            self.observations = nested_apply(self.inflate_backend, observations[0])
            self.actions = self.inflate_backend(actions[0])
            self.rewards = self.inflate_backend(rewards[0])
            # terminals are one scalar per step, so reuse the reward
            # sample as the template when allocating their storage
            self.terminals = self.inflate_backend(rewards[0])

        # insert all samples into the buffer
        for i, (o, a, r) in enumerate(zip(observations, actions, rewards)):
            nested_apply(self.insert_backend, self.observations, o)
            self.insert_backend(self.actions, a)
            self.insert_backend(self.rewards, r)
            # store a continuation flag: 1.0 while the path continues,
            # 0.0 at the final step of the path
            self.insert_backend(
                self.terminals, 1.0 if i < len(observations) - 1 else 0.0)

            # increment the head and size
            self.head = (self.head + 1) % self.max_num_steps
            self.size = min(self.size + 1, self.max_num_steps)
            self.total_steps += 1
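
All of these examples lean on a nested_apply helper that maps a function over arbitrarily nested containers of arrays, but its definition never appears. Below is a minimal sketch inferred from the call sites above (one function plus one or more structures with matching layout); the actual implementation in the source codebase may differ.

    def nested_apply(function, *structures):
        # hypothetical sketch inferred from the call sites above
        # recurse through dicts, pairing values by key
        if isinstance(structures[0], dict):
            return {key: nested_apply(function, *(s[key] for s in structures))
                    for key in structures[0]}
        # recurse through lists and tuples element by element
        if isinstance(structures[0], (list, tuple)):
            return type(structures[0])(
                nested_apply(function, *group) for group in zip(*structures))
        # base case: apply the function to the aligned leaves
        return function(*structures)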
Example 2
    def insert_path(self, observations, actions, rewards):
        # insert a path into the replay buffer
        self.total_paths += 1
        observations = observations[:self.max_path_length]
        actions = actions[:self.max_path_length]
        rewards = rewards[:self.max_path_length]

        # inflate the replay buffer if not inflated
        if self.observations is None:
            self.observations = nested_apply(self.inflate_backend,
                                             observations[0])
            self.actions = self.inflate_backend(actions[0])
            self.rewards = self.inflate_backend(rewards[0])
            # one integer per path: the index of its final step
            self.terminals = np.zeros([self.max_num_paths], dtype=np.int32)

        # insert all samples into the buffer
        for i, (o, a, r) in enumerate(zip(observations, actions, rewards)):
            nested_apply(self.insert_backend, self.observations, o)
            self.insert_backend(self.actions, a)
            self.insert_backend(self.rewards, r)
            # record the step index; after the loop this holds the
            # index of the final step of the path
            self.terminals[self.head] = i
            self.total_steps += 1

        # increment the head and size; the head addresses a path slot,
        # so it must wrap at the path capacity, not the path length
        self.head = (self.head + 1) % self.max_num_paths
        self.size = min(self.size + 1, self.max_num_paths)
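
Unlike Example 1, this buffer is indexed by path: self.head addresses a path slot, and each step of a path occupies a row inside that slot. The inflate_backend and insert_backend helpers are not shown; the sketch below is one plausible shape for them, assuming NumPy storage of shape [max_num_paths, max_path_length, ...] and an assumed per-path step cursor self.tail that insert_path would reset to 0 and advance once per loop iteration.

    def inflate_backend(self, x):
        # hypothetical: allocate one slot per path and one row per
        # step within a path, shaped like the sample x
        x = np.asarray(x, dtype=np.float32)
        return np.zeros([self.max_num_paths, self.max_path_length,
                         *x.shape], dtype=np.float32)

    def insert_backend(self, buffer, x):
        # hypothetical: write the sample at the current path (head)
        # and the current step within that path (tail)
        buffer[self.head, self.tail] = x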
Example 3
    def reset(
            self,
            **kwargs
    ):
        # reset the wrapped environment, then map the observation
        # into the normalized space and cast it to float32
        observation = ProxyEnv.reset(self, **kwargs)
        observation = nested_apply(
            normalize, observation, self.original_observation_space)
        observation = nested_apply(
            lambda x: x.astype(np.float32), observation)
        return observation
Example 4
    def step(self, action):
        # step the wrapped environment, then convert the observation
        # to a dict of float32 arrays for consistency
        observation, reward, done, info = self.wrapped_env.step(action)
        if not isinstance(observation, dict):
            observation = {"observation": observation}
        observation = nested_apply(lambda x: np.array(x, dtype=np.float32),
                                   observation)
        # shift and scale the reward as a float32 scalar
        reward = self.reward_shift + self.reward_scale * np.array(
            reward, dtype=np.float32)
        return observation, reward, done, info
Example 5
    def step(
            self,
            action
    ):
        # map the normalized action back into the original action
        # space, unless that space is discrete
        if not isinstance(self.original_action_space, Discrete):
            action = denormalize(
                action, self.original_action_space)
        observation, reward, done, info = ProxyEnv.step(self, action)
        # map the observation into the normalized space as float32
        observation = nested_apply(
            normalize, observation, self.original_observation_space)
        observation = nested_apply(
            lambda x: x.astype(np.float32), observation)
        return observation, reward, done, info
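
Examples 3 and 5 call normalize and denormalize, which are also not shown. Here is a minimal sketch, assuming they map a bounded gym Box linearly onto and back from the interval [-1, 1] and pass every other space through unchanged; the source may differ.

    import numpy as np
    from gym.spaces import Box

    def normalize(x, space):
        # hypothetical: non-Box leaves pass through unchanged
        if not isinstance(space, Box):
            return x
        # map [low, high] linearly onto [-1, 1]
        return 2.0 * (x - space.low) / (space.high - space.low) - 1.0

    def denormalize(x, space):
        if not isinstance(space, Box):
            return x
        # map [-1, 1] linearly back onto [low, high]
        return space.low + (x + 1.0) / 2.0 * (space.high - space.low)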
Example 6
    def reset(self, **kwargs):
        # reset the wrapped environment and convert the observation
        # to a dict of float32 arrays for consistency
        observation = nested_apply(lambda x: np.array(x, dtype=np.float32),
                                   self.wrapped_env.reset(**kwargs))
        if not isinstance(observation, dict):
            observation = {"observation": observation}
        return observation
Example 7
    def sample(
            self,
            batch_size
    ):
        # choose step indices uniformly at random, sampling with
        # replacement only when the buffer holds fewer than batch_size
        idx = np.random.choice(self.size,
                               size=batch_size,
                               replace=(self.size < batch_size))
        next_idx = (idx + 1) % self.max_num_steps

        def sample(data):
            return data[idx, ...]

        def sample_next(data):
            return data[next_idx, ...]

        # gather the sampled steps from the (possibly nested) storage
        observations = nested_apply(sample, self.selector(self.observations))
        actions = sample(self.actions)
        rewards = sample(self.rewards)
        next_observations = nested_apply(sample_next, self.selector(self.observations))
        terminals = sample(self.terminals)

        # return the samples in a batch
        return observations, actions, rewards, next_observations, terminals
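
The successor index wraps around the capacity of the ring buffer, so the step that follows the last physical slot is read from slot 0. A small worked example with illustrative numbers:

    import numpy as np

    # illustrative values; max_num_steps here is not from the source
    max_num_steps = 5
    idx = np.array([2, 4])
    next_idx = (idx + 1) % max_num_steps
    print(next_idx)  # [3 0] -- the index past slot 4 wraps to slot 0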
Example 8
    def __init__(
        self, 
        *args,
        **kwargs
    ):
        ProxyEnv.__init__(self, *args, **kwargs)
        # remember the original spaces, then rebuild them so that
        # every Box is normalized to the interval [-1, 1]
        self.original_observation_space = self.observation_space.spaces
        self.original_action_space = self.action_space
        self.observation_space = Dict(
            nested_apply(
                create_space, self.original_observation_space))

        if not isinstance(self.original_action_space, Discrete):
            self.action_space = create_space(self.original_action_space)
        else:
            self.action_space = self.original_action_space
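
The create_space helper is not shown either. Here is a minimal sketch, assuming it pairs with the normalize and denormalize helpers sketched above by replacing each bounded Box with a unit-range Box of the same shape and leaving other spaces untouched.

    import numpy as np
    from gym.spaces import Box

    def create_space(space):
        # hypothetical: non-Box spaces (e.g. Discrete) are kept as-is
        if not isinstance(space, Box):
            return space
        # same shape, but bounded to [-1, 1] to match normalize
        return Box(low=-1.0, high=1.0, shape=space.shape, dtype=np.float32)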
Example 9
    def sample(self, batch_size):
        # choose path indices uniformly at random, sampling with
        # replacement only when the buffer holds fewer than batch_size
        idx = np.random.choice(self.size,
                               size=batch_size,
                               replace=(self.size < batch_size))

        def sample(data):
            return data[idx, ...]

        # gather whole sampled paths from the (possibly nested) storage
        observations = nested_apply(sample, self.selector(self.observations))
        actions = sample(self.actions)
        rewards = sample(self.rewards)

        # build a per-path validity mask: 1.0 for every step up to and
        # including the final step index stored in terminals
        terminals = (np.arange(self.max_path_length)[None, :] <=
                     sample(self.terminals)[:, None]).astype(np.float32)

        # return the samples in a batch
        return observations, actions, rewards, terminals
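
The broadcast comparison in the last lines turns each stored final-step index into a binary validity mask over the path length. A small worked example with illustrative numbers:

    import numpy as np

    # illustrative values; the path length and indices are not from the source
    max_path_length = 5
    final_steps = np.array([1, 3])  # final step index of two sampled paths
    mask = (np.arange(max_path_length)[None, :] <=
            final_steps[:, None]).astype(np.float32)
    print(mask)
    # [[1. 1. 0. 0. 0.]
    #  [1. 1. 1. 1. 0.]]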