def insert_path(self, observations, actions, rewards):
    # insert a path into the replay buffer one step at a time
    self.total_paths += 1
    observations = observations[:self.max_num_steps]
    actions = actions[:self.max_num_steps]
    rewards = rewards[:self.max_num_steps]

    # inflate the replay buffer if not inflated
    if self.observations is None:
        self.observations = nested_apply(self.inflate_backend, observations[0])
        self.actions = self.inflate_backend(actions[0])
        self.rewards = self.inflate_backend(rewards[0])
        self.terminals = self.inflate_backend(rewards[0])

    # insert all samples into the buffer
    for i, (o, a, r) in enumerate(zip(observations, actions, rewards)):
        nested_apply(self.insert_backend, self.observations, o)
        self.insert_backend(self.actions, a)
        self.insert_backend(self.rewards, r)

        # terminals acts as a continuation mask: 1.0 while the path
        # continues and 0.0 on the final step of the path
        self.insert_backend(
            self.terminals, 1.0 if i < len(observations) - 1 else 0.0)

        # increment the head and size
        self.head = (self.head + 1) % self.max_num_steps
        self.size = min(self.size + 1, self.max_num_steps)
        self.total_steps += 1
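# inflate_backend and insert_backend are referenced above but not shown in
# this section. A minimal sketch of how such backends might look, assuming
# the buffer preallocates one numpy array per field and writes at self.head
# (the class name and bodies below are assumptions for illustration, not the
# project's actual implementation):

import numpy as np


class StepBufferBackendSketch:

    def inflate_backend(self, x):
        # allocate a ring buffer shaped [max_num_steps, *sample_shape]
        x = np.asarray(x, dtype=np.float32)
        return np.zeros([self.max_num_steps, *x.shape], dtype=np.float32)

    def insert_backend(self, destination, sample):
        # write a single sample into the slot pointed to by the head
        destination[self.head, ...] = sample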
def insert_path(self, observations, actions, rewards):
    # insert a path into the replay buffer as a single slot
    self.total_paths += 1
    observations = observations[:self.max_path_length]
    actions = actions[:self.max_path_length]
    rewards = rewards[:self.max_path_length]

    # inflate the replay buffer if not inflated
    if self.observations is None:
        self.observations = nested_apply(self.inflate_backend, observations[0])
        self.actions = self.inflate_backend(actions[0])
        self.rewards = self.inflate_backend(rewards[0])
        self.terminals = np.zeros([self.max_num_paths], dtype=np.int32)

    # insert all samples into the buffer
    for i, (o, a, r) in enumerate(zip(observations, actions, rewards)):
        nested_apply(self.insert_backend, self.observations, o)
        self.insert_backend(self.actions, a)
        self.insert_backend(self.rewards, r)

        # terminals stores the index of the last valid step in this path
        self.terminals[self.head] = i
        self.total_steps += 1

    # increment the head and size, which index whole paths
    self.head = (self.head + 1) % self.max_num_paths
    self.size = min(self.size + 1, self.max_num_paths)
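# A brief usage sketch for insert_path, assuming a gym-style environment and
# an already constructed path replay buffer (env, buffer, and the random
# policy below are placeholders, not part of this section):

observations, actions, rewards = [env.reset()], [], []
done = False
while not done:
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    observations.append(observation)
    actions.append(action)
    rewards.append(reward)

# drop the final observation so all three lists have equal length
buffer.insert_path(observations[:-1], actions, rewards)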
def reset(self, **kwargs):
    # reset the environment and normalize the resulting observation
    observation = ProxyEnv.reset(self, **kwargs)
    observation = nested_apply(
        normalize, observation, self.original_observation_space)
    observation = nested_apply(
        lambda x: x.astype(np.float32), observation)
    return observation
def step(self, action):
    # step the environment and convert the observation to a dict for consistency
    observation, reward, done, info = self.wrapped_env.step(action)
    if not isinstance(observation, dict):
        observation = {"observation": observation}
    observation = nested_apply(
        lambda x: np.array(x, dtype=np.float32), observation)

    # shift and scale the reward
    reward = self.reward_shift + self.reward_scale * np.array(
        reward, dtype=np.float32)
    return observation, reward, done, info
def step(self, action):
    # map the action from [-1, 1] back into the original action space
    if not isinstance(self.original_action_space, Discrete):
        action = denormalize(action, self.original_action_space)
    observation, reward, done, info = ProxyEnv.step(self, action)

    # normalize the resulting observation into [-1, 1]
    observation = nested_apply(
        normalize, observation, self.original_observation_space)
    observation = nested_apply(
        lambda x: x.astype(np.float32), observation)
    return observation, reward, done, info
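# normalize and denormalize are not defined in this section. A plausible
# sketch for Box spaces, mapping between the original bounds and [-1, 1]
# (these bodies are assumptions about the helpers, not their actual
# implementation, and ignore unbounded spaces):

import numpy as np
from gym.spaces import Box


def normalize(x, space):
    # map a value from the original space bounds into [-1, 1]
    if not isinstance(space, Box):
        return x
    return 2.0 * (x - space.low) / (space.high - space.low) - 1.0


def denormalize(x, space):
    # map a value from [-1, 1] back into the original space bounds
    if not isinstance(space, Box):
        return x
    return space.low + (x + 1.0) * 0.5 * (space.high - space.low)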
def reset(self, **kwargs):
    # reset the environment and convert the observation to a dict for consistency
    observation = nested_apply(
        lambda x: np.array(x, dtype=np.float32),
        self.wrapped_env.reset(**kwargs))
    if not isinstance(observation, dict):
        observation = {"observation": observation}
    return observation
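# A short usage sketch of the dict/float32 wrapper above, assuming it is
# constructed around a standard gym environment (the wrapper class name
# DictFloatEnv and the base environment are placeholders, not part of
# this section):

import gym

env = DictFloatEnv(gym.make("Pendulum-v1"))
observation = env.reset()
assert isinstance(observation, dict) and "observation" in observation
observation, reward, done, info = env.step(env.action_space.sample())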
def sample(self, batch_size):
    # determine which steps to sample from
    idx = np.random.choice(
        self.size, size=batch_size, replace=(self.size < batch_size))
    next_idx = (idx + 1) % self.max_num_steps

    def sample(data):
        return data[idx, ...]

    def sample_next(data):
        return data[next_idx, ...]

    # sample the current batch from a nested structure of arrays
    observations = nested_apply(sample, self.selector(self.observations))
    actions = sample(self.actions)
    rewards = sample(self.rewards)
    next_observations = nested_apply(
        sample_next, self.selector(self.observations))
    terminals = sample(self.terminals)

    # return the samples in a batch
    return observations, actions, rewards, next_observations, terminals
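# A brief usage sketch of the step buffer's sample method, assuming the
# terminals mask multiplies the bootstrapped value in a one-step TD target
# (buffer, target_q, and the discount of 0.99 are placeholders):

observations, actions, rewards, next_observations, terminals = buffer.sample(256)

# no next-state value is added on the final step of a path, where terminals is 0.0
targets = rewards + 0.99 * terminals * target_q(next_observations)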
def __init__(self, *args, **kwargs):
    # normalize the action and observation spaces to the range [-1, 1]
    ProxyEnv.__init__(self, *args, **kwargs)
    self.original_observation_space = self.observation_space.spaces
    self.original_action_space = self.action_space

    # build normalized spaces; discrete action spaces are left unchanged
    self.observation_space = Dict(
        nested_apply(create_space, self.original_observation_space))
    if not isinstance(self.original_action_space, Discrete):
        self.action_space = create_space(self.original_action_space)
    else:
        self.action_space = self.original_action_space
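# create_space is referenced above but not defined in this section. A minimal
# sketch, assuming it replaces each Box with a [-1, 1] Box of the same shape
# and passes other spaces through unchanged (an assumption for illustration,
# not the project's actual helper):

import numpy as np
from gym.spaces import Box


def create_space(space):
    # build a normalized [-1, 1] Box matching the original space's shape
    if isinstance(space, Box):
        return Box(low=-1.0, high=1.0, shape=space.shape, dtype=np.float32)
    return space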
def sample(self, batch_size):
    # determine which paths to sample from
    idx = np.random.choice(
        self.size, size=batch_size, replace=(self.size < batch_size))

    def sample(data):
        return data[idx, ...]

    # sample the current batch from a nested structure of arrays
    observations = nested_apply(sample, self.selector(self.observations))
    actions = sample(self.actions)
    rewards = sample(self.rewards)

    # expand the stored last-step indices into a [batch, max_path_length]
    # mask that is 1.0 on valid steps and 0.0 on padding
    terminals = (np.arange(self.max_path_length)[None, :] <= sample(
        self.terminals)[:, None]).astype(np.float32)

    # return the samples in a batch
    return observations, actions, rewards, terminals
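# A short usage sketch of the path buffer's sample method, assuming rewards
# are stored as [num_paths, max_path_length] arrays so the terminals mask can
# zero out padded steps (buffer and batch size are placeholders):

observations, actions, rewards, terminals = buffer.sample(32)

# sum rewards over valid steps only; padded steps are masked to zero
path_returns = (rewards * terminals).sum(axis=1)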