class PriorityReplayMemory(ReplayMemory):
    def __init__(self, size, alpha):
        super().__init__(size)
        self.alpha = alpha

        tree_cap = 1
        while tree_cap < size:
            tree_cap *= 2

        self.sum_tree = SumSegmentTree(tree_cap)
        self.max_priority = 1.0

    def add_mem(self, *args, **kwargs):
        idx = self.next_idx
        super().add_mem(*args, **kwargs)
        prio = self.max_priority**self.alpha
        self.sum_tree[idx] = prio

    def _sample_idxs(self, size):
        idxs = []
        for _ in range(size):
            mass = random.random() * self.sum_tree.sum(0, len(self.memory) - 1)
            idx = self.sum_tree.find_prefixsum_idx(mass)
            idxs.append(idx)
        return idxs

    def get_batch(self, size, beta):
        assert beta > 0
        idxs = self._sample_idxs(size)

        # create weights:
        weights = []
        tot_sum = self.sum_tree.sum()
        #p_min = tot_min / tot_sum
        #max_w = (p_min * len(self.memory)) ** (-beta)
        for idx in idxs:
            p_sample = self.sum_tree[idx] / tot_sum
            weight = (p_sample * len(self.memory))**(-beta)
            weights.append(weight)
        #print(max_w)
        #print(weights)
        weights = np.array(weights)
        weights /= max(weights)

        batch = [self.memory[idx] for idx in idxs]
        return batch, weights, idxs

    def update_priorities(self, idxs, priorities):
        for idx, prio in zip(idxs, priorities):
            assert prio > 0
            prio_a = prio**self.alpha
            self.sum_tree[idx] = prio_a
            self.max_priority = max(self.max_priority, prio)
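# The class above normalizes its importance-sampling weights by the largest
# weight in the sampled batch (weights /= max(weights)); the commented-out
# p_min / max_w lines hint at the alternative used by most of the variants
# below, which keep a second, min-aggregating tree so the weights can be
# normalized by the largest weight over the whole buffer. A minimal standalone
# sketch of that normalization, assuming tree objects with the sum()/min()/
# __getitem__ interface used throughout this section (the min tree is an
# assumption here; PriorityReplayMemory itself does not keep one):
import numpy as np

def is_weights_global_max(idxs, sum_tree, min_tree, buffer_len, beta):
    """Importance-sampling weights normalized by the buffer-wide maximum."""
    tot_sum = sum_tree.sum()
    p_min = min_tree.min() / tot_sum            # smallest sampling probability
    max_w = (p_min * buffer_len) ** (-beta)     # largest possible IS weight
    weights = []
    for idx in idxs:
        p_sample = sum_tree[idx] / tot_sum
        weights.append(((p_sample * buffer_len) ** (-beta)) / max_w)
    return np.array(weights)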
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, memory_size=1000000, alpha=0.5, seed=None):
        '''
        Prioritized replay buffer from https://arxiv.org/pdf/1511.05952.pdf

        This implementation is based on the OpenAI sum-tree implementation,
        which can be found here:
        https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py

        memory_size: int
            maximum number of experiences to store
        alpha: float, [0.0, 1.0]
            hyperparameter that controls the amount of prioritization,
            with 0.0 being no prioritization (the uniform case)
        seed: None or int
            random seed for the replay buffer
        '''
        super().__init__(memory_size=memory_size, seed=seed)
        self.alpha = alpha

        it_capacity = 1
        while it_capacity < self._memory_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, experience):
        '''
        Add an experience to the replay buffer

        experience: object, usually a tuple
            the experience to store in the replay buffer; this implementation
            does not specify a form for the experience, as all of that is
            handled by the DQN agent
        '''
        index = self._next_index
        super().add(experience)
        self._it_sum[index] = self._max_priority**self.alpha
        self._it_min[index] = self._max_priority**self.alpha

    def _sample_proportional(self, batch_size):
        '''
        Helper function to sample from the replay buffer with proportional
        prioritization. All code here is from the OpenAI implementation to
        correctly make use of the sum-tree data structure.

        batch_size: int
            the number of experiences to sample
        res: list
            list of indices of the experiences sampled from the replay buffer
        '''
        res = []
        p_total = self._it_sum.sum(0, len(self._memory) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            index = self._it_sum.find_prefixsum_idx(mass)
            res.append(index)
        return res

    def sample(self, batch_size, beta=1.0):
        '''
        Sample from the replay buffer with proportional prioritization

        batch_size: int
            the number of experiences to sample
        samples: list of 3-tuples
            list of sampled experiences, importance sampling weights for each
            experience, and the indices of the experiences (used to update
            priorities), in the form (experience, is_weights, indices)
        '''
        indices = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._memory))**(-beta)

        samples = []
        for i in indices:
            p_sample = self._it_sum[i] / self._it_sum.sum()
            is_weight = ((p_sample * len(self._memory))**(-beta)) / max_weight
            experience = self._memory[i]
            sample = (experience, is_weight, i)
            samples.append(sample)
        return samples

    def update_priorities(self, indices, priorities):
        '''
        Update the priorities for the experiences corresponding to the
        given indices

        indices: list-like
            list of indices for the experiences/priorities to update
        priorities: list-like
            list of new priorities corresponding to the given indices
        '''
        for i, priority in zip(indices, priorities):
            self._it_sum[i] = priority**self.alpha
            self._it_min[i] = priority**self.alpha
            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of indexes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled indexes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """ Prioritized Experience Replay """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        # Round the capacity up to a power of two, as required by the
        # segment trees used for proportional sampling and for the max
        # importance-sampling weight.
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        idx = self._idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._buffer) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._buffer))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._buffer))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """ set priority of transition at index idxes[i] in buffer to priorities[i] """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._buffer)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)
class ProportionalReplay(ExperienceReplay):
    def __init__(self, size, alpha):
        super(ProportionalReplay, self).__init__(size)
        assert alpha >= 0
        self.alpha = alpha

        self.tree_size = 1
        while self.tree_size < self.maxsize:
            self.tree_size *= 2

        self.min_tree = MinSegmentTree(self.tree_size)  # for calculating maximum IS weight
        self.sum_tree = SumSegmentTree(self.tree_size)  # for proportional sampling
        self.max_priority = 1.0  # maximum priority we've seen so far. will be updated

    def add(self, experience):
        idx = self.next_idx  # save idx before it's changed in super call
        super().add(experience)  # put experience data (s,a,r,s',done) in buffer

        # give new experience max priority to ensure it's replayed at least once
        self.min_tree[idx] = self.max_priority**self.alpha
        self.sum_tree[idx] = self.max_priority**self.alpha

    # To sample a minibatch of size k, the range [0, p_total] is divided equally into k ranges.
    # Next, a value is uniformly sampled from each range.
    def sample_proportional(self, batch_size):
        idxs = []
        p_total = self.sum_tree.sum(0, len(self.buffer) - 1)  # sum of the priorities of all experience in the buffer
        every_range_len = p_total / batch_size  # length of every range over [0, p_total] (batch_size = k)
        for i in range(batch_size):  # for each range
            mass = self.np_random.uniform() * every_range_len + i * every_range_len  # uniformly sample a probability mass from this range
            idx = self.sum_tree.find_prefixsum_idx(mass)  # get smallest experience index s.t. cumulative dist F(idx) >= mass
            idxs.append(idx)
        return idxs

    # sample batch of experiences along with their weights and indices
    def sample(self, batch_size, beta):
        assert beta > 0
        idxs = self.sample_proportional(batch_size)  # sampled experience indices

        weights = []
        p_min = self.min_tree.min() / self.sum_tree.sum()  # minimum possible priority for a transition
        max_weight = (p_min * len(self.buffer))**(-beta)  # (p_uniform/p_min)^beta is maximum possible IS weight

        # get IS weights for sampled experience
        for idx in idxs:
            p_sample = self.sum_tree[idx] / self.sum_tree.sum()  # normalize sampled priority
            weight = (p_sample * len(self.buffer))**(-beta)  # (p_uniform/p_sample)^beta. IS weight
            weights.append(weight / max_weight)  # weights normalized by max so that they only scale the update downwards
        weights = np.array(weights)

        encoded_sample = self.encode_samples(idxs)  # collect experience at given indices
        return tuple(list(encoded_sample) + [weights, idxs])

    # set the priorities of experiences at given indices
    def update_priorities(self, idxs, priorities):
        assert len(idxs) == len(priorities)
        for idx, priority in zip(idxs, priorities):
            assert priority > 0
            assert 0 <= idx < len(self.buffer)
            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha
            self.max_priority = max(self.max_priority, priority)
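# Every class in this section leans on SumSegmentTree / MinSegmentTree for
# sum(), min(), and find_prefixsum_idx(). As a rough illustration of what
# those calls are expected to return (not the actual tree code, which works
# in O(log n) rather than the O(n) scans below, and whose end-index
# convention varies between implementations; here the range is treated as
# inclusive, matching the comments in ProportionalReplay above), a naive
# stand-in with the same interface might look like this:
import numpy as np

class NaiveSumTree:
    """Illustrative O(n) stand-in for SumSegmentTree (assumed interface).

    A min-aggregating variant would track np.min over the same value array.
    """

    def __init__(self, capacity):
        self.values = np.zeros(capacity)

    def __setitem__(self, idx, val):
        self.values[idx] = val

    def __getitem__(self, idx):
        return self.values[idx]

    def sum(self, start=0, end=None):
        # range sum over [start, end], matching calls like sum(0, len - 1)
        end = len(self.values) - 1 if end is None else end
        return self.values[start:end + 1].sum()

    def find_prefixsum_idx(self, prefixsum):
        # smallest idx such that the cumulative sum up to and including idx
        # reaches prefixsum ("smallest index s.t. F(idx) >= mass" above)
        running = 0.0
        for idx, value in enumerate(self.values):
            running += value
            if running >= prefixsum:
                return idx
        return len(self.values) - 1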
class PrioritizedReplayBuffer(ReplayBuffer):
    """Fixed-size prioritized buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6,
                 beta=0.5, device="cpu"):
        """Initialize a PrioritizedReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            alpha (float): how much prioritization is used
                (0 - no prioritization, 1 - full prioritization)
            beta (float): to what degree to use importance weights
                (0 - no corrections, 1 - full correction)
        """
        super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size,
                                                      batch_size, seed, device=device)
        self.alpha = alpha
        self.beta = beta
        self._eps = 0.00000001

        it_capacity = 1
        while it_capacity < buffer_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        idx = self._next_idx
        super().add(state, action, reward, next_state, done)
        self._it_sum[idx] = self._max_priority ** self.alpha
        self._it_min[idx] = self._max_priority ** self.alpha

    def _sample_proportional(self):
        res = []
        p_total = self._it_sum.sum(0, len(self.memory) - 1)
        every_range_len = p_total / self.batch_size
        for i in range(self.batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self):
        idxes = self._sample_proportional()

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self.memory) + self._eps) ** (-self.beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self.memory) + self._eps) ** (-self.beta)
            weights.append(weight / max_weight)
        weights = torch.tensor(weights, device=self.device, dtype=torch.float)

        states = torch.from_numpy(np.vstack([self.memory[i].state for i in idxes])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([self.memory[i].action for i in idxes])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([self.memory[i].reward for i in idxes])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([self.memory[i].next_state for i in idxes])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([self.memory[i].done for i in idxes]).astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones, idxes, weights)

    def update_priorities(self, indexes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index indexes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        indexes: [int]
            List of indexes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled indexes.
        """
        for idx, priority in zip(indexes, priorities):
            self._it_sum[idx] = priority ** self.alpha
            self._it_min[idx] = priority ** self.alpha
            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, state_shape, alpha, n_batch_trajectories,
                 n_trajectory_steps, n_emus=1):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        dsize: int
            Max number of demonstration transitions. These are retained in the
            buffer permanently. https://arxiv.org/abs/1704.03732
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size, state_shape,
                                                      n_batch_trajectories,
                                                      n_trajectory_steps,
                                                      n_emus=n_emus)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < self._size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.add_effect"""
        idx = super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * self._it_sum.sum(0, self._size - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return np.array(res)

    def _compute_weights(self, idxes, beta):
        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * self._size)**(-beta)

        for idx in idxes:
            if idx < 0:
                weights.append(0.0)
                continue
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * self._size)**(-beta)
            weights.append(weight / max_weight)
        return np.array(weights)

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample it also returns importance weights and
        idxes of sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)
        batch_samples = self._retrieve_samples(idxes)
        weights = self._compute_weights(idxes, beta)
        return tuple(list(batch_samples) + [weights, idxes])

    def sample_nstep(self, beta):
        """Sample a (self.n_batch_trajectories, self.n_trajectory_steps, n_s)
        batch of states, where n_s is the dimension of the state vector.

        Compared to ReplayBuffer.sample it also returns importance weights and
        idxes of sampled experiences.

        Parameters
        ----------
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        n_step_rewards_batch: np.array
            n-step rewards vector batch
        tpn_obs_batch: np.array
            tpn set of observations
        n_tpn_step_batch: np.array
            n in n-step indicator to indicate if the sampled trajectory is
            unfinished or done -- a trajectory is unfinished if there are no
            more transitions to cover all n steps
        n_step_done_mask: np.array
            n_step_done_mask[i] = 1 if the sampled trajectory reaches the end
            of an episode, and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        traj_idxes = self._sample_proportional(self.n_batch_trajectories)
        batched_trajectories, idxes = self._retrieve_n_step_trajectories(traj_idxes)
        #for i in idxes:
        #    if i == -1: continue
        #    assert self.traj_ids[i] > -1
        weights = self._compute_weights(idxes, beta)
        return batched_trajectories + (weights, idxes)

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of indexes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled indexes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            if idx < 0:
                continue
            assert priority > 0
            assert 0 <= idx < self._size
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)
class PriorityBuffer(BufferBase):
    def __init__(self, capacity, gamma=0.99, n_steps=2, alpha=0.5):
        super(PriorityBuffer, self).__init__(capacity, gamma, n_steps)
        self.buffer = []
        self.position = 0
        self.alpha = alpha

        it_cap = 1
        while it_cap < capacity:
            it_cap *= 2

        self._it_sum = SumSegmentTree(it_cap)
        self._it_min = MinSegmentTree(it_cap)
        self._max_priority = 1.0

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        position = self.position
        if len(self.buffer) < self.capacity:
            self.buffer.append(experience)
        else:
            self.buffer[position] = experience
        self.position = (position + 1) % self.capacity
        self._it_sum[position] = self._max_priority**self.alpha
        self._it_min[position] = self._max_priority**self.alpha

    def _sample_proportional(self, batch_size):
        total = self._it_sum.sum(0, len(self.buffer) - (1 + self.n_steps))
        mass = np.random.random(size=batch_size) * total
        idx = self._it_sum.find_prefix_sum_idx(mass)
        return idx

    def sample(self, batch_size, beta=0.4):
        assert beta > 0
        indices = self._sample_proportional(batch_size)

        states = []
        actions = []
        rewards = []
        dones = []
        next_states = []
        for index in indices:
            current_buffer = self.buffer[index]
            current_state = current_buffer[0]
            current_action = current_buffer[1]
            current_done = current_buffer[3]
            reward = current_buffer[2]
            next_state = self.buffer[index + self.n_steps][0]
            for sub_index in range(1, self.n_steps):
                reward += self.buffer[index + sub_index][2] * (self.gamma**sub_index)
                if self.buffer[index + sub_index][3]:
                    break
            states.append(ar(current_state, dtype=np.float32))
            actions.append(current_action)
            rewards.append(reward)
            dones.append(current_done)
            next_states.append(ar(next_state, dtype=np.float32))

        sm = self._it_sum.sum()
        p_min = self._it_min.min() / sm
        max_weight = (p_min * len(self.buffer))**(-beta)
        p_sample = self._it_sum[indices] / sm
        weights = (p_sample * len(self.buffer))**(-beta) / max_weight

        states_np = np.stack(states, 0) / 255.0
        next_states_np = np.stack(next_states, 0) / 255.0
        return states_np, ar(actions), ar(rewards, dtype=np.float32), \
            ar(dones, dtype=np.uint8), next_states_np, \
            ar(weights, dtype=np.float32), indices

    def update_weights(self, batch_indices, batch_priorities):
        assert len(batch_indices) == len(batch_priorities)
        assert np.min(batch_priorities) > 0
        assert np.min(batch_indices) >= 0
        for idx, prio in zip(batch_indices, batch_priorities):
            idx = int(idx)
            self._it_sum[idx] = prio**self.alpha
            self._it_min[idx] = prio**self.alpha
        self._max_priority = max(self._max_priority, np.max(batch_priorities))
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6):
        super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size,
                                                      batch_size, seed)

        # capacity must be positive and a power of 2
        tree_capacity = 1
        while tree_capacity < self.buffer_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

    def add(self, state, action, reward, next_state, done):
        self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha
        self.min_tree[self.tree_ptr] = self.max_priority**self.alpha
        super().add(state, action, reward, next_state, done)
        self.tree_ptr = (self.tree_ptr + 1) % self.buffer_size

        # if self.tree_ptr == self.buffer_size-1:
        #     for i in range(0, self.buffer_size-1):
        #         self.sum_tree[i] = self.sum_tree[i+1]
        #         self.min_tree[i] = self.min_tree[i+1]
        #     self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha
        #     self.min_tree[self.tree_ptr] = self.max_priority**self.alpha
        # else:
        #

    def sample(self, beta=0.4):
        indices = self._sample_proportional()
        indices = [index for index in indices if index < len(self.memory)]

        states = torch.from_numpy(np.vstack([self.memory[index].state for index in indices])).float().to(device)
        actions = torch.from_numpy(np.vstack([self.memory[index].action for index in indices])).long().to(device)
        rewards = torch.from_numpy(np.vstack([self.memory[index].reward for index in indices])).float().to(device)
        next_states = torch.from_numpy(np.vstack([self.memory[index].next_state for index in indices])).float().to(device)
        dones = torch.from_numpy(np.vstack([self.memory[index].done for index in indices]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(np.vstack([self._cal_weight(index, beta) for index in indices])).float().to(device)

        return (states, actions, rewards, next_states, dones, weights, indices)

    def update_priority(self, indices, loss_for_prior):
        for idx, priority in zip(indices, loss_for_prior):
            self.sum_tree[idx] = priority ** self.alpha
            self.min_tree[idx] = priority ** self.alpha
            self.max_priority = max(self.max_priority, priority)

    def _sample_proportional(self):
        indices = []
        p_total = self.sum_tree.sum()  # sum(0, len(self.memory)-1)
        segment = p_total / self.batch_size
        for i in range(self.batch_size):
            start = segment * i
            end = start + segment
            upper = random.uniform(start, end)
            index = self.sum_tree.retrieve(upper)
            indices.append(index)
        return indices

    def _cal_weight(self, index, beta):
        sum_priority = self.sum_tree.sum()
        min_priority = self.min_tree.min()
        current_priority = self.sum_tree[index]
        # max_w = (len(self.memory) * (min_priority/sum_priority)) ** (-beta)
        # current_w = (len(self.memory) * (current_priority/sum_priority)) ** (-beta)
        # return current_w / max_w
        return (min_priority / current_priority) ** beta
class PERBuffer:
    # Simple class that holds the different types of memory
    class Memory:
        # Expects all shapes to be tuples, size to be an integer
        def __init__(self, state_shape, action_shape, size):
            self.states = np.zeros((size, ) + state_shape)
            self.actions = np.zeros((size, ) + action_shape)
            self.rewards = np.zeros(size)
            self.next_states = np.zeros((size, ) + state_shape)
            self.dones = np.zeros(size)
            self.size = size

        # memory[i] will return a tuple of the entire memory @ i
        def __getitem__(self, key):
            return (self.states[key], self.actions[key], self.rewards[key],
                    self.next_states[key], self.dones[key])

        # Provides a quick way of updating multiple
        # parts of memory at a specific index
        def update(self, indx, state=None, action=None, reward=None,
                   next_state=None, done=None):
            self.states[indx] = state
            self.actions[indx] = action
            self.rewards[indx] = reward
            self.next_states[indx] = next_state
            self.dones[indx] = done

        # An alternative to __getitem__, returns dict instead
        def get(self, key):
            rtn = {
                "states": self.states[key],
                "actions": self.actions[key],
                "rewards": self.rewards[key],
                "next_states": self.next_states[key],
                "dones": self.dones[key]
            }
            return rtn

    # Creates the replay buffer
    def __init__(self, state_shape, action_shape, size, alpha=0.6, beta=0.4,
                 beta_delta=0.001, epsilon=0.01):
        self.memory = self.Memory(state_shape, action_shape, size)
        self.counter = 0
        self.size = self.memory.size

        # Segment trees
        self.sum_tree = SumSegmentTree(self.size)
        self.min_tree = MinSegmentTree(self.size)

        # P.E.R. hyperparameters
        self.alpha = alpha
        self.beta = beta
        self.beta_delta = beta_delta
        self.epsilon = epsilon
        self.max_priority = 1.0

    # Samples the indexes from memory in accordance to their priority
    def sample_indexes(self, batch_size, max_memory):
        sample_indexes = np.zeros(shape=batch_size)

        # Gets the total probability of all used memory
        prob_total_norm = self.sum_tree.sum(0, max_memory - 1) / batch_size

        # Gets indexes using probability
        for i in range(batch_size):
            # ---VAL MAY NEED TO BE CHANGED---
            val = random.random() * prob_total_norm + i * prob_total_norm
            indx = self.sum_tree.find_prefixsum_idx(val)
            sample_indexes[i] = indx

        return sample_indexes

    # Stores new memory at looping index
    def store(self, state, action, reward, next_state, done):
        indx = self.counter % self.size
        self.memory.update(indx, state, action, reward, next_state, done)

        # Gets the priority alpha for the newly added sample
        priority_alpha = self.max_priority**self.alpha

        # Adds this to the sum and min trees
        self.sum_tree[indx] = priority_alpha
        self.min_tree[indx] = priority_alpha

        # Updates the counter
        self.counter += 1

    # Samples the memory from filled parts of the buffer
    # Returns a tuple (states, actions, rewards, next_states, dones, weights)
    def miniBatch(self, batch_size):
        max_memory = min(self.counter, self.size)

        # Samples the indexes according to their importance
        batch_indxs = self.sample_indexes(batch_size, max_memory)
        batch_indxs = np.int_(batch_indxs)

        # Gets the weights
        weights = np.zeros(shape=batch_size)
        prob_min = self.min_tree.min() / (self.sum_tree.sum() + self.epsilon)
        max_weight = (prob_min * max_memory)**(-self.beta)
        for i in range(0, len(batch_indxs)):
            prob = self.sum_tree[batch_indxs[i]] / \
                (self.sum_tree.sum() + self.epsilon)
            weight = (prob * max_memory)**(-self.beta)
            weight_norm = weight / (max_weight + self.epsilon)
            weights[i] = weight_norm

        # Updates beta
        self.beta = min(1.0, self.beta + self.beta_delta)

        # Returns memory and weights and idxs
        return self.memory[batch_indxs] + (weights, ) + (batch_indxs, )

    # For given indexes and priorities, updates the trees and max
    def update_priorities(self, indxs, priorities):
        for indx, priority in zip(indxs, priorities):
            priority_alpha = priority**self.alpha
            self.sum_tree[indx] = priority_alpha[0]
            self.min_tree[indx] = priority_alpha[0]

            # Sets the max priority to be newest max
            self.max_priority = max(self.max_priority, priority)
class ReplayMemory:
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        #new_data = []
        #for i in data:
        #    i.wait_to_read()
        #    new_data.append(copyto(i))
        # capture the slot being written before the pointer advances, so the
        # new experience (not the next slot) receives the max priority
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
            #print self._storage
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)
        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        #print self._it_min.min(), weights
        weights = np.array(weights)
        weights /= np.sum(weights)

        ret = []
        for i in range(batch_size):
            ret.append(self._storage[idxes[i]])
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            #print priority
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha
            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplay(Replay):
    def __init__(self, learner_config, env_config, session_config):
        """
        Create Prioritized Replay buffer.

        :param size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        :param alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)
        """
        super(PrioritizedReplay, self).__init__(learner_config=learner_config,
                                                env_config=env_config,
                                                session_config=session_config)
        self._alpha = self.replay_config.alpha
        assert self._alpha > 0

        self._memory = []
        self._next_idx = 0
        self.memory_size = self.replay_config.memory_size

        it_capacity = 1
        while it_capacity < self.memory_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def default_config(self):
        conf = super().default_config()
        conf.update({
            'memory_size': '_int_',
            'sampling_start_size': '_int_',
            'alpha': '_float_',
        })
        return conf

    def insert(self, exp_dict):
        """
        Adds experience to the replay buffer as usual, but also initializes
        the priority of the new experience.
        """
        with self.insert_time.time():
            # the priority belongs to the slot being written, so capture the
            # index before the write pointer advances
            idx = self._next_idx
            if self._next_idx >= len(self._memory):
                self._memory.append(exp_dict)
            else:
                self._memory[self._next_idx] = exp_dict
            self._next_idx = (self._next_idx + 1) % self.memory_size
            self._it_sum[idx] = self._max_priority**self._alpha
            self._it_min[idx] = self._max_priority**self._alpha

    def sample(self, batch_size, beta=0):
        """
        WARNING: This function does not make deep copies of the tuple
        experiences. This means that if any objects in the experiences are
        modified, the contents of the replay buffer memory will also be
        modified, so be careful!!!

        Sample a batch of experiences, along with their importance weights,
        and the indices of the sampled experiences in the buffer.

        :param batch_size: int
            How many transitions to sample.
        :param beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)
        :return experience_batch: List
            List of tuples, length batch_size, corresponding to the
            experiences sampled.
        :return weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting
            importance weight of each sampled transition
        :return indices: np.array
            Array of shape (batch_size,) and dtype np.int32
            indices in buffer of sampled experiences
        """
        with self.sample_time.time():
            assert beta >= 0

            # sample the experiences proportional to their priorities
            indices = self._sample_proportional(batch_size)
            response = [self._memory[idx] for idx in indices]

            # compute importance weights for the experiences to correct for
            # distribution shift
            weights = []
            p_min = self._it_min.min() / self._it_sum.sum()
            max_weight = (p_min * len(self._memory))**(-beta)
            for idx in indices:
                p_sample = self._it_sum[idx] / self._it_sum.sum()
                weight = (p_sample * len(self._memory))**(-beta)
                weights.append(weight / max_weight)
            weights = np.array(weights)
            # return response, weights, indices
            return response

    def _sample_proportional(self, batch_size):
        """
        This is a helper function to sample experiences with probabilities
        proportional to their priorities. Returns a list of indices.
        """
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._memory) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def update_priorities(self, indices, priorities):
        """
        Update priorities of sampled transitions.

        sets priority of transition at index indices[i] in buffer
        to priorities[i].

        :param indices: [int]
            List of indices of sampled transitions
        :param priorities: [float]
            List of updated priorities corresponding to transitions at the
            sampled indices denoted by variable `indices`.
        """
        assert len(indices) == len(priorities)
        for idx, priority in zip(indices, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._memory)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)

    def evict(self):
        raise NotImplementedError
        # TODO
        # if evict_size > len(self._memory):
        #     evicted = self._memory
        #     self._memory = []
        #     self._next_idx = 0
        #     return evicted
        # forward_space = len(self._memory) - self._next_idx
        # if evict_size < forward_space:
        #     evicted = self._memory[self._next_idx:self._next_idx+evict_size]
        #     del self._memory[self._next_idx:self._next_idx+evict_size]
        # else:
        #     evicted = self._memory[self._next_idx:]
        #     evict_from_left = evict_size - forward_space
        #     evicted += self._memory[:evict_from_left]
        #     del self._memory[self._next_idx:]
        #     del self._memory[:evict_from_left]
        #     self._next_idx -= evict_from_left
        # assert len(evicted) == evict_size
        # return evicted

    def start_sample_condition(self):
        return len(self) > self.replay_config.sampling_start_size

    def __len__(self):
        return len(self._memory)
class PrioritizedReplayBuffer(SimpleReplayBuffer):
    def __init__(self, MAX_LEN, alpha: float = 0.6):
        """
        Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        SimpleReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(MAX_LEN)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < MAX_LEN:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, experience: Experience):
        """See SimpleReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(experience)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size: int) -> List[int]:
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(
        self, batch_size: int, beta: float = 0.4
    ) -> Tuple[List[Experience], np.ndarray, List[int]]:  # type: ignore
        """Sample a batch of experiences.

        compared to SimpleReplayBuffer.sample it also returns importance
        weights and idxes of sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        experiences: List[Experience]
            batch of experiences
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return (encoded_sample, weights, idxes)

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of indexes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled indexes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample it also returns importance weights and
        idxes of sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of indexes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled indexes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha
            self._max_priority = max(self._max_priority, priority)
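# A self-contained numerical sanity check of the importance-sampling weight
# formula used in sample() above: w_i = (N * P(i))^(-beta), normalized by the
# largest weight, with P(i) proportional to p_i^alpha. The priorities below
# are made up purely for illustration; they stand in for |TD error| values.
import numpy as np

alpha, beta, N = 0.6, 0.4, 4
priorities = np.array([2.0, 0.5, 1.0, 3.0])
p_alpha = priorities ** alpha
probs = p_alpha / p_alpha.sum()          # P(i) = p_i^alpha / sum_k p_k^alpha

weights = (N * probs) ** (-beta)
weights /= weights.max()                 # same role as max_weight above

print(probs.round(3))    # sampling distribution over the 4 transitions
print(weights.round(3))  # the highest-probability transition gets the smallest weight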
class ReplayMemory(ReplayBuffer):
    def __init__(self, size, alpha):
        super().__init__(size, N_Step_Transition)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def put(self, transitions, priorities):
        idxes = []
        for transition in transitions:
            idx = self.next_idx
            super().put(transition)
            idxes.append(idx)
        self.update_priorities(idxes, priorities)

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self.buffer) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def _encode_sample(self, idxes):
        s_lst, a_lst, r_lst, next_state_lst, done_mask_lst = [], [], [], [], []
        n_step_transitions = N_Step_Transition(*zip(
            *[self.buffer[index] for index in idxes]))
        # 'S_t', 'A_t', 'R_ttpB', 'Gamma_ttpB', 'qS_t', 'S_tpn', 'qS_tpn', 'key'
        S_t = np.array(n_step_transitions.S_t)
        S_tpn = np.array(n_step_transitions.S_tpn)
        R_ttpB = np.array(n_step_transitions.R_ttpB)
        gamma_ttpB = np.array(n_step_transitions.Gamma_ttpB)
        qS_tpn = np.array(n_step_transitions.qS_tpn)
        A_t = np.array(n_step_transitions.A_t, dtype=np.int64)
        qS_t = np.array(n_step_transitions.qS_t)
        key = np.array(n_step_transitions.key)
        return S_t, A_t, R_ttpB, S_tpn, gamma_ttpB, qS_tpn, qS_t, key

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample it also returns importance weights and
        idxes of sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self.buffer))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self.buffer))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of indexes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled indexes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self.buffer)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)

    def remove_old_experience(self):
        if self.size() > self.maxsize:
            num_excess = self.size() - self.maxsize
            # FIFO
            del self.buffer[:num_excess]
class PrioritizedReplayMemory:
    def __init__(self, size, alpha=0.6, beta_start=0.4, beta_frames=100000):
        super(PrioritizedReplayMemory, self).__init__()
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.frame = 1

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def beta_by_frame(self, frame_idx):
        return min(
            1.0,
            self.beta_start + frame_idx * (1.0 - self.beta_start) / self.beta_frames)

    def push(self, state, action, reward, next_state, done):
        idx = self._next_idx
        exp = self.experience(state, action, reward, next_state, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(exp)
        else:
            self._storage[self._next_idx] = exp
        self._next_idx = (self._next_idx + 1) % self._maxsize

        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _encode_sample(self, idxes):
        states = torch.from_numpy(
            np.array([self._storage[i].state for i in idxes])).float().to(device)
        actions = torch.from_numpy(
            np.array([self._storage[i].action for i in idxes])).float().to(device)
        rewards = torch.from_numpy(
            np.array([self._storage[i].reward for i in idxes])).float().to(device)
        next_states = torch.from_numpy(
            np.array([self._storage[i].next_state for i in idxes])).float().to(device)
        dones = torch.from_numpy(
            np.array([self._storage[i].done for i in idxes]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size):
        idxes = self._sample_proportional(batch_size)

        weights = []
        # find smallest sampling prob:
        # p_min = smallest priority^alpha / sum of priorities^alpha
        p_min = self._it_min.min() / self._it_sum.sum()

        beta = self.beta_by_frame(self.frame)
        self.frame += 1

        # max_weight given to smallest prob
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = torch.tensor(weights, device=device, dtype=torch.float)
        encoded_sample = self._encode_sample(idxes)
        return encoded_sample, idxes, weights

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = (priority + 1e-5)**self._alpha
            self._it_min[idx] = (priority + 1e-5)**self._alpha
            self._max_priority = max(self._max_priority, (priority + 1e-5))
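# The beta_by_frame schedule above anneals the IS exponent linearly from
# beta_start toward 1.0 over beta_frames frames. A quick standalone check of
# the schedule's shape, using the same default constants (the sampled frame
# indices are illustrative):
def beta_by_frame(frame_idx, beta_start=0.4, beta_frames=100000):
    return min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)

print([round(beta_by_frame(f), 3) for f in (1, 25000, 50000, 100000, 200000)])
# -> [0.4, 0.55, 0.7, 1.0, 1.0]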
class ReplayMemory:
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        #new_data = []
        #for i in data:
        #    i.wait_to_read()
        #    new_data.append(copyto(i))
        # capture the slot being written before the pointer advances, so the
        # new experience (not the next slot) receives the max priority
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
            #print self._storage
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)
        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        #print self._it_min.min(), weights
        weights = np.array(weights)
        weights /= np.sum(weights)

        ret = []
        for i in range(batch_size):
            ret.append(self._storage[idxes[i]])
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        #print priorities, np.sum(priorities)
        for idx, priority in zip(idxes, priorities):
            #print priority
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, buffer_size, input_dim, batch_size, alpha):
        super(PrioritizedReplayBuffer, self).__init__(buffer_size, input_dim, batch_size)

        # For PER. Parameter settings.
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        tree_capacity = 1
        while tree_capacity < self.buffer_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)

    def store(self, state: np.ndarray, action: int, reward: float,
              next_state: np.ndarray, done: int):
        super().store(state, action, reward, next_state, done)

        self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha
        self.min_tree[self.tree_ptr] = self.max_priority**self.alpha
        self.tree_ptr = (self.tree_ptr + 1) % self.buffer_size

    def batch_load(self, beta):
        # Fetching the indices could be parallelized, and the weights can be
        # computed in the same function.
        indices = self._sample_proportional_indices()
        weights = np.array(
            [self._calculate_weight(idx, beta) for idx in indices])

        return dict(states=self.state_buffer[indices],
                    actions=self.action_buffer[indices],
                    rewards=self.reward_buffer[indices],
                    next_states=self.next_state_buffer[indices],
                    dones=self.done_buffer[indices],
                    weights=weights,
                    indices=indices)

    def update_priorities(self, indices, priorities):
        # This part could also be parallelized.
        for idx, priority in zip(indices, priorities):
            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha
            self.max_priority = max(self.max_priority, priority)

    def _sample_proportional_indices(self):
        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        # could be parallelized, e.g. with multiprocessing
        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            sample = np.random.uniform(a, b)
            idx = self.sum_tree.retrieve(sample)  # returns the tree index for the sampled mass
            indices.append(idx)
        return indices

    def _calculate_weight(self, idx, beta):
        # The max_weight part only needs to be computed once per batch.
        p_min = self.min_tree.min() / self.sum_tree.sum()
        max_weight = (p_min * len(self))**(-beta)

        p_sample = self.sum_tree[idx] / self.sum_tree.sum()
        weight = (p_sample * len(self))**(-beta)
        weight /= max_weight

        return weight
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, alpha):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            alpha (float): alpha PER value
        """
        self.max_priority = 1.0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        self.tree_capacity = 1
        while self.tree_capacity < buffer_size:
            self.tree_capacity *= 2

        self.sum_tree = SumSegmentTree(self.tree_capacity)
        self.min_tree = MinSegmentTree(self.tree_capacity)

        self.action_size = action_size
        self.memory = []
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, t, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        idx = t % self.tree_capacity
        if t >= self.tree_capacity:
            self.memory[idx] = e
        else:
            self.memory.append(e)

        # insert experience index in priority tree
        self.sum_tree[idx] = self.max_priority**self.alpha
        self.min_tree[idx] = self.max_priority**self.alpha

    def sample(self, beta):
        """Sampling a batch of relevant experiences from memory."""
        indices = self.relevant_sample_indx()
        idxs = np.vstack(indices).astype(int)

        states = torch.from_numpy(
            np.vstack([self.memory[i].state for i in indices])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([self.memory[i].action for i in indices])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([self.memory[i].reward for i in indices])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([self.memory[i].next_state for i in indices])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([self.memory[i].done for i in indices]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(
            np.array([self.isw(i, beta) for i in indices])).float().to(device)

        return (idxs, states, actions, rewards, next_states, dones, weights)

    def relevant_sample_indx(self):
        """Selecting most informative sample indices."""
        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            upperbound = random.uniform(a, b)
            idx = self.sum_tree.retrieve(upperbound)
            indices.append(idx)
        return indices

    def update_priorities(self, indices, priorities):
        """Update priorities of sampled transitions."""
        assert indices.shape[0] == priorities.shape[0]

        for idx, priority in zip(indices.flatten(), priorities.flatten()):
            assert priority > 0
            assert 0 <= idx < len(self)
            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha
            self.max_priority = max(self.max_priority, priority)

    def isw(self, idx, beta):
        """Compute Importance Sample Weight."""
        # get max weight
        p_min = self.min_tree.min() / self.sum_tree.sum()
        max_weight = (p_min * len(self))**(-beta)

        # calculate weights
        p_sample = self.sum_tree[idx] / self.sum_tree.sum()
        weight = (p_sample * len(self))**(-beta)
        is_weight = weight / max_weight
        return is_weight

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
class PrioritizedReplayBuffer(object):

    def __init__(self, size, alpha=0.6):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        # super(PrioritizedReplayBuffer, self).__init__(size)
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        # We use double the soft capacity of the PER for the segment trees to
        # allow for any overflow over the soft capacity limit before samples
        # are removed.
        self.it_capacity = 1
        while self.it_capacity < size * 2:
            self.it_capacity *= 2

        self._it_sum = SumSegmentTree(self.it_capacity)
        self._it_min = MinSegmentTree(self.it_capacity)
        self._max_priority = 1.0

    def _add(self, obs_t, action, reward, obs_tp1, done):
        data = (obs_t, action, reward, obs_tp1, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _remove(self, num_samples):
        del self._storage[:num_samples]
        self._next_idx = len(self._storage)

    def _encode_sample(self, idxes):
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for i in idxes:
            data = self._storage[i]
            obs_t, action, reward, obs_tp1, done = data
            obses_t.append(np.array(obs_t, copy=False))
            actions.append(action)
            rewards.append(reward)
            obses_tp1.append(np.array(obs_tp1, copy=False))
            dones.append(done)
        return [np.array(obses_t), actions, np.array(rewards),
                np.array(obses_tp1), np.array(dones)]

    def add(self, state, policy_output, reward, last_state, done):
        idx = self._next_idx
        # assert idx < self.it_capacity, "Number of samples in replay memory exceeds capacity of segment trees. Please increase capacity of segment trees or increase the frequency at which samples are removed from the replay memory"
        self._add(state, policy_output, reward, last_state, done)
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def remove(self, num_samples):
        self._remove(num_samples)
        self._it_sum.remove_items(num_samples)
        self._it_min.remove_items(num_samples)

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def miniBatch(self, batch_size, beta=0.4, epsilon=1e-8):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample it also returns importance weights
        and idxes of the sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)
        epsilon: float
            Small constant added to denominators to avoid division by zero.

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting the
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32 with indexes in
            the buffer of the sampled experiences
        """
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / (self._it_sum.sum() + epsilon)
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / (self._it_sum.sum() + epsilon)
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / (max_weight + epsilon))
        weights = np.array(weights)

        encoded_sample = self._encode_sample(idxes)
        return encoded_sample, idxes, weights

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        Sets the priority of the transition at index idxes[i] in the buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to transitions at the
            sampled idxes denoted by variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha
            self._max_priority = max(self._max_priority, priority)

    def get_size(self):
        return len(self._storage)
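
# A minimal sketch of the sample/update cycle for the buffer above. The calls
# on `replay` are the methods defined in this class; `replay`, `q_network`,
# `target_network`, and `td_errors_from` are hypothetical placeholders for
# whatever learner consumes the batch, so this illustrates the call pattern
# rather than a runnable training loop.
import numpy as np

batch_size = 32
(obs, acts, rews, next_obs, dones), idxes, weights = replay.miniBatch(batch_size, beta=0.4)

# The importance weights scale the per-sample loss, e.g.
# loss = (weights * (q_pred - q_target)**2).mean()
td_errors = td_errors_from(q_network, target_network, obs, acts, rews, next_obs, dones)

# New priorities must stay strictly positive, hence the small constant.
replay.update_priorities(idxes, np.abs(td_errors) + 1e-6)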
class PrioritizedReplayBuffer(ReplayBuffer):

    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample it also returns importance weights
        and idxes of the sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting the
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32 with indexes in
            the buffer of the sampled experiences
        """
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)

        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        Sets the priority of the transition at index idxes[i] in the buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to transitions at the
            sampled idxes denoted by variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)
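
# A flat-array sketch of the stratified proportional sampling performed by
# _sample_proportional above (numpy only; np.searchsorted over a cumulative
# sum stands in for SumSegmentTree.find_prefixsum_idx, trading the tree's
# O(log N) updates for an O(N) rebuild). One uniform draw is taken from each
# of batch_size equal slices of the total priority mass, so every region of
# the distribution is represented in the batch while high-priority items are
# still picked most often.
import numpy as np

def sample_proportional(priorities, batch_size, rng=np.random.default_rng(0)):
    prefix = np.cumsum(priorities)             # prefix[i] = sum of priorities[:i + 1]
    p_total = prefix[-1]
    every_range_len = p_total / batch_size
    masses = rng.random(batch_size) * every_range_len + np.arange(batch_size) * every_range_len
    return np.searchsorted(prefix, masses, side="right")

priorities = np.array([0.1, 0.1, 5.0, 0.1, 0.1])
print(sample_proportional(priorities, batch_size=8))   # index 2 dominates the batch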
class PrioritizedReplayMemory(object):

    def __init__(self, capacity=100000, priority_fraction=0.0,
                 discount_gamma_game_reward=1.0, discount_gamma_graph_reward=1.0,
                 discount_gamma_count_reward=1.0, accumulate_reward_from_final=False):
        # prioritized replay memory
        self._storage = []
        self.capacity = capacity
        self._next_idx = 0

        assert priority_fraction >= 0
        self._alpha = priority_fraction

        it_capacity = 1
        while it_capacity < capacity:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

        self.discount_gamma_game_reward = discount_gamma_game_reward
        self.discount_gamma_graph_reward = discount_gamma_graph_reward
        self.discount_gamma_count_reward = discount_gamma_count_reward
        self.accumulate_reward_from_final = accumulate_reward_from_final

    def __len__(self):
        return len(self._storage)

    @property
    def storage(self):
        """[(np.ndarray, float, float, np.ndarray, bool)]: content of the replay buffer"""
        return self._storage

    @property
    def buffer_size(self):
        """float: Max capacity of the buffer"""
        return self.capacity

    def can_sample(self, n_samples):
        """
        Check if n_samples samples can be sampled from the buffer.

        :param n_samples: (int)
        :return: (bool)
        """
        return len(self) >= n_samples

    def is_full(self):
        """
        Check whether the replay buffer is full or not.

        :return: (bool)
        """
        return len(self) == self.buffer_size

    def add(self, *args):
        """
        Add a new transition to the buffer.
        """
        idx = self._next_idx
        data = Transition(*args)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self.capacity

        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def get_next_final_pos(self, which_memory, head):
        i = head
        while True:
            if i >= len(self._storage):
                return None
            if self._storage[i].is_final:
                return i
            i += 1
        return None

    def _get_single_transition(self, idx, n):
        assert n > 0
        head = idx
        # if n is 1, then head can't be is_final
        if n == 1:
            if self._storage[head].is_final:
                return None
        # if n > 1, then all except the tail can't be is_final
        else:
            if np.any([item.is_final for item in self._storage[head:head + n]]):
                return None

        next_final = self.get_next_final_pos(self._storage, head)
        if next_final is None:
            return None

        # all good
        obs = self._storage[head].observation_list
        prev_action = self._storage[head].prev_action_list
        candidate = self._storage[head].action_candidate_list
        chosen_indices = self._storage[head].chosen_indices
        graph_triplets = self._storage[head].graph_triplets

        next_obs = self._storage[head + n].observation_list
        next_prev_action = self._storage[head + n].prev_action_list
        next_candidate = self._storage[head + n].action_candidate_list
        next_graph_triplets = self._storage[head + n].graph_triplets

        tmp = next_final - head + 1 if self.accumulate_reward_from_final else n + 1

        rewards_up_to_next_final = [
            self.discount_gamma_game_reward**i * self._storage[head + i].reward
            for i in range(tmp)
        ]
        reward = torch.sum(torch.stack(rewards_up_to_next_final))

        graph_rewards_up_to_next_final = [
            self.discount_gamma_graph_reward**i * self._storage[head + i].graph_reward
            for i in range(tmp)
        ]
        graph_reward = torch.sum(torch.stack(graph_rewards_up_to_next_final))

        count_rewards_up_to_next_final = [
            self.discount_gamma_count_reward**i * self._storage[head + i].count_reward
            for i in range(tmp)
        ]
        count_reward = torch.sum(torch.stack(count_rewards_up_to_next_final))

        return (obs, prev_action, candidate, chosen_indices, graph_triplets,
                reward + graph_reward + count_reward,
                next_obs, next_prev_action, next_candidate, next_graph_triplets)

    def _encode_sample(self, idxes, ns):
        actual_indices, actual_ns = [], []
        obs, prev_action, candidate, chosen_indices, graph_triplets, reward, next_obs, next_prev_action, next_candidate, next_graph_triplets = [], [], [], [], [], [], [], [], [], []

        for i, n in zip(idxes, ns):
            t = self._get_single_transition(i, n)
            if t is None:
                continue
            actual_indices.append(i)
            actual_ns.append(n)
            obs.append(t[0])
            prev_action.append(t[1])
            candidate.append(t[2])
            chosen_indices.append(t[3])
            graph_triplets.append(t[4])
            reward.append(t[5])
            next_obs.append(t[6])
            next_prev_action.append(t[7])
            next_candidate.append(t[8])
            next_graph_triplets.append(t[9])

        if len(actual_indices) == 0:
            return None

        chosen_indices = np.array(chosen_indices)  # batch
        reward = torch.stack(reward, 0)  # batch
        actual_ns = np.array(actual_ns)

        return [
            obs, prev_action, candidate, chosen_indices, graph_triplets, reward,
            next_obs, next_prev_action, next_candidate, next_graph_triplets,
            actual_indices, actual_ns
        ]

    def sample(self, batch_size, beta=0, multi_step=1):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        # sample n
        ns = np.random.randint(1, multi_step + 1, size=batch_size)
        encoded_sample = self._encode_sample(idxes, ns)
        if encoded_sample is None:
            return None

        actual_indices = encoded_sample[-2]
        for idx in actual_indices:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)

        return encoded_sample + [weights]

    def _get_single_sequence_transition(self, idx, sample_history_length):
        assert sample_history_length > 0
        head = idx
        # if n is 1, then head can't be is_final
        if sample_history_length == 1:
            if self._storage[head].is_final:
                return None
        # if n > 1, then all except the tail can't be is_final
        else:
            if np.any([
                    item.is_final
                    for item in self._storage[head:head + sample_history_length]
            ]):
                return None

        next_final = self.get_next_final_pos(self._storage, head)
        if next_final is None:
            return None

        # all good
        res = []
        for m in range(sample_history_length):
            obs = self._storage[head + m].observation_list
            candidate = self._storage[head + m].action_candidate_list
            chosen_indices = self._storage[head + m].chosen_indices
            graph_triplets = self._storage[head + m].graph_triplets

            next_obs = self._storage[head + m + 1].observation_list
            next_candidate = self._storage[head + m + 1].action_candidate_list
            next_graph_triplets = self._storage[head + m + 1].graph_triplets

            tmp = next_final - (head + m) + 1 if self.accumulate_reward_from_final else 1

            rewards_up_to_next_final = [
                self.discount_gamma_game_reward**i * self._storage[head + m + i].reward
                for i in range(tmp)
            ]
            reward = torch.sum(torch.stack(rewards_up_to_next_final))

            graph_rewards_up_to_next_final = [
                self.discount_gamma_graph_reward**i * self._storage[head + m + i].graph_reward
                for i in range(tmp)
            ]
            graph_reward = torch.sum(torch.stack(graph_rewards_up_to_next_final))

            count_rewards_up_to_next_final = [
                self.discount_gamma_count_reward**i * self._storage[head + m + i].count_reward
                for i in range(tmp)
            ]
            count_reward = torch.sum(torch.stack(count_rewards_up_to_next_final))

            res.append([
                obs, candidate, chosen_indices, graph_triplets,
                reward + graph_reward + count_reward,
                next_obs, next_candidate, next_graph_triplets
            ])
        return res

    def _encode_sample_sequence(self, idxes, sample_history_length):
        assert sample_history_length > 0
        res = []
        for _ in range(sample_history_length):
            tmp = []
            for i in range(8):
                tmp.append([])
            res.append(tmp)

        actual_indices = []
        # obs, candidate, chosen_indices, graph_triplets, reward,
        # next_obs, next_candidate, next_graph_triplets
        for i in idxes:
            t = self._get_single_sequence_transition(i, sample_history_length)
            if t is None:
                continue
            actual_indices.append(i)
            for step in range(sample_history_length):
                t_s = t[step]
                res[step][0].append(t_s[0])
                res[step][1].append(t_s[1])
                res[step][2].append(t_s[2])
                res[step][3].append(t_s[3])
                res[step][4].append(t_s[4])
                res[step][5].append(t_s[5])
                res[step][6].append(t_s[6])
                res[step][7].append(t_s[7])

        if len(actual_indices) == 0:
            return None

        for i in range(sample_history_length):
            res[i][2] = np.array(res[i][2])  # batch
            res[i][4] = torch.stack(res[i][4], 0)  # batch

        return res + [actual_indices]

    def sample_sequence(self, batch_size, beta=0, sample_history_length=1):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        res_weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        encoded_sample = self._encode_sample_sequence(idxes, sample_history_length)
        if encoded_sample is None:
            return None

        actual_indices = encoded_sample[-1]
        for _h in range(sample_history_length):
            tmp_weights = []
            for idx in actual_indices:
                p_sample = self._it_sum[idx + _h] / self._it_sum.sum()
                weight = (p_sample * len(self._storage))**(-beta)
                tmp_weights.append(weight / max_weight)
            tmp_weights = np.array(tmp_weights)
            res_weights.append(tmp_weights)

        return encoded_sample + [res_weights]

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def update_priorities(self, idxes, priorities):
        """
        Update priorities of sampled transitions.

        Sets the priority of the transition at index idxes[i] in the buffer
        to priorities[i].

        :param idxes: ([int]) List of idxes of sampled transitions
        :param priorities: ([float]) List of updated priorities corresponding
            to transitions at the sampled idxes denoted by variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            if priority > 0:
                assert 0 <= idx < len(self._storage)
                self._it_sum[idx] = priority**self._alpha
                self._it_min[idx] = priority**self._alpha
                self._max_priority = max(self._max_priority, priority)
            else:
                print("something wrong with priority: ", str(priority))
                return False
        return True

    def avg_rewards(self):
        if len(self._storage) == 0:
            return 0.0
        rewards = [self._storage[i].reward for i in range(len(self._storage))]
        return to_np(torch.mean(torch.stack(rewards)))
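
# A plain-float illustration of the discounted reward accumulation used in
# _get_single_transition above: the n-step return sums gamma**i * r_{t+i}
# over the window ending at the n-th next transition (or at the next terminal
# transition when accumulate_reward_from_final is set). The numbers below are
# made up for the example.
gamma = 0.9
rewards = [1.0, 0.0, 2.0, 5.0]           # r_t, r_{t+1}, r_{t+2}, r_{t+3}
n = 3                                    # matches tmp = n + 1 terms, i.e. range(4)

n_step_return = sum(gamma**i * r for i, r in enumerate(rewards[:n + 1]))
print(n_step_return)                     # 1.0 + 0.9*0.0 + 0.81*2.0 + 0.729*5.0 = 6.265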
class PrioritizedReplayBuffer(ReplayBuffer): """Prioritized Replay buffer. Attributes: max_priority (float): max priority tree_ptr (int): next index of tree alpha (float): alpha parameter for prioritized replay buffer sum_tree (SumSegmentTree): sum tree for prior min_tree (MinSegmentTree): min tree for min prior to get max weight """ def __init__( self, obs_dim: int, size: int, batch_size: int = 32, alpha: float = 0.6, n_step: int = 1, gamma: float = 0.99, ): """Initialization.""" assert alpha >= 0 super(PrioritizedReplayBuffer, self).__init__(obs_dim, size, batch_size, n_step, gamma) self.max_priority, self.tree_ptr = 1.0, 0 self.alpha = alpha # capacity must be positive and a power of 2. tree_capacity = 1 while tree_capacity < self.max_size: tree_capacity *= 2 self.sum_tree = SumSegmentTree(tree_capacity) self.min_tree = MinSegmentTree(tree_capacity) def store( self, obs: np.ndarray, act: int, rew: float, next_obs: np.ndarray, done: bool, ) -> Tuple[np.ndarray, np.ndarray, float, np.ndarray, bool]: """Store experience and priority.""" transition = super().store(obs, act, rew, next_obs, done) if transition: self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha self.min_tree[self.tree_ptr] = self.max_priority**self.alpha self.tree_ptr = (self.tree_ptr + 1) % self.max_size return transition def sample_batch(self, beta: float = 0.4) -> Dict[str, np.ndarray]: """Sample a batch of experiences.""" assert len(self) >= self.batch_size assert beta > 0 indices = self._sample_proportional() obs = self.obs_buf[indices] next_obs = self.next_obs_buf[indices] acts = self.acts_buf[indices] rews = self.rews_buf[indices] done = self.done_buf[indices] weights = np.array([self._calculate_weight(i, beta) for i in indices]) return dict( obs=obs, next_obs=next_obs, acts=acts, rews=rews, done=done, weights=weights, indices=indices, ) def update_priorities(self, indices: List[int], priorities: np.ndarray): """Update priorities of sampled transitions.""" assert len(indices) == len(priorities) for idx, priority in zip(indices, priorities): assert priority > 0 assert 0 <= idx < len(self) self.sum_tree[idx] = priority**self.alpha self.min_tree[idx] = priority**self.alpha self.max_priority = max(self.max_priority, priority) def _sample_proportional(self) -> List[int]: """Sample indices based on proportions.""" indices = [] p_total = self.sum_tree.sum(0, len(self) - 1) segment = p_total / self.batch_size for i in range(self.batch_size): a = segment * i b = segment * (i + 1) upperbound = random.uniform(a, b) idx = self.sum_tree.retrieve(upperbound) indices.append(idx) return indices def _calculate_weight(self, idx: int, beta: float): """Calculate the weight of the experience at idx.""" # get max weight p_min = self.min_tree.min() / self.sum_tree.sum() max_weight = (p_min * len(self))**(-beta) # calculate weights p_sample = self.sum_tree[idx] / self.sum_tree.sum() weight = (p_sample * len(self))**(-beta) weight = weight / max_weight return weight
class PrioritizedReplayBuffer(ReplayBuffer): """ Adapt from https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/buffers.py """ def __init__(self, obs_space, action_space, capacity, exponent, device, optimize_memory_usage=False): super().__init__(obs_space, action_space, capacity, device, optimize_memory_usage=optimize_memory_usage) assert exponent >= 0 self.exponent = exponent it_capacity = 1 while it_capacity < self.capacity: it_capacity *= 2 self._it_sum = SumSegmentTree(it_capacity) self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 def _sample_proportional(self, batch_size): total = self._it_sum.sum(0, len(self) - 1) mass = np.random.random(size=batch_size) * total idx = self._it_sum.find_prefixsum_idx(mass) # replace idx == self.idx if self.full and self.optimize_memory_usage: while np.any(idx == self.idx): replace_mass = np.random.random(len(idx == self.idx)) * total replace_idx = self._it_sum.find_prefixsum_idx(replace_mass) idx[idx == self.idx] = replace_idx return idx def add(self, obs, action, reward, next_obs, done): idx = self.idx super().add(obs, action, reward, next_obs, done) self._it_sum[idx] = self._max_priority ** self.exponent self._it_min[idx] = self._max_priority ** self.exponent def sample(self, batch_size, beta=0): assert beta >= 0 idxes = self._sample_proportional(batch_size) p_min = self._it_min.min() / self._it_sum.sum() max_weight = (p_min * len(self)) ** (-beta) p_sample = self._it_sum[idxes] / self._it_sum.sum() weights = (p_sample * len(self)) ** (-beta) / max_weight obses, actions, rewards, next_obses, not_dones = self._sample(idxes) priority_kwargs = { 'weights': weights, 'idxes': idxes } return obses, actions, rewards, next_obses, not_dones, priority_kwargs def update_priorities(self, idxes, priorities): assert len(idxes) == len(priorities) assert np.min(priorities) > 0 assert np.min(idxes) >= 0 assert np.max(idxes) < len(self) self._it_sum[idxes] = priorities ** self.exponent self._it_min[idxes] = priorities ** self.exponent self._max_priority = max(self._max_priority, np.max(priorities))