def __init__(self, capacity=100000, priority_fraction=0.0,
             discount_gamma_game_reward=1.0, discount_gamma_graph_reward=1.0,
             discount_gamma_count_reward=1.0, accumulate_reward_from_final=False):
    # prioritized replay memory
    self._storage = []
    self.capacity = capacity
    self._next_idx = 0

    assert priority_fraction >= 0
    self._alpha = priority_fraction

    it_capacity = 1
    while it_capacity < capacity:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

    self.discount_gamma_game_reward = discount_gamma_game_reward
    self.discount_gamma_graph_reward = discount_gamma_graph_reward
    self.discount_gamma_count_reward = discount_gamma_count_reward
    self.accumulate_reward_from_final = accumulate_reward_from_final

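# Every snippet in this section assumes SumSegmentTree / MinSegmentTree
# helpers in the style of OpenAI baselines' replay_buffer.py; the
# power-of-two rounding loops exist because these trees store aggregates in
# a perfect binary tree with 2 * capacity nodes. A minimal sketch under that
# assumption (not the baselines source; bounds are inclusive to match the
# `sum(0, len(storage) - 1)` calls in the snippets below):
import operator


class SegmentTree:
    def __init__(self, capacity, operation, neutral):
        assert capacity > 0 and capacity & (capacity - 1) == 0, \
            "capacity must be a positive power of 2"
        self._capacity = capacity
        self._op = operation
        # node 1 is the root; leaves live at [capacity, 2 * capacity)
        self._value = [neutral] * (2 * capacity)

    def _reduce(self, start, end, node, node_start, node_end):
        # aggregate over the inclusive range [start, end]
        if start == node_start and end == node_end:
            return self._value[node]
        mid = (node_start + node_end) // 2
        if end <= mid:
            return self._reduce(start, end, 2 * node, node_start, mid)
        if start >= mid + 1:
            return self._reduce(start, end, 2 * node + 1, mid + 1, node_end)
        return self._op(
            self._reduce(start, mid, 2 * node, node_start, mid),
            self._reduce(mid + 1, end, 2 * node + 1, mid + 1, node_end))

    def reduce(self, start=0, end=None):
        if end is None:
            end = self._capacity - 1
        return self._reduce(start, end, 1, 0, self._capacity - 1)

    def __setitem__(self, idx, val):
        # update a leaf, then recompute aggregates on the path to the root
        idx += self._capacity
        self._value[idx] = val
        idx //= 2
        while idx >= 1:
            self._value[idx] = self._op(self._value[2 * idx],
                                        self._value[2 * idx + 1])
            idx //= 2

    def __getitem__(self, idx):
        return self._value[self._capacity + idx]


class SumSegmentTree(SegmentTree):
    def __init__(self, capacity):
        super().__init__(capacity, operator.add, 0.0)

    def sum(self, start=0, end=None):
        return self.reduce(start, end)

    def find_prefixsum_idx(self, prefixsum):
        # walk down from the root: go left while the left subtree's mass
        # covers the prefix sum, otherwise subtract it and go right
        idx = 1
        while idx < self._capacity:
            if self._value[2 * idx] > prefixsum:
                idx = 2 * idx
            else:
                prefixsum -= self._value[2 * idx]
                idx = 2 * idx + 1
        return idx - self._capacity

    # some variants below call this method `retrieve`
    retrieve = find_prefixsum_idx


class MinSegmentTree(SegmentTree):
    def __init__(self, capacity):
        super().__init__(capacity, min, float("inf"))

    def min(self, start=0, end=None):
        # unset leaves hold +inf, so a whole-tree min ignores empty slots
        return self.reduce(start, end)
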
def __init__(self, memory_size=1000000, alpha=0.5, seed=None):
    '''
    Prioritized replay buffer from https://arxiv.org/pdf/1511.05952.pdf
    This implementation is based on the OpenAI sum-tree implementation,
    which can be found here:
    https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py

    memory_size: int
        maximum number of experiences to store
    alpha: float, [0.0, 1.0]
        hyperparameter that controls the amount of prioritization,
        with 0.0 being no prioritization (the uniform case)
    seed: None or int
        random seed for the replay buffer
    '''
    super().__init__(memory_size=memory_size, seed=seed)
    self.alpha = alpha

    it_capacity = 1
    while it_capacity < self._memory_size:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6, beta=0.5, device="cpu"):
    """Initialize a PrioritizedReplayBuffer object.

    Params
    ======
        action_size (int): dimension of each action
        buffer_size (int): maximum size of buffer
        batch_size (int): size of each training batch
        seed (int): random seed
        alpha (float): how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)
        beta (float): to what degree to use importance weights
            (0 - no corrections, 1 - full correction)
    """
    super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size, batch_size, seed, device=device)
    self.alpha = alpha
    self.beta = beta
    self._eps = 1e-8

    it_capacity = 1
    while it_capacity < buffer_size:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

def __init__(
    self,
    obs_dim: list,
    size: int,
    device: str,
    batch_size: int = 32,
    alpha: float = 0.6,
    n_step: int = 1,
    gamma: float = 0.99,
):
    """Initialization."""
    assert alpha >= 0
    super(PrioritizedReplayBuffer, self).__init__(obs_dim, size, device, batch_size, n_step, gamma)
    self.max_priority, self.tree_ptr = 1.0, 0
    self.alpha = alpha

    # capacity must be positive and a power of 2.
    tree_capacity = 1
    while tree_capacity < self.max_size:
        tree_capacity *= 2

    self.sum_tree = SumSegmentTree(tree_capacity)
    self.min_tree = MinSegmentTree(tree_capacity)

def __init__(self, action_size, buffer_size, batch_size, alpha):
    """Initialize a ReplayBuffer object.

    Params
    ======
        action_size (int): dimension of each action
        buffer_size (int): maximum size of buffer
        batch_size (int): size of each training batch
        alpha (float): alpha PER value
    """
    self.max_priority = 1.0
    self.alpha = alpha

    # capacity must be positive and a power of 2.
    self.tree_capacity = 1
    while self.tree_capacity < buffer_size:
        self.tree_capacity *= 2
    self.sum_tree = SumSegmentTree(self.tree_capacity)
    self.min_tree = MinSegmentTree(self.tree_capacity)

    self.action_size = action_size
    self.memory = []
    self.batch_size = batch_size
    self.experience = namedtuple(
        "Experience",
        field_names=["state", "action", "reward", "next_state", "done"])

def __init__(self, size, alpha):
    """Create Prioritized Replay buffer.

    Parameters
    ----------
    size: int
        Max number of transitions to store in the buffer. When the buffer
        overflows the old memories are dropped.
    alpha: float
        how much prioritization is used
        (0 - no prioritization, 1 - full prioritization)

    See Also
    --------
    ReplayBuffer.__init__
    """
    super(PrioritizedReplayBuffer, self).__init__(size)
    assert alpha >= 0
    self._alpha = alpha

    it_capacity = 1
    while it_capacity < size:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

def __init__(self, learner_config, env_config, session_config):
    """Create Prioritized Replay buffer.

    Reads `memory_size` (max number of transitions to store; old memories
    are dropped when the buffer overflows) and `alpha` (how much
    prioritization is used: 0 - no prioritization, 1 - full prioritization)
    from the replay config.
    """
    super(PrioritizedReplayBuffer, self).__init__(learner_config=learner_config,
                                                  env_config=env_config,
                                                  session_config=session_config)
    self._alpha = self.replay_config.alpha
    assert self._alpha > 0
    self._memory = []
    self.memory_size = self.replay_config.memory_size

    it_capacity = 1
    while it_capacity < self.memory_size:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

def __init__(self, size, alpha=0.6):
    """Create Prioritized Replay buffer.

    Parameters
    ----------
    size: int
        Max number of transitions to store in the buffer. When the buffer
        overflows the old memories are dropped.
    alpha: float
        how much prioritization is used
        (0 - no prioritization, 1 - full prioritization)

    See Also
    --------
    ReplayBuffer.__init__
    """
    # super(PrioritizedReplayBuffer, self).__init__(size)
    self._storage = []
    self._maxsize = size
    self._next_idx = 0

    assert alpha >= 0
    self._alpha = alpha

    # We use double the soft capacity of the PER for the segment trees to
    # allow for any overflow over the soft capacity limit before samples
    # are removed.
    self.it_capacity = 1
    while self.it_capacity < size * 2:
        self.it_capacity *= 2

    self._it_sum = SumSegmentTree(self.it_capacity)
    self._it_min = MinSegmentTree(self.it_capacity)
    self._max_priority = 1.0

def __init__(self, capacity, gamma=0.99, n_steps=2, alpha=0.5):
    super(PriorityBuffer, self).__init__(capacity, gamma, n_steps)
    self.buffer = []
    self.position = 0
    self.alpha = alpha

    it_cap = 1
    while it_cap < capacity:
        it_cap *= 2

    self._it_sum = SumSegmentTree(it_cap)
    self._it_min = MinSegmentTree(it_cap)
    self._max_priority = 1.0

def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6):
    super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size, batch_size, seed)

    # capacity must be positive and a power of 2
    tree_capacity = 1
    while tree_capacity < self.buffer_size:
        tree_capacity *= 2

    self.sum_tree = SumSegmentTree(tree_capacity)
    self.min_tree = MinSegmentTree(tree_capacity)
    self.max_priority, self.tree_ptr = 1.0, 0
    self.alpha = alpha

def __init__(self, size, alpha):
    super(PrioritizedReplayBuffer, self).__init__(size)
    assert alpha >= 0
    self._alpha = alpha

    it_capacity = 1
    while it_capacity < size:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

def __init__(self, size, alpha):
    super().__init__(size, N_Step_Transition)
    assert alpha >= 0
    self._alpha = alpha

    it_capacity = 1
    while it_capacity < size:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

def __init__(self, obs_space, action_space, capacity, exponent, device, optimize_memory_usage=False):
    super().__init__(obs_space, action_space, capacity, device,
                     optimize_memory_usage=optimize_memory_usage)
    assert exponent >= 0
    self.exponent = exponent

    it_capacity = 1
    while it_capacity < self.capacity:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

def __init__(self, size, alpha):
    super(ProportionalReplay, self).__init__(size)
    assert alpha >= 0
    self.alpha = alpha

    self.tree_size = 1
    while self.tree_size < self.maxsize:
        self.tree_size *= 2

    self.min_tree = MinSegmentTree(self.tree_size)  # for calculating the maximum IS weight
    self.sum_tree = SumSegmentTree(self.tree_size)  # for proportional sampling
    self.max_priority = 1.0  # maximum priority we've seen so far; will be updated

def __init__(self, replay_size, alpha=0.6):
    self.replay_size = replay_size
    self.cnt = 0
    self._alpha = alpha

    it_capacity = 1
    while it_capacity < replay_size:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

    self._storage = []
    self._maxsize = replay_size
    self._next_idx = 0

def __init__(self, buffer_size, input_dim, batch_size, alpha):
    super(PrioritizedReplayBuffer, self).__init__(buffer_size, input_dim, batch_size)

    # For PER. Parameter settings.
    self.max_priority, self.tree_ptr = 1.0, 0
    self.alpha = alpha

    tree_capacity = 1
    while tree_capacity < self.buffer_size:
        tree_capacity *= 2

    self.sum_tree = SumSegmentTree(tree_capacity)
    self.min_tree = MinSegmentTree(tree_capacity)

def __init__(self, env, name, s_size, a_size, trainer, model_path, global_episodes, lock):
    # Note: this worker relies on module-level globals defined elsewhere in
    # the file: `max_memory`, `sess`, `num_workers`, and `worker_num`.
    self.name = "worker_" + str(name)
    self.number = name
    self.model_path = model_path
    self.trainer = trainer
    self.global_episodes = global_episodes
    self.episode_rewards = []
    self.episode_lengths = []
    self.episode_losses = []
    self.episode_mean_values = []
    self.lock = lock

    it_capacity = 1
    while it_capacity < max_memory:
        it_capacity *= 2
    self._it_sum = [SumSegmentTree(it_capacity)]
    self._it_min = [MinSegmentTree(it_capacity)]

    self.pre_t_m_loss = 1e5
    self.unpermit = True
    # self.replaymemory = ReplayMemory(max_memory)
    global worker_num
    # self.local_AC = AC_Network(sess, s_size, a_size, self.name, None)
    self.local_AC = AC_Network(sess, s_size, a_size, self.name, self.trainer,
                               self._it_sum, self._it_min)
    worker_num += 1

    self.update_local_ops = update_target_graph(self.local_AC.target_scope, self.name)
    self.update_to_global_ops = update_target_graph(self.name, "worker_" + str(num_workers))
    self.update_ops = [[update_target_graph('worker_' + str(i), 'worker_' + str(j))
                        for j in range(num_workers + 1)]
                       for i in range(num_workers + 1)]
    self.update_part_ops = [[update_target_graph_part('worker_' + str(i), 'worker_' + str(j))
                             for j in range(num_workers + 1)]
                            for i in range(num_workers + 1)]
    self.env = env

def __init__(self, size, alpha): """Create Prioritized Replay buffer. Parameters ---------- size: int Max number of transitions to store in the buffer. When the buffer overflows the old memories are dropped. alpha: float how much prioritization is used (0 - no prioritization, 1 - full prioritization) See Also -------- ReplayBuffer.__init__ """ super(PrioritizedReplayBuffer, self).__init__(size) assert alpha > 0 self._alpha = alpha it_capacity = 1 while it_capacity < size: it_capacity *= 2 self._it_sum = SumSegmentTree(it_capacity) self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0
def __init__(self, size, alpha): """ Prioritied Experience Replay """ super(PrioritizedReplayBuffer, self).__init__(size) assert alpha > 0 self._alpha = alpha # I don't understand purpose of this # maybe to create a graph to store ranked truples? it_capacity = 1 while it_capacity < size: it_capacity *= 2 self._it_sum = SumSegmentTree(it_capacity) self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)

        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha
            self._max_priority = max(self._max_priority, priority)

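# A hedged usage sketch for the baselines-style class above, assuming the
# ReplayBuffer base provides add(obs, action, reward, next_obs, done) and
# _encode_sample(idxes); the transition variables and `td_errors` below are
# illustrative placeholders, not part of the original code:
buffer = PrioritizedReplayBuffer(size=2 ** 16, alpha=0.6)

# fill with transitions from an environment loop, e.g.
# buffer.add(obs, action, reward, next_obs, done)

# sample with importance-sampling corrections; new transitions were stored
# with max priority, so each is sampled at least once before being down-weighted
obs_b, act_b, rew_b, next_obs_b, done_b, weights, idxes = buffer.sample(
    batch_size=32, beta=0.4)

# after computing per-sample TD errors, feed |TD error| + eps back as the new
# priorities; the small eps keeps zero-error transitions sampleable
new_priorities = np.abs(td_errors) + 1e-6
buffer.update_priorities(idxes, new_priorities)
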
def __init__(self, state_shape, action_shape, size, alpha=0.6, beta=0.4, beta_delta=0.001, epsilon=0.01):
    self.memory = self.Memory(state_shape, action_shape, size)
    self.counter = 0
    self.size = self.memory.size

    # Segment trees. Note: SumSegmentTree / MinSegmentTree require a
    # power-of-two capacity, so self.memory.size is assumed to be one here.
    self.sum_tree = SumSegmentTree(self.size)
    self.min_tree = MinSegmentTree(self.size)

    # P.E.R. hyperparameters
    self.alpha = alpha
    self.beta = beta
    self.beta_delta = beta_delta
    self.epsilon = epsilon
    self.max_priority = 1.0

def __init__(self, size, state_shape, alpha, n_batch_trajectories, n_trajectory_steps, n_emus=1):
    """Create Prioritized Replay buffer.

    Parameters
    ----------
    size: int
        Max number of transitions to store in the buffer. When the buffer
        overflows the old memories are dropped.
    alpha: float
        how much prioritization is used
        (0 - no prioritization, 1 - full prioritization)

    See Also
    --------
    ReplayBuffer.__init__
    """
    super(PrioritizedReplayBuffer, self).__init__(size, state_shape,
                                                  n_batch_trajectories,
                                                  n_trajectory_steps,
                                                  n_emus=n_emus)
    assert alpha > 0
    self._alpha = alpha

    it_capacity = 1
    while it_capacity < self._size:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

def __init__(self, obs_dim=obs_dim, size=size, batch_size=batch_size,
             alpha=alpha, n_step=n_step, gamma=gamma):
    """Initialization."""
    # Note: the defaults above are captured from enclosing-scope names at
    # definition time; those names must exist when this method is defined.
    assert alpha >= 0
    super(PrioritizedReplayBuffer, self).__init__(obs_dim, size, batch_size, n_step, gamma)
    self.max_priority, self.tree_ptr = 1.0, 0
    self.alpha = alpha

    # capacity must be positive and a power of 2.
    tree_capacity = 1
    while tree_capacity < self.max_size:
        tree_capacity *= 2

    self.sum_tree = SumSegmentTree(tree_capacity)
    self.min_tree = MinSegmentTree(tree_capacity)

def __init__(self, size, alpha=0.6, beta_start=0.4, beta_frames=100000):
    super(PrioritizedReplayMemory, self).__init__()
    self._storage = []
    self._maxsize = size
    self._next_idx = 0

    assert alpha >= 0
    self._alpha = alpha
    self.beta_start = beta_start
    self.beta_frames = beta_frames
    self.frame = 1

    it_capacity = 1
    while it_capacity < size:
        it_capacity *= 2

    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0

    self.experience = namedtuple(
        "Experience",
        field_names=["state", "action", "reward", "next_state", "done"])

class PrioritizedReplayBuffer(ReplayBuffer): """Prioritized Replay buffer. Attributes: max_priority (float): max priority tree_ptr (int): next index of tree alpha (float): alpha parameter for prioritized replay buffer sum_tree (SumSegmentTree): sum tree for prior min_tree (MinSegmentTree): min tree for min prior to get max weight """ def __init__( self, obs_dim: int, size: int, batch_size: int = 32, alpha: float = 0.6, n_step: int = 1, gamma: float = 0.99, ): """Initialization.""" assert alpha >= 0 super(PrioritizedReplayBuffer, self).__init__(obs_dim, size, batch_size, n_step, gamma) self.max_priority, self.tree_ptr = 1.0, 0 self.alpha = alpha # capacity must be positive and a power of 2. tree_capacity = 1 while tree_capacity < self.max_size: tree_capacity *= 2 self.sum_tree = SumSegmentTree(tree_capacity) self.min_tree = MinSegmentTree(tree_capacity) def store( self, obs: np.ndarray, act: int, rew: float, next_obs: np.ndarray, done: bool, ) -> Tuple[np.ndarray, np.ndarray, float, np.ndarray, bool]: """Store experience and priority.""" transition = super().store(obs, act, rew, next_obs, done) if transition: self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha self.min_tree[self.tree_ptr] = self.max_priority**self.alpha self.tree_ptr = (self.tree_ptr + 1) % self.max_size return transition def sample_batch(self, beta: float = 0.4) -> Dict[str, np.ndarray]: """Sample a batch of experiences.""" assert len(self) >= self.batch_size assert beta > 0 indices = self._sample_proportional() obs = self.obs_buf[indices] next_obs = self.next_obs_buf[indices] acts = self.acts_buf[indices] rews = self.rews_buf[indices] done = self.done_buf[indices] weights = np.array([self._calculate_weight(i, beta) for i in indices]) return dict( obs=obs, next_obs=next_obs, acts=acts, rews=rews, done=done, weights=weights, indices=indices, ) def update_priorities(self, indices: List[int], priorities: np.ndarray): """Update priorities of sampled transitions.""" assert len(indices) == len(priorities) for idx, priority in zip(indices, priorities): assert priority > 0 assert 0 <= idx < len(self) self.sum_tree[idx] = priority**self.alpha self.min_tree[idx] = priority**self.alpha self.max_priority = max(self.max_priority, priority) def _sample_proportional(self) -> List[int]: """Sample indices based on proportions.""" indices = [] p_total = self.sum_tree.sum(0, len(self) - 1) segment = p_total / self.batch_size for i in range(self.batch_size): a = segment * i b = segment * (i + 1) upperbound = random.uniform(a, b) idx = self.sum_tree.retrieve(upperbound) indices.append(idx) return indices def _calculate_weight(self, idx: int, beta: float): """Calculate the weight of the experience at idx.""" # get max weight p_min = self.min_tree.min() / self.sum_tree.sum() max_weight = (p_min * len(self))**(-beta) # calculate weights p_sample = self.sum_tree[idx] / self.sum_tree.sum() weight = (p_sample * len(self))**(-beta) weight = weight / max_weight return weight
class PrioritizedReplayBuffer(ReplayBuffer): def __init__(self, size, alpha): """ Prioritied Experience Replay """ super(PrioritizedReplayBuffer, self).__init__(size) assert alpha > 0 self._alpha = alpha # I don't understand purpose of this # maybe to create a graph to store ranked truples? it_capacity = 1 while it_capacity < size: it_capacity *= 2 self._it_sum = SumSegmentTree(it_capacity) self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 def add(self, *args, **kwargs): idx = self._idx super().add(*args, **kwargs) self._it_sum[idx] = self._max_priority**self._alpha self._it_min[idx] = self._max_priority**self._alpha def _sample_proportional(self, batch_size): res = [] for _ in range(batch_size): mass = random.random() * self._it_sum.sum(0, len(self._buffer) - 1) idx = self._it_sum.find_prefixsum_idx(mass) res.append(idx) return res def sample(self, batch_size, beta): assert beta > 0 idxes = self._sample_proportional(batch_size) weights = [] p_min = self._it_min.min() / self._it_sum.sum() max_weight = (p_min * len(self._buffer))**(-beta) for idx in idxes: p_sample = self._it_sum[idx] / self._it_sum.sum() weight = (p_sample * len(self._buffer))**(-beta) weights.append(weight / max_weight) weights = np.array(weights) encoded_sample = self._encode_sample(idxes) return tuple(list(encoded_sample) + [weights, idxes]) def update_priorities(self, idxes, priorities): """ set priority of transition at index idxes[i] in buffer to priorities[i] """ assert len(idxes) == len(priorities) for idx, priority in zip(idxes, priorities): assert priority > 0 assert 0 <= idx < len(self._buffer) self._it_sum[idx] = priority**self._alpha self._it_min[idx] = priority**self._alpha self._max_priority = max(self._max_priority, priority)
class ReplayMemory:
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        # record the slot being written before advancing the cursor, so the
        # max-priority entries land on the new transition
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)
        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        weights /= np.sum(weights)

        ret = [self._storage[idxes[i]] for i in range(batch_size)]
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha
            self._max_priority = max(self._max_priority, priority)

class PrioritizedReplayBuffer(ReplayBuffer): def __init__(self, size, alpha): """Create Prioritized Replay buffer. Parameters ---------- size: int Max number of transitions to store in the buffer. When the buffer overflows the old memories are dropped. alpha: float how much prioritization is used (0 - no prioritization, 1 - full prioritization) See Also -------- ReplayBuffer.__init__ """ super(PrioritizedReplayBuffer, self).__init__(size) assert alpha >= 0 self._alpha = alpha it_capacity = 1 while it_capacity < size: it_capacity *= 2 self._it_sum = SumSegmentTree(it_capacity) self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 def add(self, *args, **kwargs): """See ReplayBuffer.store_effect""" idx = self._next_idx super().add(*args, **kwargs) self._it_sum[idx] = self._max_priority**self._alpha self._it_min[idx] = self._max_priority**self._alpha def _sample_proportional(self, batch_size): res = [] p_total = self._it_sum.sum(0, len(self._storage) - 1) every_range_len = p_total / batch_size for i in range(batch_size): mass = random.random() * every_range_len + i * every_range_len idx = self._it_sum.find_prefixsum_idx(mass) res.append(idx) return res def sample(self, batch_size, beta): """Sample a batch of experiences. compared to ReplayBuffer.sample it also returns importance weights and idxes of sampled experiences. Parameters ---------- batch_size: int How many transitions to sample. beta: float To what degree to use importance weights (0 - no corrections, 1 - full correction) Returns ------- obs_batch: np.array batch of observations act_batch: np.array batch of actions executed given obs_batch rew_batch: np.array rewards received as results of executing act_batch next_obs_batch: np.array next set of observations seen after executing act_batch done_mask: np.array done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode and 0 otherwise. weights: np.array Array of shape (batch_size,) and dtype np.float32 denoting importance weight of each sampled transition idxes: np.array Array of shape (batch_size,) and dtype np.int32 idexes in buffer of sampled experiences """ assert beta > 0 idxes = self._sample_proportional(batch_size) weights = [] p_min = self._it_min.min() / self._it_sum.sum() max_weight = (p_min * len(self._storage))**(-beta) for idx in idxes: p_sample = self._it_sum[idx] / self._it_sum.sum() weight = (p_sample * len(self._storage))**(-beta) weights.append(weight / max_weight) weights = np.array(weights) encoded_sample = self._encode_sample(idxes) return tuple(list(encoded_sample) + [weights, idxes]) def update_priorities(self, idxes, priorities): """Update priorities of sampled transitions. sets priority of transition at index idxes[i] in buffer to priorities[i]. Parameters ---------- idxes: [int] List of idxes of sampled transitions priorities: [float] List of updated priorities corresponding to transitions at the sampled idxes denoted by variable `idxes`. """ assert len(idxes) == len(priorities) for idx, priority in zip(idxes, priorities): assert priority > 0 assert 0 <= idx < len(self._storage) self._it_sum[idx] = priority**self._alpha self._it_min[idx] = priority**self._alpha self._max_priority = max(self._max_priority, priority)
class ProportionalReplay(ExperienceReplay):
    def __init__(self, size, alpha):
        super(ProportionalReplay, self).__init__(size)
        assert alpha >= 0
        self.alpha = alpha

        self.tree_size = 1
        while self.tree_size < self.maxsize:
            self.tree_size *= 2

        self.min_tree = MinSegmentTree(self.tree_size)  # for calculating the maximum IS weight
        self.sum_tree = SumSegmentTree(self.tree_size)  # for proportional sampling
        self.max_priority = 1.0  # maximum priority we've seen so far; will be updated

    def add(self, experience):
        idx = self.next_idx  # save idx before it's changed in the super call
        super().add(experience)  # put experience data (s, a, r, s', done) in the buffer

        # give new experience max priority to ensure it's replayed at least once
        self.min_tree[idx] = self.max_priority ** self.alpha
        self.sum_tree[idx] = self.max_priority ** self.alpha

    # To sample a minibatch of size k, the range [0, p_total] is divided
    # equally into k ranges. Next, a value is uniformly sampled from each range.
    def sample_proportional(self, batch_size):
        idxs = []
        # sum of the priorities of all experience in the buffer
        p_total = self.sum_tree.sum(0, len(self.buffer) - 1)
        # length of each range over [0, p_total] (batch_size = k)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            # uniformly sample a probability mass from this range
            mass = self.np_random.uniform() * every_range_len + i * every_range_len
            # get the smallest experience index s.t. cumulative dist F(idx) >= mass
            idx = self.sum_tree.find_prefixsum_idx(mass)
            idxs.append(idx)
        return idxs

    def sample(self, batch_size, beta):
        """Sample a batch of experiences along with their IS weights and indices."""
        assert beta > 0
        idxs = self.sample_proportional(batch_size)  # sampled experience indices

        weights = []
        # minimum possible priority for a transition
        p_min = self.min_tree.min() / self.sum_tree.sum()
        # (p_uniform / p_min)^beta is the maximum possible IS weight
        max_weight = (p_min * len(self.buffer)) ** (-beta)

        # get IS weights for the sampled experience
        for idx in idxs:
            p_sample = self.sum_tree[idx] / self.sum_tree.sum()  # normalize sampled priority
            weight = (p_sample * len(self.buffer)) ** (-beta)  # (p_uniform / p_sample)^beta, the IS weight
            # weights are normalized by the max so that they only scale the update downwards
            weights.append(weight / max_weight)
        weights = np.array(weights)

        encoded_sample = self.encode_samples(idxs)  # collect experience at the given indices
        return tuple(list(encoded_sample) + [weights, idxs])

    def update_priorities(self, idxs, priorities):
        """Set the priorities of experiences at the given indices."""
        assert len(idxs) == len(priorities)
        for idx, priority in zip(idxs, priorities):
            assert priority > 0
            assert 0 <= idx < len(self.buffer)
            self.sum_tree[idx] = priority ** self.alpha
            self.min_tree[idx] = priority ** self.alpha
            self.max_priority = max(self.max_priority, priority)

class ReplayBuffer: """Fixed-size buffer to store experience tuples.""" def __init__(self, action_size, buffer_size, batch_size, alpha): """Initialize a ReplayBuffer object. Params ====== action_size (int): dimension of each action buffer_size (int): maximum size of buffer batch_size (int): size of each training batch alpha (float): alpha PER value """ self.max_priority = 1.0 self.alpha = alpha # capacity must be positive and a power of 2. self.tree_capacity = 1 while self.tree_capacity < buffer_size: self.tree_capacity *= 2 self.sum_tree = SumSegmentTree(self.tree_capacity) self.min_tree = MinSegmentTree(self.tree_capacity) self.action_size = action_size self.memory = [] self.batch_size = batch_size self.experience = namedtuple( "Experience", field_names=["state", "action", "reward", "next_state", "done"]) def add(self, t, state, action, reward, next_state, done): """Add a new experience to memory.""" e = self.experience(state, action, reward, next_state, done) idx = t % self.tree_capacity if t >= self.tree_capacity: self.memory[idx] = e else: self.memory.append(e) # insert experience index in priority tree self.sum_tree[idx] = self.max_priority**self.alpha self.min_tree[idx] = self.max_priority**self.alpha def sample(self, beta): """Sampling a batch of relevant experiences from memory.""" indices = self.relevant_sample_indx() idxs = np.vstack(indices).astype(np.int) states = torch.from_numpy( np.vstack([self.memory[i].state for i in indices])).float().to(device) actions = torch.from_numpy( np.vstack([self.memory[i].action for i in indices])).long().to(device) rewards = torch.from_numpy( np.vstack([self.memory[i].reward for i in indices])).float().to(device) next_states = torch.from_numpy( np.vstack([self.memory[i].next_state for i in indices])).float().to(device) dones = torch.from_numpy( np.vstack([self.memory[i].done for i in indices]).astype(np.uint8)).float().to(device) weights = torch.from_numpy( np.array([self.isw(i, beta) for i in indices])).float().to(device) return (idxs, states, actions, rewards, next_states, dones, weights) def relevant_sample_indx(self): """Selecting most informative sample indices.""" indices = [] p_total = self.sum_tree.sum(0, len(self) - 1) segment = p_total / self.batch_size for i in range(self.batch_size): a = segment * i b = segment * (i + 1) upperbound = random.uniform(a, b) idx = self.sum_tree.retrieve(upperbound) indices.append(idx) return indices def update_priorities(self, indices, priorities): """Update priorities of sampled transitions.""" assert indices.shape[0] == priorities.shape[0] for idx, priority in zip(indices.flatten(), priorities.flatten()): assert priority > 0 assert 0 <= idx < len(self) self.sum_tree[idx] = priority**self.alpha self.min_tree[idx] = priority**self.alpha self.max_priority = max(self.max_priority, priority) def isw(self, idx, beta): """Compute Importance Sample Weight.""" # get max weight p_min = self.min_tree.min() / self.sum_tree.sum() max_weight = (p_min * len(self))**(-beta) # calculate weights p_sample = self.sum_tree[idx] / self.sum_tree.sum() weight = (p_sample * len(self))**(-beta) is_weight = weight / max_weight return is_weight def __len__(self): """Return the current size of internal memory.""" return len(self.memory)
class PrioritizedReplayBuffer(ReplayBuffer): """Fixed-size prioritized buffer to store experience tuples.""" def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6, beta=0.5, device="cpu"): """Initialize a PrioritizedReplayBuffer object. Params ====== action_size (int): dimension of each action buffer_size (int): maximum size of buffer batch_size (int): size of each training batch seed (int): random seed alpha (float): how much prioritization is used (0 - no prioritization, 1 - full prioritization) beta (float): To what degree to use importance weights (0 - no corrections, 1 - full correction) """ super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size, batch_size, seed, device=device) self.alpha = alpha self.beta = beta self._eps = 0.00000001 it_capacity = 1 while it_capacity < buffer_size: it_capacity *= 2 self._it_sum = SumSegmentTree(it_capacity) self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 def add(self, state, action, reward, next_state, done): """Add a new experience to memory.""" idx = self._next_idx super().add(state, action, reward, next_state, done) self._it_sum[idx] = self._max_priority ** self.alpha self._it_min[idx] = self._max_priority ** self.alpha def _sample_proportional(self): res = [] p_total = self._it_sum.sum(0, len(self.memory) - 1) every_range_len = p_total / self.batch_size for i in range(self.batch_size): mass = random.random() * every_range_len + i * every_range_len idx = self._it_sum.find_prefixsum_idx(mass) res.append(idx) return res def sample(self): idxes = self._sample_proportional() weights = [] p_min = self._it_min.min() / self._it_sum.sum() max_weight = (p_min * len(self.memory) + self._eps) ** (-self.beta) for idx in idxes: p_sample = self._it_sum[idx] / self._it_sum.sum() weight = (p_sample * len(self.memory) + self._eps) ** (-self.beta) weights.append(weight / max_weight) weights = torch.tensor(weights, device=self.device, dtype=torch.float) states = torch.from_numpy(np.vstack([self.memory[i].state for i in idxes])).float().to(self.device) actions = torch.from_numpy(np.vstack([self.memory[i].action for i in idxes])).long().to(self.device) rewards = torch.from_numpy(np.vstack([self.memory[i].reward for i in idxes])).float().to(self.device) next_states = torch.from_numpy(np.vstack([self.memory[i].next_state for i in idxes])).float().to(self.device) dones = torch.from_numpy(np.vstack([self.memory[i].done for i in idxes]).astype(np.uint8)).float().to(self.device) return (states, actions, rewards, next_states, dones, idxes, weights) def update_priorities(self, indexes, priorities): """Update priorities of sampled transitions. sets priority of transition at index indexes[i] in buffer to priorities[i]. Parameters ---------- indexes: [int] List of idxes of sampled transitions priorities: [float] List of updated priorities corresponding to transitions at the sampled idxes denoted by variable `idxes`. """ for idx, priority in zip(indexes, priorities): self._it_sum[idx] = priority ** self.alpha self._it_min[idx] = priority ** self.alpha self._max_priority = max(self._max_priority, priority)
class ReplayMemory:
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        # record the slot being written before advancing the cursor, so the
        # max-priority entries land on the new transition
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        assert beta > 0
        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)
        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        weights /= np.sum(weights)

        ret = [self._storage[idxes[i]] for i in range(batch_size)]
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha
            self._max_priority = max(self._max_priority, priority)

class PrioritizedReplayBuffer(ReplayBuffer): def __init__(self, size, alpha): """Create Prioritized Replay buffer. Parameters ---------- size: int Max number of transitions to store in the buffer. When the buffer overflows the old memories are dropped. alpha: float how much prioritization is used (0 - no prioritization, 1 - full prioritization) See Also -------- ReplayBuffer.__init__ """ super(PrioritizedReplayBuffer, self).__init__(size) assert alpha > 0 self._alpha = alpha it_capacity = 1 while it_capacity < size: it_capacity *= 2 self._it_sum = SumSegmentTree(it_capacity) self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 def add(self, *args, **kwargs): """See ReplayBuffer.store_effect""" idx = self._next_idx super().add(*args, **kwargs) self._it_sum[idx] = self._max_priority ** self._alpha self._it_min[idx] = self._max_priority ** self._alpha def _sample_proportional(self, batch_size): res = [] for _ in range(batch_size): # TODO(szymon): should we ensure no repeats? mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1) idx = self._it_sum.find_prefixsum_idx(mass) res.append(idx) return res def sample(self, batch_size, beta): """Sample a batch of experiences. compared to ReplayBuffer.sample it also returns importance weights and idxes of sampled experiences. Parameters ---------- batch_size: int How many transitions to sample. beta: float To what degree to use importance weights (0 - no corrections, 1 - full correction) Returns ------- obs_batch: np.array batch of observations act_batch: np.array batch of actions executed given obs_batch rew_batch: np.array rewards received as results of executing act_batch next_obs_batch: np.array next set of observations seen after executing act_batch done_mask: np.array done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode and 0 otherwise. weights: np.array Array of shape (batch_size,) and dtype np.float32 denoting importance weight of each sampled transition idxes: np.array Array of shape (batch_size,) and dtype np.int32 idexes in buffer of sampled experiences """ assert beta > 0 idxes = self._sample_proportional(batch_size) weights = [] p_min = self._it_min.min() / self._it_sum.sum() max_weight = (p_min * len(self._storage)) ** (-beta) for idx in idxes: p_sample = self._it_sum[idx] / self._it_sum.sum() weight = (p_sample * len(self._storage)) ** (-beta) weights.append(weight / max_weight) weights = np.array(weights) encoded_sample = self._encode_sample(idxes) return tuple(list(encoded_sample) + [weights, idxes]) def update_priorities(self, idxes, priorities): """Update priorities of sampled transitions. sets priority of transition at index idxes[i] in buffer to priorities[i]. Parameters ---------- idxes: [int] List of idxes of sampled transitions priorities: [float] List of updated priorities corresponding to transitions at the sampled idxes denoted by variable `idxes`. """ assert len(idxes) == len(priorities) for idx, priority in zip(idxes, priorities): assert priority > 0 assert 0 <= idx < len(self._storage) self._it_sum[idx] = priority ** self._alpha self._it_min[idx] = priority ** self._alpha self._max_priority = max(self._max_priority, priority)