class PERMemory:
    EPSILON = 0.0001
    ALPHA = 0.5
    BETA = 0.4

    def __init__(self, config, capacity):
        self.config = config
        self.capacity = capacity
        self.tree = SumTree(capacity)
        self.size = 0

    def _getPriority(self, td_error):
        return (td_error + self.EPSILON) ** self.ALPHA

    def push(self, transition):
        # Cap the reported size at capacity; the tree overwrites old entries once full.
        self.size = min(self.size + 1, self.capacity)
        # New transitions get the current max priority so they are sampled at least once.
        priority = self.tree.max()
        if priority <= 0:
            priority = 1
        self.tree.add(priority, transition)

    def sample(self, size, episode):
        transitions = []
        indexes = []
        weights = np.empty(size, dtype='float32')
        total = self.tree.total()
        # Anneal beta from BETA towards 1 over the course of training.
        beta = self.BETA + (1 - self.BETA) * episode / self.config.num_episodes
        beta = min(1.0, beta)
        for i, rand in enumerate(np.random.uniform(0, total, size)):
            (idx, priority, data) = self.tree.get(rand)
            transitions.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total) ** (-beta)
        return (indexes, transitions, weights / weights.max())

    def update(self, idx, td_error):
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)

    def __len__(self):
        return self.size
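# Hedged usage sketch (not taken from any snippet in this file): one way the
# push / sample / update cycle of a PER buffer such as PERMemory above is
# typically driven from a training loop. `config`, `collect_transition`,
# `compute_td_errors` and `batch_size` are hypothetical placeholders, assuming
# `config.num_episodes` exists as in the class above.
#
#   memory = PERMemory(config, capacity=2 ** 16)
#   for episode in range(config.num_episodes):
#       transition = collect_transition()            # (s, a, r, s', done) from the environment
#       memory.push(transition)                      # stored with the current max priority
#       if len(memory) >= batch_size:
#           idxs, batch, weights = memory.sample(batch_size, episode)
#           td_errors = compute_td_errors(batch)     # |target - estimate| per transition
#           loss = (weights * td_errors ** 2).mean() # IS weights rescale the loss
#           for idx, err in zip(idxs, td_errors):
#               memory.update(idx, abs(err))         # re-prioritise what was just sampled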
class Memory(object):
    def __init__(self, batch_size, max_size, beta):
        self.batch_size = batch_size  # mini-batch size
        # Round the capacity down to a power of two so the sum tree is a complete binary tree.
        self.max_size = 2 ** math.floor(math.log2(max_size))
        self.beta = beta
        self._sum_tree = SumTree(self.max_size)

    def store_transition(self, s, a, r, s_, done):
        self._sum_tree.add((s, a, r, s_, done))

    def get_mini_batches(self):
        n_sample = self.batch_size if self._sum_tree.size >= self.batch_size else self._sum_tree.size
        total = self._sum_tree.get_total()
        step = total // n_sample
        points_transitions_probs = []
        for i in range(n_sample):
            v = np.random.uniform(i * step, (i + 1) * step - 1)
            t = self._sum_tree.sample(v)
            points_transitions_probs.append(t)

        points, transitions, probs = zip(*points_transitions_probs)

        # Importance-sampling weights, normalised by the maximum weight.
        # max_importance_ratio = (n_sample * self._sum_tree.get_min())**-self.beta
        max_importance_ratio = (n_sample * (self._sum_tree.get_min() + 0.0001)) ** -self.beta
        importance_ratio = [(n_sample * probs[i]) ** -self.beta / max_importance_ratio
                            for i in range(len(probs))]

        return points, tuple(np.array(e) for e in zip(*transitions)), importance_ratio

    def update(self, points, td_error):
        for i in range(len(points)):
            self._sum_tree.update(points[i], td_error[i])
def test_len(self):
    instance = SumTree(4)
    instance.add(p=1, data=1)
    self.assertEqual(len(instance), 1)
    instance.add(p=2, data=2)
    self.assertEqual(len(instance), 2)
    instance.add(p=3, data=3)
    instance.add(p=4, data=4)
    instance.add(p=5, data=5)
    self.assertEqual(len(instance), 4)
class Memory(object):
    e = 0.05

    def __init__(self, capacity, pr_scale):
        self.capacity = capacity
        self.memory = ST(self.capacity)
        self.pr_scale = pr_scale
        self.max_pr = 0

    def get_priority(self, error):
        return (error + self.e) ** self.pr_scale

    def remember(self, sample, error):
        p = self.get_priority(error)
        # Track the running maximum priority and store new samples with it.
        self.max_pr = max(self.max_pr, p)
        self.memory.add(self.max_pr, sample)

    def sample(self, n):
        sample_batch = []
        sample_batch_indices = []
        sample_batch_priorities = []
        segment = self.memory.total() / n  # length of each priority segment
        for i in range(n):
            left = segment * i
            right = segment * (i + 1)
            s = random.uniform(left, right)
            idx, pr, data = self.memory.get(s)
            sample_batch.append((idx, data))
            sample_batch_indices.append(idx)
            sample_batch_priorities.append(pr)
        return [sample_batch, sample_batch_indices, sample_batch_priorities]

    def update(self, batch_indices, errors):
        for i in range(len(batch_indices)):
            p = self.get_priority(errors[i])
            self.memory.update(batch_indices[i], p)
class Replay_Memory:
    def __init__(self):
        global MEMORY_LEN
        self.tree = SumTree(MEMORY_LEN)

    def add(self, error, sample):
        global MEMORY_BIAS, MEMORY_POW
        priority = (error + MEMORY_BIAS) ** MEMORY_POW
        self.tree.add(priority, sample)

    def sample(self):
        """
        Get a sample batch of the replay memory

        Returns:
            batch: a batch with one sample from each segment of the memory
        """
        global BATCH_SIZE
        batch = []
        # We want one representative from each distribution segment in the batch,
        # e.g. with BATCH_SIZE=2 the batch contains one sample from [min, median]
        # and one from [median, max].
        segment = self.tree.total() / BATCH_SIZE
        for i in range(BATCH_SIZE):
            minimum = segment * i
            maximum = segment * (i + 1)
            s = random.uniform(minimum, maximum)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        """
        Updates one entry in the replay memory

        Args:
            idx: the position of the outdated transition in the memory
            error: the newly calculated error
        """
        priority = (error + MEMORY_BIAS) ** MEMORY_POW
        self.tree.update(idx, priority)
class ReplayMemory(object):
    def __init__(self, max_size, alpha, eps):
        self.max_size = max_size
        self.alpha = alpha
        self.eps = eps
        self.tree = SumTree(max_size)
        self.last_idxs = None
        self.size = 0

    def get_batch(self, batch_size):
        self.last_idxs = []
        ret = []
        for i in range(min(batch_size, self.size)):
            s = random.random() * self.tree.total()
            idx, _, data = self.tree.get(s)
            ret.append(pickle.loads(zlib.decompress(data)))
            self.last_idxs.append(idx)
        return ret

    def update(self, losses):
        for i in range(len(self.last_idxs)):
            self.tree.update(self.last_idxs[i],
                             math.pow(losses[i] + self.eps, self.alpha))

    def add_element(self, new_el, loss):
        self.size = min(self.max_size, self.size + 1)
        p = math.pow(loss + self.eps, self.alpha)
        self.tree.add(p, zlib.compress(pickle.dumps(new_el)))

    def __len__(self):
        return self.size
def create_tree(sample):
    tree = SumTree(len(sample))
    for e in sample:
        tree.add(p=e, data=e)
    return tree
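# A minimal SumTree sketch, included for reference only. The snippets in this
# file assume slightly different SumTree APIs (get vs. get_leaf, total vs.
# total_p vs. total_priority, n_entries vs. counter, and so on); this sketch
# matches only the add(p, data) / get(s) -> (idx, p, data) / update(idx, p) /
# total() / max() / len() interface used by the first few snippets, and is an
# illustrative assumption rather than the implementation any of them were
# written against.
import numpy as np


class SumTree:
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)        # internal nodes followed by leaves
        self.data = np.zeros(capacity, dtype=object)  # transitions stored alongside leaf priorities
        self.write = 0                                # next leaf slot to overwrite
        self.n_entries = 0

    def _propagate(self, idx, change):
        # Push a priority change up to the root so parent sums stay correct.
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        # Walk down the tree, choosing the child whose cumulative sum covers s.
        left, right = 2 * idx + 1, 2 * idx + 2
        if left >= len(self.tree):  # reached a leaf
            return idx
        if s <= self.tree[left]:
            return self._retrieve(left, s)
        return self._retrieve(right, s - self.tree[left])

    def total(self):
        return self.tree[0]

    def max(self):
        return self.tree[self.capacity - 1:].max()

    def add(self, p, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, p)
        self.write = (self.write + 1) % self.capacity
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, idx, p):
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]

    def __len__(self):
        return self.n_entries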
class PriorityMemory(SimpleMemory):
    PER_e = 0.01  # small constant so no experience ever has zero sampling probability
    PER_a = 0.6   # trade-off between pure priority sampling (1) and uniform sampling (0)
    PER_b = 0.4   # importance-sampling exponent, annealed from its initial value towards 1
    PER_b_increment_per_sampling = 0.001
    absolute_error_upper = 1.  # clipped abs error

    def __init__(self, obs_dim, act_dim, size, act_dtype):
        SimpleMemory.__init__(self, obs_dim, act_dim, size, act_dtype)
        self.tree = SumTree(size)
        self.tree_lock = Lock()

    def store(self, obs, act, rew, next_obs, done):
        # Find the max priority among the current leaves.
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])
        # If the max priority is 0, this experience would never be selected,
        # so fall back to a minimum priority.
        if max_priority == 0:
            max_priority = self.absolute_error_upper
        insertion_pos = super().store(obs, act, rew, next_obs, done)
        self.tree_lock.acquire()
        insertion_pos_tree = self.tree.add(max_priority)  # new samples get the max priority
        self.tree_lock.release()
        assert insertion_pos == insertion_pos_tree

    def sample_batch(self, batch_size):
        mem_idxs = np.empty((batch_size,), dtype=np.int32)
        tree_idxs = np.empty((batch_size,), dtype=np.int32)
        b_ISWeights = np.empty((batch_size, 1), dtype=np.float32)

        # As in the paper, divide the range [0, p_total] into batch_size segments.
        priority_segment = self.tree.total_priority / batch_size

        # Anneal PER_b towards 1 each time a new minibatch is sampled.
        self.PER_b = np.min([1., self.PER_b + self.PER_b_increment_per_sampling])

        # Maximum IS weight, used for normalisation.
        p_min = self.tree.p_min
        assert p_min > 0
        max_weight = (p_min * batch_size) ** (-self.PER_b)
        assert max_weight > 0

        for i in range(batch_size):
            # A value is uniformly sampled from each segment.
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)

            # The experience that corresponds to this value is retrieved.
            assert self.tree.data_pointer > 0
            self.tree_lock.acquire()
            index, priority = self.tree.get_leaf(value)
            self.tree_lock.release()
            assert priority > 0, "### index {}".format(index)

            # P(j)
            sampling_probabilities = priority / self.tree.total_priority

            # IS weight: (1/N * 1/P(i))**b / max_w == (N * P(i))**-b / max_w
            b_ISWeights[i, 0] = batch_size * sampling_probabilities
            assert b_ISWeights[i, 0] > 0
            b_ISWeights[i, 0] = np.power(b_ISWeights[i, 0], -self.PER_b)
            b_ISWeights[i, 0] = b_ISWeights[i, 0] / max_weight

            mem_idxs[i] = index - self.max_size + 1
            tree_idxs[i] = index

        return (self.obs1_buf[mem_idxs],
                self.acts_buf[mem_idxs],
                self.rews_buf[mem_idxs],
                self.obs2_buf[mem_idxs],
                self.done_buf[mem_idxs],
                tree_idxs,
                b_ISWeights)

    def batch_update(self, tree_idx, abs_errors):
        """Update the priorities on the tree."""
        abs_errors += self.PER_e  # avoid zero priority
        clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
        ps = np.power(clipped_errors, self.PER_a)
        self.tree_lock.acquire()
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)
        self.tree_lock.release()
class ReplayBuffer:
    def __init__(self, params):
        buffer_size = params['buffer_size']
        batch_size = params['batch_size']
        mode = params['mode']

        self.__buffer_size = buffer_size
        self.__batch_size = batch_size
        self.__mode = mode
        self.__experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.__memory = SumTree(buffer_size)
        self.__memory_buffer = []

    def get_batch_size(self):
        return self.__batch_size

    def is_ready(self):
        return len(self) >= self.__batch_size

    def add(self, state, action, reward, next_state, done):
        self.__memory_buffer.append(
            self.__experience(state, action, reward, next_state, done))

    def sample(self):
        buf_len = len(self.__memory_buffer)
        mem_len = self.__batch_size - buf_len
        experiences = []
        indices = []
        probs = []

        # Draw the remainder of the batch from the prioritised memory.
        if mem_len:
            for i in range(mem_len):
                s = random.uniform(0, self.__memory.total())
                idx, p, e = self.__memory.get(s)
                experiences.append(e)
                indices.append(idx)
                probs.append(p / self.__memory.total())

        # Add the freshly collected experiences to the tree and record their indices.
        for e in self.__memory_buffer:
            experiences.append(e)
            idx = self.__memory.add(0.0, e)  # default priority of 0 until the first update
            indices.append(idx)
            probs.append(1 / len(self))
        self.__memory_buffer.clear()

        states = torch.from_numpy(
            np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return states, actions, rewards, next_states, dones, indices, probs

    def update(self, indices, p_values):
        for idx, p in zip(indices, p_values):
            self.__memory.update(idx, p)

    def __len__(self):
        return max(len(self.__memory), len(self.__memory_buffer))
class PriorityBuffer:
    # Inspired by implementation from: https://github.com/rlcode/per/blob/master/prioritized_memory.py
    def __init__(self, action_size, agent_config):
        """Initialize a PriorityBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            agent_config: configuration object providing buffer_size, batch_size,
                buffer_epsilon, alpha, beta_start, beta_end and beta_max_steps
        """
        self.action_size = action_size
        self.tree = SumTree(capacity=agent_config.buffer_size)
        self.batch_size = agent_config.batch_size
        # self.seed = random.seed(buffer_config.seed)
        self.epsilon = agent_config.buffer_epsilon
        # alpha controls how much prioritisation is used: 0 = uniform sampling, 1 = priority only.
        self.alpha = agent_config.alpha
        # beta anneals from beta_start to beta_end so importance-sampling corrections grow over training.
        self.beta = agent_config.beta_start
        self.beta_start = agent_config.beta_start
        self.beta_end = agent_config.beta_end
        self.beta_increment_per_sampling = (self.beta_end - self.beta_start) / agent_config.beta_max_steps

    def add(self, sample, error):
        """Add a new experience to memory."""
        p = self._get_priority(error)
        state, action, reward, next_state, done = sample
        e = Experience(state, action, reward, next_state, done)
        self.tree.add(p, e)

    def _get_priority(self, error):
        return (abs(error) + self.epsilon) ** self.alpha

    def sample(self):
        experiences = []
        idxs = []
        priorities = []
        segment = self.tree.total() / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            if isinstance(data, Experience):
                priorities.append(p)
                experiences.append(data)
                idxs.append(idx)
            else:
                print("Warning: sampled an uninitialised leaf; skipping it")

        sampling_probabilities = np.array(priorities) / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        self.beta = np.min([self.beta_end, self.beta + self.beta_increment_per_sampling])

        return (states, actions, rewards, next_states, dones), idxs, is_weight

    def update(self, idx, error):
        # Not required in a plain (uniform) ReplayBuffer.
        self.tree.update(idx, self._get_priority(error))

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.tree)
class PrioritisedReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, alpha, epsilon):
        self.action_size = action_size
        self.tree = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.alpha = alpha
        self.epsilon = epsilon

    def add(self, error, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        p = self._get_priority(error)
        self.tree.add(p, e)

    def sample(self, beta):
        # Split the priority range into segments so the batch covers the whole
        # distribution and avoids duplicate samples.
        segment = self.tree.total() / self.batch_size
        experiences = []
        priorities = []
        idxs = []

        for i in range(self.batch_size):
            start = segment * i
            end = segment * (i + 1)
            s = random.uniform(start, end)
            idx, p, e = self.tree.get(s)
            if e:
                priorities.append(p)
                experiences.append(e)
                idxs.append(idx)

        probs = np.array(priorities) / self.tree.total()  # sampling probabilities P(i)
        weights = np.power(self.tree.n_entries * probs, -beta)
        weights /= weights.max()  # scale so the maximum weight is 1

        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(weights).float().to(device)

        return (states, actions, rewards, next_states, dones, weights, idxs)

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def _get_priority(self, error):
        return (np.abs(error) + self.epsilon) ** self.alpha

    def __len__(self):
        """Return the current size of internal memory."""
        return self.tree.n_entries
class Memory(object):
    """
    This SumTree code is a modified version; the original code is from:
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    """
    beta = MEMORY_BETA

    def __init__(self):
        self.limit = MEMORY_CAPACITY
        self.err_tree = SumTree(MEMORY_CAPACITY)
        self.action_shape = (0, MEMORY_ACTION_CNT)
        self.reward_shape = (0, MEMORY_REWARD_CNT)
        self.terminal_shape = self.action_shape
        self.observation_shape = (0, MEMORY_CRITIC_FEATURE_NUM)
        self.store_times = 0
        self.Transition = namedtuple(
            'Transition',
            ('state', 'action', 'reward', 'next_state', 'terminal'))

    def size(self):
        return self.limit if self.store_times > self.limit else self.store_times

    def sample(self, batch_size):
        idxes = np.empty(self.reward_shape, dtype=np.int32)
        isw = np.empty(self.reward_shape, dtype=np.float32)
        obs0 = np.empty(self.observation_shape, dtype=np.float32)
        obs1 = np.empty(self.observation_shape, dtype=np.float32)
        actions = np.empty(self.action_shape, dtype=np.float32)
        rewards = np.empty(self.reward_shape, dtype=np.float32)
        terminals = np.empty(self.terminal_shape, dtype=bool)
        nan_state = np.array([np.nan] * self.observation_shape[1])
        self.beta = np.min([1., self.beta + MEMORY_BETA_INC_RATE])  # capped at 1
        max_td_err = np.max(self.err_tree.tree[-self.err_tree.capacity:])
        idx_set = set()
        # Sample at most batch_size * 2 times to collect batch_size distinct transitions.
        for i in range(batch_size * 2):
            v = np.random.uniform(0, self.err_tree.total_p)
            idx, td_err, trans = self.err_tree.get_leaf(v)
            if batch_size == len(idx_set):
                break
            if idx not in idx_set:
                idx_set.add(idx)
            else:
                continue
            if (trans.state == 0).all():
                continue
            idxes = np.row_stack((idxes, np.array([idx])))
            isw = np.row_stack((isw, np.array([
                np.power(self._getPriority(td_err) / max_td_err, -self.beta)
            ])))
            obs0 = np.row_stack((obs0, trans.state))
            obs1 = np.row_stack(
                (obs1, nan_state if trans.terminal.all() else trans.next_state))
            actions = np.row_stack((actions, trans.action))
            rewards = np.row_stack((rewards, trans.reward))
            terminals = np.row_stack((terminals, trans.terminal))

        result = {
            'obs0': array_min2d(obs0),
            'actions': array_min2d(actions),
            'rewards': array_min2d(rewards),
            'obs1': array_min2d(obs1),
            'terminals': array_min2d(terminals),
        }
        return idxes, result, isw

    def _getPriority(self, error):
        return (error + EPSILON) ** MEMORY_ALPHA

    def append(self, obs0, action, reward, obs1, terminal, err, training=True):
        if not training:
            return
        trans = self.Transition(obs0, action, reward, obs1, terminal)
        self.err_tree.add(self._getPriority(err), trans)
        self.store_times += 1

    def batch_update(self, tree_idx, errs):
        errs = np.abs(errs) + EPSILON  # take absolute values and avoid zero priority
        ps = np.power(errs, MEMORY_ALPHA)
        for ti, p in zip(tree_idx, ps):
            self.err_tree.update(ti, p[0])

    @property
    def nb_entries(self):
        return self.store_times
class MemoryDB:  # stored as (s, a, r, s_) in a SumTree, backed by MongoDB
    e = 0.01
    a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001
    capacity = 100000
    max_priority = 1

    def __init__(self, host_name, db_name, collection_name):
        self.host_name = host_name
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = MongoClient(host_name, 27017)
        self.db = self.client[db_name]
        self.replay_memory_collection = self.db[collection_name]
        self.sum_tree = SumTree(self.capacity)
        # Rebuild the sum tree from the priorities already stored in the collection.
        memory_priorities = self.replay_memory_collection.find({}, {"priority": 1})
        for memory_priority in memory_priorities:
            self.sum_tree.add(memory_priority["priority"],
                              {"_id": memory_priority["_id"]})

    def retrieve_by_id(self, id):
        db_experiences = self.replay_memory_collection.find({"_id": id})
        return {
            **_pickle.loads(db_experiences[0]['binary'], encoding='latin1'),
            "_id": id
        }

    def _get_priority(self, error):
        return (error + self.e) ** self.a

    def add(self, error, experience):
        p = self._get_priority(error)
        experience_to_save = {}
        experience_to_save["terminal"] = experience["terminal"]
        experience_to_save["action_index"] = experience["action_index"]
        experience_to_save["actual_reward"] = experience["actual_reward"]
        experience_to_save["priority"] = self.max_priority
        experience_to_save["binary"] = _pickle.dumps(experience)
        id = self.replay_memory_collection.insert(experience_to_save)
        self.sum_tree.add(p, {"_id": id})

    def add_batch(self, experiences):
        for experience in experiences:
            self.add(self.max_priority, experience)

    def update(self, index, error, experience):
        p = self._get_priority(error)
        self.replay_memory_collection.update_one({"_id": experience["_id"]},
                                                 {"$set": {"priority": p}})
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        return self.replay_memory_collection.count()

    def sample(self, n):
        batch = []
        idxs = []
        priorities = []
        segment = self.sum_tree.total() / n
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            experience = self.retrieve_by_id(data["_id"])
            batch.append(experience)
            idxs.append(idx)
        sampling_probabilities = np.array(priorities) / self.sum_tree.total()
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight
class PrioritizedReplayBuffer:
    """
    Memory buffer responsible for Prioritized Experience Replay.

    This buffer stores up to memory_size experiences in a circular array-like data
    structure. Each experience is also associated with a probability weight. Batches
    may be sampled (with replacement) from this implied probability distribution.
    The provided weights should be non-negative, but are not required to add up to 1.
    """

    def __init__(self, device, memory_size, update_every=4, seed=0):
        """
        Initializes the data structure

        :param device: (torch.device) Object representing the device where to allocate tensors
        :param memory_size: (int) Maximum capacity of memory buffer
        :param update_every: (int) Number of steps between update operations
        :param seed: (int) Seed used for PRNG
        """
        self.device = device
        self.probability_weights = SumTree(capacity=memory_size, seed=seed)
        self.elements = deque(maxlen=memory_size)
        self.update_every = update_every
        self.step = 0
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """
        Adds an experience tuple (s, a, r, s', done) to memory

        :param state: (array-like) State value from experience tuple
        :param action: (int) Action value from experience tuple
        :param reward: (float) Reward value from experience tuple
        :param next_state: (array-like) Next state value from experience tuple
        :param done: (bool) Done flag from experience tuple
        """
        e = self.experience(state, action, reward, next_state, done)
        self.elements.append(e)
        self.step += 1
        # Add the batch of new experiences to memory, with max initial weight
        if self.step >= self.update_every:
            self.probability_weights.add(self.step)
            self.step = 0

    def sample(self, batch_size, alpha, beta):
        """
        Samples a batch of examples with replacement from the buffer.

        :param batch_size: (int) Number of samples to sample
        :param alpha: (float) PER probability hyperparameter
        :param beta: (float) PER probability hyperparameter
        :return:
            states: (list) States from sampled experiences
            actions: (list) Actions from sampled experiences
            rewards: (list) Rewards from sampled experiences
            next_states: (list) Next states from sampled experiences
            dones: (list) Done flags from sampled experiences
            indexes: (list) Indexes of sampled experiences
        """
        indexes = self.probability_weights.sample(batch_size=batch_size, alpha=alpha, beta=beta)
        experiences = [self.elements[i] for i in indexes]

        # Copy experience tensors to device
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)

        return states, actions, rewards, next_states, dones, indexes

    def update(self, indexes, weights):
        """
        Updates the probability weights associated with the provided indexes.

        :param indexes: (array indexes) Indexes to have weights updated
        :param weights: (list) New weights for the provided indexes
        """
        self.probability_weights.update(indexes, weights)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.probability_weights)
class PrioritizedReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            seed (int): random seed
        """
        self.memory = SumTree(buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
        # epsilon: small amount to avoid zero priority
        # alpha: [0~1] determines how much prioritization is used; with 0 we get the uniform case
        # beta: controls importance-sampling compensation; it fully compensates for the non-uniform
        #       probabilities when beta=1. The unbiased nature of the updates is most important near
        #       convergence at the end of training, so we define a schedule on the exponent beta that
        #       starts from an initial value and reaches 1 only at the end of learning.
        self.epsilon = 0.01
        self.alpha = 0.6
        beta_start = 0.4
        self.beta_end = 1.0
        self.beta = beta_start
        beta_increments = 200
        self.beta_increment = (self.beta_end - beta_start) / beta_increments

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        experience = self.experience(state, action, reward, next_state, done)
        p = self.memory.max_p()
        if p == 0:
            p = 1.0
        self.memory.add(p=p, data=experience)

    def sample(self, n):
        """Sample a batch of experiences from memory, one from each priority segment."""
        experiences = []
        indices = []
        priorities = []
        segment = self.memory.total_p() / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, experience) = self.memory.get(s)
            experiences.append(experience)
            indices.append(idx)
            priorities.append(p)

        priorities = np.array(priorities, dtype=np.float64)
        indices = np.array(indices, dtype=np.int32)
        probs = priorities / self.memory.total_p()

        # importance-sampling (IS) weights, normalised by the maximum weight
        w_is = (self.memory.capacity * probs) ** (-self.beta)
        w_is_normalized = w_is / w_is.max()

        return experiences, indices, w_is_normalized

    def update_errors(self, indices, errors):
        priorities = [self._to_priority(e) for e in errors]
        for (idx, p) in zip(indices, priorities):
            self.memory.update(idx, p)

    def _to_priority(self, error):
        return (error + self.epsilon) ** self.alpha

    def increase_beta(self):
        if self.beta < self.beta_end:
            self.beta = min(self.beta_end, self.beta + self.beta_increment)

    def __len__(self):
        return len(self.memory)
class PrioritizedExperienceReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    alpha = 0.6
    beta = 0.4
    beta_increment_per_sample = 0.001
    epsilon = 1e-6

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def compute_priority(self, td_error):
        return (td_error + self.epsilon) ** self.alpha

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        experience = self.experience(state, action, reward, next_state, done)
        max_priority = np.max(self.memory.tree[-self.memory.capacity:])
        if max_priority == 0:
            max_priority = 1.
        self.memory.add(max_priority, experience)

    def update(self, index, td_error):
        priority = self.compute_priority(td_error)
        self.memory.update(index, priority)

    def sample(self):
        """
        :return: importance weights, indices of sampled experiences, and sampled batch of experiences
        """
        self.beta = np.minimum(1., self.beta + self.beta_increment_per_sample)
        segment = self.memory.total() / self.batch_size
        indexes = []
        priorities = []
        experiences = []
        for i in range(self.batch_size):
            # pick a segment
            a = segment * i
            b = segment * (i + 1)
            s = np.random.uniform(a, b)
            index, priority, experience = self.memory.get(s)
            indexes.append(index)
            priorities.append(priority)
            experiences.append(experience)

        sampling_probs = np.divide(priorities, self.memory.total())

        # importance sampling
        i_s_weights = (self.batch_size * sampling_probs) ** -self.beta
        i_s_weights /= np.max(i_s_weights)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return i_s_weights, indexes, (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return self.memory.count
class ReplayMemory:
    def __init__(self, memory_size):
        self.memory_size = memory_size
        self.memory = SumTree(memory_size)
        self.epsilon = 0.0001  # small amount to avoid zero priority
        self.alpha = 0.6  # adj_pri = pri^alpha
        self.beta = 0.4  # importance-sampling exponent, annealed from its initial value to 1
        self.beta_max = 1
        self.beta_increment_per_sampling = 0.001
        self.abs_err_upper = 1.  # clipped TD error

    def add(self, row):
        max_p = np.max(self.memory.tree[-self.memory.capacity:])  # max adj_pri among the leaves
        if max_p == 0:
            max_p = self.abs_err_upper
        self.memory.add(max_p, row)  # new rows get the current max adj_pri

    def get_batch(self, batch_size):
        leaf_idx = np.empty(batch_size, dtype=np.int32)
        batch_memory = np.empty(batch_size, dtype=object)
        ISWeights = np.empty(batch_size)

        pri_seg = self.memory.total_p / batch_size  # adj_pri segment length
        self.beta = np.min([self.beta_max, self.beta + self.beta_increment_per_sampling])  # capped at 1

        # P(i) = adj_pri(i) / sum_i adj_pri(i)
        # ISWeight_j = (N * P(j))^(-beta) / max_i[(N * P(i))^(-beta)] = (P(j) / min_i P(i))^(-beta)
        min_prob = np.min(
            self.memory.tree[self.memory.capacity - 1:
                             self.memory.capacity - 1 + self.memory.counter]) / self.memory.total_p

        for i in range(batch_size):  # sample one value from each interval
            a, b = pri_seg * i, pri_seg * (i + 1)
            v = np.random.uniform(a, b)
            idx, p, data = self.memory.get_leaf(v)
            prob = p / self.memory.total_p
            ISWeights[i] = np.power(prob / min_prob, -self.beta)
            leaf_idx[i], batch_memory[i] = idx, data

        return leaf_idx, batch_memory, ISWeights

    def update_sum_tree(self, tree_idx, td_errors):
        for ti, td_error in zip(tree_idx, td_errors):
            p = self._calculate_priority(td_error)
            self.memory.update(ti, p)

    def _calculate_priority(self, td_error):
        priority = abs(td_error) + self.epsilon
        clipped_pri = np.minimum(priority, self.abs_err_upper)
        return np.power(clipped_pri, self.alpha)

    @property
    def length(self):
        return self.memory.counter

    def load_memory(self, memory):
        self.memory = memory

    def get_memory(self):
        return self.memory
class Memory(object):
    def __init__(self,
                 capacity,
                 state_size=37,
                 epsilon=0.001,
                 alpha=0.4,
                 beta=0.3,
                 beta_increment_per_sampling=0.001,
                 abs_err_upper=1):
        self.tree = SumTree(capacity)
        self.epsilon = epsilon  # avoid zero priority, which would never be sampled stochastically
        self.alpha = alpha  # trade-off between priority and randomness: 0 = uniform, 1 = pure priority
        self.beta = beta  # importance-sampling exponent, annealed from small to 1 so late corrections matter more
        self.beta_increment_per_sampling = beta_increment_per_sampling
        self.abs_err_upper = abs_err_upper  # clipped abs error
        self.state_size = state_size

    # Save experience in memory
    def store(self, state, action, reward, next_state, done):
        transition = [state, action, reward, next_state, done]
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        # If there is no priority yet, fall back to the clipped abs error.
        if max_p == 0:
            max_p = self.abs_err_upper
        self.tree.add(max_p, transition)  # set the max p for the new entry

    # Sample n experiences using prioritized experience replay
    def sample(self, n):
        b_idx = np.empty((n,), dtype=np.int32)
        states = np.empty((n, self.state_size))
        actions = np.empty((n,))
        rewards = np.empty((n,))
        next_states = np.empty((n, self.state_size))
        dones = np.empty((n,))
        ISWeights = np.empty((n,))  # IS -> importance sampling

        pri_seg = self.tree.total_p / n  # priority segment length
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])  # anneal beta towards 1
        # min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p  # for normalising ISWeights

        for i in range(n):
            a, b = pri_seg * i, pri_seg * (i + 1)
            v = np.random.uniform(a, b)
            idx, p, data = self.tree.get_leaf(v)
            prob = p / self.tree.total_p
            ISWeights[i] = np.power(prob, -self.beta)
            b_idx[i] = idx
            states[i, :] = data[0]
            actions[i] = data[1]
            rewards[i] = data[2]
            next_states[i, :] = data[3]
            dones[i] = data[4]

        states = torch.from_numpy(np.vstack(states)).float().to(device)
        actions = torch.from_numpy(np.vstack(actions)).long().to(device)
        rewards = torch.from_numpy(np.vstack(rewards)).float().to(device)
        next_states = torch.from_numpy(np.vstack(next_states)).float().to(device)
        dones = torch.from_numpy(np.vstack(dones).astype(np.uint8)).float().to(device)
        ISWeights = torch.from_numpy(np.vstack(ISWeights)).float().to(device)

        return b_idx, states, actions, rewards, next_states, dones, ISWeights

    # Update the priorities according to the new errors
    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.epsilon  # avoid zero priority
        clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

    def __len__(self):
        return self.tree.length()
class PrioritizeReplayBuffer(ReplayBuffer):
    """Prioritize experience replay."""

    def __init__(self,
                 buffer_size,
                 batch_size,
                 seed,
                 beta_start=0.4,
                 delta_beta=1e-5,
                 alpha=0.6,
                 eps=1e-8):
        """Initialize PER.

        Args:
            buffer_size (int): Size of replay buffer. The actual size will be the
                first power of 2 greater than buffer_size.
            batch_size (int): Size of batches to draw.
            seed (float): Seed.
            beta_start (float): Initial value for beta (importance sampling exponent).
            delta_beta (float): Beta increment at each time step.
            alpha (float): Priority exponent.
            eps (float): Small positive number so zero-priority examples can still be sampled.
        """
        # Depth of the sum tree
        depth = int(math.log2(buffer_size)) + 1
        super(PrioritizeReplayBuffer, self).__init__(2 ** depth, batch_size, seed)
        # Sum tree keeping track of the sum of priorities
        self.priorities = SumTree(depth)
        # Current max priority
        self.max_p = 1.0
        # PER parameters
        self.alpha = alpha
        self.eps = eps
        self.beta = beta_start
        self.delta_beta = delta_beta

    def add(self, state, action, reward, next_state, done):
        """Add a transition to the replay buffer."""
        # Add to the sum tree with the current max priority
        self.priorities.add(self.max_p, self.index)
        super().add(state, action, reward, next_state, done)

    def sample(self):
        """Get a sample."""
        # Get indices to sample from the sum tree and store them
        # to compute importance sampling weights later.
        self.last_indices = self.priorities.sample(self.batch_size)
        # Return the transitions corresponding to these indices
        return [self.data[i] for i in self.last_indices]

    def update_priorities(self, td_error):
        """Update priorities."""
        # Compute new priorities
        new_priorities = (abs(td_error) + self.eps) ** self.alpha
        # Update the sum tree
        self.priorities.update(self.last_indices, new_priorities)
        # Update the current max priority
        self.max_p = max(self.max_p, max(new_priorities))

    def importance_sampling(self):
        """Compute importance sampling weights of the last sample."""
        # Sampling probabilities
        probs = self.priorities.get(self.last_indices) / self.priorities.total_sum
        # Compute weights, normalised by the maximum weight
        weights = (len(self) * probs) ** (-self.beta)
        weights /= max(weights)
        # Anneal beta towards 1
        self.beta = min(self.beta + self.delta_beta, 1)
        return weights
class PrioritisedReplayBuffer():
    """A prioritised replay buffer.

    Creates a sum tree and uses it to store a fixed number of experience tuples.
    When sampled, experiences are returned with greater priority given to those
    with the highest absolute TD-error.
    """

    def __init__(self,
                 buffer_size,
                 alpha,
                 beta_zero,
                 beta_increment_size=0.001,
                 epsilon=0.1,
                 max_priority=1.,
                 seed=None):
        """Priority replay buffer initialiser.

        Args:
            buffer_size (int): capacity of the replay buffer.
            alpha (float): priority scaling hyperparameter.
            beta_zero (float): importance sampling scaling hyperparameter.
            beta_increment_size (float): beta annealing rate.
            epsilon (float): base priority to ensure non-zero sampling probability.
            max_priority (float): initial maximum priority.
            seed (int): seed for random number generator.
        """
        random.seed(seed)
        self.sum_tree = SumTree(buffer_size)
        self.memory = {}
        self.experience = namedtuple(
            "experience", ["state", "action", "reward", "next_state", "done"])
        self.buffer_size = buffer_size
        self.beta_increment_size = beta_increment_size
        self.max_priority = max_priority ** alpha
        self.min_priority = max_priority ** alpha
        self.last_min_update = 0
        self.alpha = alpha
        self.beta = beta_zero
        self.epsilon = epsilon

    def add(self, state, action, reward, next_state, done):
        """Creates an experience tuple and adds it to the replay buffer."""
        experience = self.experience(state, action, reward, next_state, done)
        current_tree_idx = self.sum_tree.input_pointer
        self.memory[current_tree_idx] = experience
        self.sum_tree.add(self.max_priority)

    def sample(self, batch_size):
        """Returns a batch of experiences sampled according to their priority."""
        idx_list = []
        weights = []
        states = []
        actions = []
        rewards = []
        next_states = []
        done_list = []

        segment = self.sum_tree.total() / batch_size
        sample_list = [
            random.uniform(segment * i, segment * (i + 1))
            for i in range(batch_size)
        ]
        max_weight = self.min_priority ** (-self.beta)

        for s in sample_list:
            idx, priority = self.sum_tree.sample(s)
            idx_list.append(idx)
            weight = priority ** (-self.beta) / max_weight
            weights.append(weight)

            state, action, reward, next_state, done = self.memory[idx]
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            done_list.append(done)

        return states, actions, rewards, next_states, done_list, idx_list, weights

    def update(self, idx_list, td_error):
        """Updates the priorities of the sampled experiences."""
        priority_list = (td_error + self.epsilon) ** self.alpha
        self.max_priority = max(self.max_priority, priority_list.max())

        # Track the minimum priority, refreshing it from the tree if it has not
        # been lowered for a full pass over the buffer.
        list_min_priority = priority_list.min()
        if list_min_priority <= self.min_priority:
            self.min_priority = list_min_priority
            self.last_min_update = 0
        else:
            self.last_min_update += 1
            if self.last_min_update >= self.buffer_size:
                self.min_priority = np.array([
                    node.val
                    for node in self.sum_tree.tree_array[-self.buffer_size:]
                ]).min()
                self.last_min_update = 0

        for i, idx in enumerate(idx_list):
            priority = min(self.max_priority, priority_list[i])
            self.sum_tree.update(idx, priority)

        self.beta = min(1, self.beta + self.beta_increment_size)

    def __len__(self):
        """Return the number of experiences in the replay buffer."""
        return len(self.memory)