class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, buffer_size, alpha):
        self.capacity = buffer_size
        self.tree = SumTree(buffer_size)
        self.alpha = alpha
        self.max_priority = 1
        #self.beta_initial = ??
        #self.beta_steps = ??

    def add(self, experience):
        self.tree.add(self.max_priority, experience)

    def update(self, index, experience, td_error):
        priority = (abs(td_error) + 0.0001) ** self.alpha
        self.tree.update(index, priority)
        if self.max_priority < priority:
            self.max_priority = priority

    def sample(self, batch_size):
        indexes = []
        batchs = []
        total = self.tree.total()
        section = total / batch_size
        for i in range(batch_size):
            r = section * i + np.random.random() * section
            (idx, priority, experience) = self.tree.get(r)
            indexes.append(idx)  # kept so the priorities can be updated later
            batchs.append(experience)
        return (indexes, batchs)
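All of the classes in this section delegate storage and proportional sampling to a SumTree that is not shown here. The sketch below is a minimal array-backed version with the interface most of the snippets call (add, update, get, total, n_entries); several snippets expect extra methods (max(), total_sum(), getCount(), ...) or slightly different signatures, so treat this as an illustrative assumption rather than the tree any particular snippet actually shipped with.

import numpy as np


class SumTree:
    """Array-backed sum tree: leaves hold priorities, internal nodes hold partial sums."""

    def __init__(self, capacity):
        self.capacity = capacity                      # max number of stored transitions
        self.tree = np.zeros(2 * capacity - 1)        # priorities plus their partial sums
        self.data = np.zeros(capacity, dtype=object)  # transitions, circular buffer
        self.write = 0                                # next slot to overwrite
        self.n_entries = 0

    def _propagate(self, idx, change):
        """Push a priority change from a leaf up to the root."""
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        """Walk down the tree to the leaf whose cumulative sum covers s."""
        left = 2 * idx + 1
        right = left + 1
        if left >= len(self.tree):    # no children: idx is a leaf
            return idx
        if s <= self.tree[left]:
            return self._retrieve(left, s)
        return self._retrieve(right, s - self.tree[left])

    def total(self):
        return self.tree[0]           # root holds the sum of all priorities

    def add(self, p, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, p)
        self.write = (self.write + 1) % self.capacity
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, idx, p):
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        """Return (tree index, priority, stored data) for cumulative sum s."""
        idx = self._retrieve(0, s)
        data_idx = idx - self.capacity + 1
        return (idx, self.tree[idx], self.data[data_idx])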
class Memory:
    e = 0.01
    a = 0.6

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _getPriority(self, error):
        return (error + self.e) ** self.a

    def add(self, error, sample):
        p = self._getPriority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        segment = self.tree.total() / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        p = self._getPriority(error)
        self.tree.update(idx, p)
class PrioritisedMemory(object):
    def __init__(self, alpha, beta, beta_end, epsilon, num_steps, replay_size):
        self.alpha = alpha
        self.beta_start = beta
        self.beta_end = beta_end
        self.beta = beta
        self.epsilon = epsilon
        self.num_steps = num_steps
        self.memory = SumTree(replay_size)
        self.replay_size = replay_size

    def proportional_priority(self, td_error):
        return (np.abs(td_error) + self.epsilon) ** self.alpha

    def add_memory(self, td_error, data):
        priority = self.proportional_priority(td_error)
        self.memory.add_memory(data, priority)
        self.beta = np.min([
            1.0,
            self.beta + (self.beta_end - self.beta_start) / self.num_steps
        ])

    def update_priority(self, index, td_error):
        new_priority = self.proportional_priority(td_error)
        self.memory.update_priority(index, new_priority)

    def minibatch_sample(self, minibatch_size):
        samples = []
        priorities = []
        priority_indexes = []
        interval = self.memory.priority_total() / minibatch_size
        for i in range(minibatch_size):
            sample = np.random.uniform(i * interval, (i + 1) * interval)
            priority_index, priority, data = self.memory.get(sample)
            samples.append(data)
            priorities.append(priority)
            priority_indexes.append(priority_index)
        sampling_probabilities = np.array(priorities) / self.memory.priority_total()
        importance_weights = np.power(
            self.replay_size * sampling_probabilities, -self.beta)
        importance_weights /= np.max(importance_weights)
        return priority_indexes, samples, importance_weights
class PriorityMemory:
    def __init__(self, capacity):
        """ Instantiate a priority-based memory capable of holding `capacity`
        experiences. Memories are sampled with a frequency based on their priority.
        """
        # Circular-buffer, array-based tree with priorities as node values.
        self.tree = SumTree(capacity)
        self.e = 0.01  # Small constant to ensure all priorities > 0
        self.a = 0.6   # Constant to control the weight of error on priority

    def _getPriority(self, error):
        """ Convert error to a priority based on the constants "e" and "a". """
        return (error + self.e) ** self.a

    def add(self, experience, error):
        """ Add an experience to memory. """
        p = self._getPriority(error)
        self.tree.add(p, experience)

    def sample(self, n):
        """ Sample n experiences from memory. Experience selection frequency
        is based on priority.
        Returns:
            - mini_batch: Sequence containing the experiences.
            - indices: The index of the node associated with each experience
              so that its priority can be updated.
        """
        mini_batch = []
        indices = []
        segment = self.tree.total() / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, _, experience) = self.tree.get(s)
            mini_batch.append(experience)
            indices.append(idx)
        return mini_batch, indices

    def update(self, idx, error):
        """ Update the priority associated with a memory. """
        p = self._getPriority(error)
        self.tree.update(idx, p)
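A short, hypothetical round trip through the PriorityMemory above, assuming a SumTree like the sketch near the top of this section plus the usual random import; the transition contents and error values are placeholders, not part of the original snippet.

import random

memory = PriorityMemory(capacity=1024)

# Fill the memory: each experience arrives with some (placeholder) error.
for t in range(200):
    memory.add(experience=("s", "a", 0.1, "s_next", False), error=random.random())

# Draw a prioritised mini-batch, then write back re-computed errors
# for exactly the leaves that were sampled.
mini_batch, indices = memory.sample(n=8)
for idx in indices:
    memory.update(idx, error=0.5)  # e.g. the freshly computed |TD error|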
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    def __init__(self, capacity, alpha=0.6, beta=0.4, beta_anneal_step=0.001,
                 epsilon=0.00000001):
        tree_capacity = 1
        while tree_capacity < capacity:
            tree_capacity *= 2
        self.tree = SumTree(capacity)
        self.capacity = capacity
        self.a = alpha
        self.beta = beta
        self.beta_increment_per_sampling = beta_anneal_step
        self.e = epsilon

    def _get_priority(self, error):
        # Direct proportional prioritization
        return (np.abs(error) + self.e) ** self.a

    def add(self, error, sample):
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            data = 0
            while data == 0:
                s = random.uniform(a, b)
                (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = priorities / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight

    def step(self):
        self.beta = np.min(
            [1. - self.e, self.beta + self.beta_increment_per_sampling])

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)
class MemoryDB:  # stored as ( s, a, r, s_ ) in SumTree
    def __init__(self, e, a, beta, beta_increment_per_sampling, capacity,
                 max_priority):
        self.capacity = capacity
        self.e = e
        self.a = a
        self.beta = beta
        self.beta_increment_per_sampling = beta_increment_per_sampling
        self.max_priority = max_priority
        self.sum_tree = SumTree(self.capacity)

    def _get_priority(self, error):
        return min((self.max_priority, (error + self.e) ** self.a))

    def add(self, experience, error=None):
        p = self._get_priority(error) if error is not None else self.max_priority
        self.sum_tree.add(p, experience)

    def add_batch(self, experiences):
        for experience in experiences:
            self.add(experience, self.max_priority)

    def update(self, index, error, experience):
        p = self._get_priority(error)
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        return self.sum_tree.getCount()

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.sum_tree.total() / n
        priorities = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = priorities / self.sum_tree.total()
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    e = 1e-10
    a = 0.5
    beta = 0.4
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        return (error + self.e) ** self.a

    def append(self, data):
        error, sample = data
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = priorities / (self.tree.total() + 1e-10)
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def __len__(self):
        return self.tree.n_entries
class PrioritizedReplayMemory:
    def __init__(self, capacity, alpha=0.6, eps=1e-2):
        self.tree = SumTree(capacity)
        self.alpha = alpha  # alpha determines how much prioritization is used
        self.eps = eps      # epsilon smooths priority, priority = (TD_error + eps) ** alpha

    def _get_priority(self, td_error):
        return (td_error + self.eps) ** self.alpha

    def current_length(self):
        return self.tree.current_length()

    def total_sum(self):
        return self.tree.total_sum()

    def push(self, event, td_error):
        priority = self._get_priority(td_error)
        self.tree.insert(event, priority)

    def sample(self, batch_sz):
        batch = []
        indices = []
        priorities = []
        segment = self.tree.total_sum() / batch_sz
        for i in range(batch_sz):
            l = segment * i
            r = segment * (i + 1)
            s = random.uniform(l, r)
            (idx, priority, data) = self.tree.get(s)
            batch.append(data)
            indices.append(idx)
            priorities.append(priority)
        samples = map(np.array, zip(*batch))
        return samples, indices, priorities

    def update(self, idx, td_error):
        if isinstance(idx, list):
            for i in range(len(idx)):
                priority = self._get_priority(td_error[i])
                self.tree.update(idx[i], priority)
        else:
            priority = self._get_priority(td_error)
            self.tree.update(idx, priority)
class PERMemory:
    EPSILON = 0.0001
    ALPHA = 0.5
    BETA = 0.4
    size = 0

    def __init__(self, config, capacity):
        self.config = config
        self.capacity = capacity
        self.tree = SumTree(capacity)

    def _getPriority(self, td_error):
        return (td_error + self.EPSILON) ** self.ALPHA

    def push(self, transition):
        self.size += 1
        priority = self.tree.max()
        if priority <= 0:
            priority = 1
        self.tree.add(priority, transition)

    def sample(self, size, episode):
        batch = []
        indexes = []
        weights = np.empty(size, dtype='float32')
        total = self.tree.total()
        beta = self.BETA + (1 - self.BETA) * episode / self.config.num_episodes
        beta = min(1.0, beta)
        for i, rand in enumerate(np.random.uniform(0, total, size)):
            (idx, priority, data) = self.tree.get(rand)
            batch.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total) ** (-beta)
        return (indexes, batch, weights / weights.max())

    def update(self, idx, td_error):
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)

    def __len__(self):
        return self.size
class PrioritizeReplayBuffer(ReplayBuffer):
    # Based on https://github.com/y-kamiya/machine-learning-samples/blob/7b6792ce37cc69051e9053afeddc6d485ad34e79/python3/reinforcement/dqn/agent.py
    EPSILON = 0.0001
    ALPHA = 0.6
    BETA = 0.4
    size = 0

    def __init__(self, capacity):
        super().__init__(capacity=capacity)
        self.td_error_epsilon = 0.0001
        self.tree = SumTree(capacity)

    def __len__(self):
        return self.size

    def _getPriority(self, td_error):
        return (td_error + self.EPSILON) ** self.ALPHA

    def push(self, state, action, done, next_state, reward, p_index):
        self.size += 1
        transition = self.Transition(state, action, done, next_state, reward,
                                     p_index)
        priority = self.tree.max()
        if priority <= 0:
            priority = 1
        self.tree.add(priority, transition)

    def sample(self, batch_size, episode):
        batch = []
        indexes = []
        weights = np.empty(batch_size, dtype='float32')
        total = self.tree.total()
        beta = self.BETA + (1 - self.BETA) * episode  # episode / self.config.num_episodes
        for i, rand in enumerate(np.random.uniform(0, total, batch_size)):
            (idx, priority, data) = self.tree.get(rand)
            batch.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total) ** (-beta)
        return (indexes, batch, weights / weights.max())

    def update(self, idx, td_error):
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)
class PrioritizedMemory:
    e = 0.01
    a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        return (np.abs(error) + self.e) ** self.a

    def push(self, error, sample):
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = priorities / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)
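A hypothetical training-loop interaction with the PrioritizedMemory above, assuming the class is on the same module as a SumTree like the earlier sketch and the usual numpy/random imports. The transition contents and TD errors are stand-ins, not part of the original snippet: new transitions are pushed with an initial error, batches come back with tree indices and importance-sampling weights, and the sampled leaves are re-prioritised with fresh errors.

import random

import numpy as np

memory = PrioritizedMemory(capacity=2**14)

for step in range(500):
    transition = ("s", "a", 0.0, "s_next", False)  # placeholder (s, a, r, s', done)
    memory.push(error=1.0, sample=transition)      # new samples enter with a large error

    if step >= 64:
        batch, idxs, is_weights = memory.sample(32)
        # In a real agent the new errors come from the learner, e.g. |Q_target - Q|;
        # random values stand in for them here.
        td_errors = np.abs(np.random.randn(32))
        for idx, err in zip(idxs, td_errors):
            memory.update(idx, err)
        # is_weights would scale the per-sample loss, e.g.
        # loss = (is_weights * td_errors ** 2).mean()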
class Memory(object):
    e = 0.05

    def __init__(self, capacity, pr_scale):
        self.capacity = capacity
        self.memory = ST(self.capacity)
        self.pr_scale = pr_scale
        self.max_pr = 0

    def get_priority(self, error):
        return (error + self.e) ** self.pr_scale

    def remember(self, sample, error):
        p = self.get_priority(error)
        self_max = max(self.max_pr, p)
        self.memory.add(self_max, sample)

    def sample(self, n):
        sample_batch = []
        sample_batch_indices = []
        sample_batch_priorities = []
        num_segments = self.memory.total() / n
        for i in range(n):
            left = num_segments * i
            right = num_segments * (i + 1)
            s = random.uniform(left, right)
            idx, pr, data = self.memory.get(s)
            sample_batch.append((idx, data))
            sample_batch_indices.append(idx)
            sample_batch_priorities.append(pr)
        return [sample_batch, sample_batch_indices, sample_batch_priorities]

    def update(self, batch_indices, errors):
        for i in range(len(batch_indices)):
            p = self.get_priority(errors[i])
            self.memory.update(batch_indices[i], p)
class Replay_Memory:
    def __init__(self):
        global MEMORY_LEN
        self.tree = SumTree(MEMORY_LEN)

    def add(self, error, sample):
        global MEMORY_BIAS, MEMORY_POW
        priority = (error + MEMORY_BIAS) ** MEMORY_POW
        self.tree.add(priority, sample)

    def sample(self):
        """
        Get a sample batch of the replay memory
        Returns:
            batch: a batch with one sample from each segment of the memory
        """
        global BATCH_SIZE
        batch = []
        # We want one representative of all distribution segments in the batch,
        # e.g. BATCH_SIZE=2: the batch contains one sample from [min, median]
        # and one from [median, max].
        segment = self.tree.total() / BATCH_SIZE
        for i in range(BATCH_SIZE):
            minimum = segment * i
            maximum = segment * (i + 1)
            s = random.uniform(minimum, maximum)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        """
        Updates one entry in the replay memory
        Args:
            idx: the position of the outdated transition in the memory
            error: the newly calculated error
        """
        priority = (error + MEMORY_BIAS) ** MEMORY_POW
        self.tree.update(idx, priority)
class ReplayMemory(object):
    def __init__(self, max_size, alpha, eps):
        self.max_size = max_size
        self.alpha = alpha
        self.eps = eps
        self.tree = SumTree(max_size)
        self.last_idxs = None
        self.size = 0

    def get_batch(self, batch_size):
        self.last_idxs = []
        ret = []
        for i in range(min(batch_size, self.size)):
            s = random.random() * self.tree.total()
            idx, _, data = self.tree.get(s)
            ret.append(pickle.loads(zlib.decompress(data)))
            self.last_idxs.append(idx)
        return ret

    def update(self, losses):
        for i in range(len(self.last_idxs)):
            self.tree.update(self.last_idxs[i],
                             math.pow(losses[i] + self.eps, self.alpha))

    def add_element(self, new_el, loss):
        self.size = min(self.max_size, self.size + 1)
        p = math.pow(loss + self.eps, self.alpha)
        self.tree.add(p, zlib.compress(pickle.dumps(new_el)))

    def __len__(self):
        return self.size
class PrioritisedReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, alpha, epsilon):
        self.action_size = action_size
        self.tree = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.alpha = alpha
        self.epsilon = epsilon

    def add(self, error, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        p = self._get_priority(error)
        self.tree.add(p, e)

    def sample(self, beta):
        # split the priority mass into segments so we don't end up with duplicates
        segment = self.tree.total() / self.batch_size
        experiences = []
        priorities = []
        idxs = []
        for i in range(self.batch_size):
            start = segment * i
            end = segment * (i + 1)
            s = random.uniform(start, end)
            idx, p, e = self.tree.get(s)
            if e:
                priorities.append(p)
                experiences.append(e)
                idxs.append(idx)
        probs = priorities / self.tree.total()  # big P
        weights = np.power(self.tree.n_entries * probs, -beta)
        weights /= weights.max()  # scale so max weight is 1
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(weights).float().to(device)
        return (states, actions, rewards, next_states, dones, weights, idxs)

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def _get_priority(self, error):
        return (np.abs(error) + self.epsilon) ** self.alpha

    def __len__(self):
        """Return the current size of internal memory."""
        return self.tree.n_entries
class PriorityBuffer:
    # Inspired by the implementation from: https://github.com/rlcode/per/blob/master/prioritized_memory.py
    def __init__(self, action_size, agent_config):
        """Initialize a PriorityBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            agent_config: configuration object providing
                buffer_size (int): maximum size of buffer
                batch_size (int): size of each training batch
                buffer_epsilon (float): small constant added to every priority
                alpha (float): amount of uniformity in the sampling (0 == uniform, 1. == priority only)
                beta_start (float): start of beta value for the prioritised buffer
                beta_end (float): final beta value
                beta_max_steps (int): max number of steps to reach the final beta value
        """
        self.action_size = action_size
        self.tree = SumTree(capacity=agent_config.buffer_size)
        self.batch_size = agent_config.batch_size
        # self.seed = random.seed(buffer_config.seed)
        self.epsilon = agent_config.buffer_epsilon
        # how much randomness we require: alpha = 0 (pure random), alpha = 1 (only priority)
        self.alpha = agent_config.alpha
        self.beta = agent_config.beta_start
        self.beta_start = agent_config.beta_start
        self.beta_end = agent_config.beta_end
        self.beta_increment_per_sampling = (self.beta_end - self.beta_start) / agent_config.beta_max_steps

    def add(self, sample, error):
        """Add a new experience to memory."""
        p = self._get_priority(error)
        state, action, reward, next_state, done = sample
        e = Experience(state, action, reward, next_state, done)
        self.tree.add(p, e)

    def _get_priority(self, error):
        return (abs(error) + self.epsilon) ** self.alpha

    def sample(self):
        experiences = []
        idxs = []
        segment = self.tree.total() / self.batch_size
        priorities = []
        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            if isinstance(data, Experience):
                priorities.append(p)
                experiences.append(data)
                idxs.append(idx)
            else:
                print("Sampled a non-Experience entry from the SumTree; skipping it")
        sampling_probabilities = priorities / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        self.beta = np.min([self.beta_end, self.beta + self.beta_increment_per_sampling])
        return (states, actions, rewards, next_states, dones), idxs, is_weight

    def update(self, idx, error):
        # Not required in a normal ReplayBuffer
        self.tree.update(idx, self._get_priority(error))

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.tree)
class ReplayBuffer:
    def __init__(self, params):
        buffer_size = params['buffer_size']
        batch_size = params['batch_size']
        mode = params['mode']
        self.__buffer_size = buffer_size
        self.__batch_size = batch_size
        self.__mode = mode
        self.__experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.__memory = SumTree(buffer_size)
        self.__memory_buffer = []

    def get_batch_size(self):
        return self.__batch_size

    def is_ready(self):
        return len(self) >= self.__batch_size

    def add(self, state, action, reward, next_state, done):
        self.__memory_buffer.append(
            self.__experience(state, action, reward, next_state, done))

    def sample(self):
        buf_len = len(self.__memory_buffer)
        mem_len = self.__batch_size - buf_len
        experiences = []
        indices = []
        probs = []
        # if self.__mode['PER']:
        if mem_len:
            #segment = self.__memory.total() / mem_len
            for i in range(mem_len):
                #s = random.uniform(segment * i, segment * (i + 1))
                s = random.uniform(0, self.__memory.total())
                idx, p, e = self.__memory.get(s)
                experiences.append(e)
                indices.append(idx)
                probs.append(p / self.__memory.total())
        for e in self.__memory_buffer:
            # Add experience to the buffer and record its index
            experiences.append(e)
            #if self.__mode['PER']:
            idx = self.__memory.add(0.0, e)  # Default value for p is 0
            indices.append(idx)
            probs.append(1 / len(self))
        self.__memory_buffer.clear()
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones, indices, probs

    def update(self, indices, p_values):
        for idx, p in zip(indices, p_values):
            self.__memory.update(idx, p)

    def __len__(self):
        return max(len(self.__memory), len(self.__memory_buffer))
class PrioritizeReplayBuffer(ReplayBuffer):
    """Prioritized experience replay."""

    def __init__(
        self,
        buffer_size,
        batch_size,
        seed,
        beta_start=0.4,
        delta_beta=1e-5,
        alpha=0.6,
        eps=1e-8,
    ):
        """Initialize PER.

        Args:
            buffer_size (int): Size of replay buffer. The actual size will be
                the first power of 2 greater than buffer_size.
            batch_size (int): Size of batches to draw.
            seed (float): Seed.
            beta_start (float): Initial value for beta (importance sampling exponent).
            delta_beta (float): Beta increment at each time step.
            alpha (float): Priority exponent.
            eps (float): Small positive number added to priorities so that
                zero-TD-error transitions can still be sampled.
        """
        # Depth of sum tree
        depth = int(math.log2(buffer_size)) + 1
        super(PrioritizeReplayBuffer, self).__init__(2**depth, batch_size, seed)
        # Initialize sum tree to keep track of the sum of priorities
        self.priorities = SumTree(depth)
        # Current max priority
        self.max_p = 1.0
        # PER parameters
        self.alpha = alpha
        self.eps = eps
        self.beta = beta_start
        self.delta_beta = delta_beta

    def add(self, state, action, reward, next_state, done):
        """Add a transition to the replay buffer."""
        # Add in the sum tree with the current max priority
        self.priorities.add(self.max_p, self.index)
        super().add(state, action, reward, next_state, done)

    def sample(self):
        """Get a sample."""
        # Get indices to sample from the sum tree.
        # Store these indices to compute importance sampling later.
        self.last_indices = self.priorities.sample(self.batch_size)
        # Return the transitions corresponding to these indices
        return [self.data[i] for i in self.last_indices]

    def update_priorities(self, td_error):
        """Update priorities."""
        # Compute new priorities
        new_priorities = (abs(td_error) + self.eps) ** self.alpha
        # Update sum tree
        self.priorities.update(self.last_indices, new_priorities)
        # Update the current max priority
        self.max_p = max(self.max_p, max(new_priorities))

    def importance_sampling(self):
        """Compute importance sampling weights of the last sample."""
        # Get probabilities
        probs = self.priorities.get(self.last_indices) / self.priorities.total_sum
        # Compute weights
        weights = (len(self) * probs) ** (-self.beta)
        weights /= max(weights)
        # Update beta
        self.beta = min(self.beta + self.delta_beta, 1)
        # Return weights
        return weights
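Unlike the earlier snippets, this class expects a SumTree whose constructor takes the tree depth and whose sample, update, and get methods operate on whole index arrays, with a total_sum attribute rather than a total() method. The sketch below is one way such an interface could look; it is named SumTree only to line up with the constructor call above and is an assumption, not the implementation that class actually used.

import numpy as np


class SumTree:
    """Vectorised sum tree: leaves are addressed by buffer position."""

    def __init__(self, depth):
        self.depth = depth
        self.capacity = 2 ** depth                 # number of leaves == buffer size
        self.nodes = np.zeros(2 * self.capacity)   # nodes[1] is the root
        self.total_sum = 0.0                       # kept in sync with nodes[1]

    def _set(self, positions, priorities):
        """Write priorities at the given leaf positions and refresh their ancestors."""
        self.nodes[np.asarray(positions) + self.capacity] = priorities
        idx = np.unique(np.asarray(positions) + self.capacity)
        while idx[0] > 1:                          # recompute parent sums up to the root
            idx = np.unique(idx // 2)
            self.nodes[idx] = self.nodes[2 * idx] + self.nodes[2 * idx + 1]
        self.total_sum = self.nodes[1]

    def add(self, priority, buffer_index):
        self._set([buffer_index], priority)

    def update(self, buffer_indices, priorities):
        self._set(buffer_indices, priorities)

    def get(self, buffer_indices):
        return self.nodes[np.asarray(buffer_indices) + self.capacity]

    def sample(self, batch_size):
        # Stratified sampling: one uniform draw per equal slice of the total mass.
        bounds = np.linspace(0.0, self.total_sum, batch_size + 1)
        targets = np.random.uniform(bounds[:-1], bounds[1:])
        indices = np.ones(batch_size, dtype=np.int64)   # start every walk at the root
        for _ in range(self.depth):
            left = 2 * indices
            go_right = targets > self.nodes[left]
            targets = targets - self.nodes[left] * go_right
            indices = left + go_right
        return indices - self.capacity                  # leaf position == buffer index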
class PrioritizedExperienceReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    alpha = 0.6
    beta = 0.4
    beta_increment_per_sample = 0.001
    epsilon = 1e-6

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def compute_priority(self, td_error):
        return (td_error + self.epsilon) ** self.alpha

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        experience = self.experience(state, action, reward, next_state, done)
        max_priority = np.max(self.memory.tree[-self.memory.capacity:])
        if max_priority == 0:
            max_priority = 1.
        self.memory.add(max_priority, experience)

    def update(self, index, td_error):
        priority = self.compute_priority(td_error)
        self.memory.update(index, priority)

    def sample(self):
        """
        :return: importance weights, indices of sampled experiences, and sampled batch of experiences
        """
        self.beta = np.minimum(1., self.beta + self.beta_increment_per_sample)
        segment = self.memory.total() / self.batch_size
        indexes = []
        priorities = []
        experiences = []
        for i in range(self.batch_size):
            # pick a segment
            a = segment * i
            b = segment * (i + 1)
            s = np.random.uniform(a, b)
            index, priority, experience = self.memory.get(s)
            indexes.append(index)
            priorities.append(priority)
            experiences.append(experience)
        sampling_probs = np.divide(priorities, self.memory.total())
        # importance sampling
        i_s_weights = (self.batch_size * sampling_probs) ** -self.beta
        i_s_weights /= np.max(i_s_weights)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        return i_s_weights, indexes, (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return self.memory.count
class PrioritizedReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            seed (int): random seed
        """
        self.memory = SumTree(buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
        # epsilon: small amount to avoid zero priority
        # alpha: [0~1] determines how much prioritization is used. with 0, we would get the uniform case
        # beta: Controls importance-sampling compensation. fully compensates for the non-uniform probabilities
        #       when beta=1. The unbiased nature of the updates is most important near convergence at the end of
        #       training, so we define a schedule on the exponent beta that starts from initial value and reaches 1
        #       only at the end of learning.
        self.epsilon = 0.01
        self.alpha = 0.6
        beta_start = 0.4
        self.beta_end = 1.0
        self.beta = beta_start
        beta_increments = 200
        self.beta_increment = (self.beta_end - beta_start) / beta_increments

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        experience = self.experience(state, action, reward, next_state, done)
        p = self.memory.max_p()
        if p == 0:
            p = 1.0
        self.memory.add(p=p, data=experience)

    def sample(self, n):
        """Randomly sample a batch of experiences from memory."""
        experiences = []
        indices = []
        priorities = []
        segment = self.memory.total_p() / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, experience) = self.memory.get(s)
            experiences.append(experience)
            indices.append(idx)
            priorities.append(p)
        priorities = np.array(priorities, dtype=np.float64)
        indices = np.array(indices, dtype=np.int32)
        # print(f"priorities: {priorities}")
        probs = priorities / self.memory.total_p()
        # print(f"probs: {probs}")
        # importance-sampling (IS) weights
        w_is = (self.memory.capacity * probs) ** (-self.beta)
        # print(f"w_IS: {w_IS}")
        w_is_normalized = w_is / w_is.max()
        # print(f"w_IS_normalized: {w_IS_normalized}")
        # w_is_normalized = torch.from_numpy(w_is_normalized).float().to(self.device)
        return experiences, indices, w_is_normalized

    def update_errors(self, indices, errors):
        priorities = [self._to_priority(e) for e in errors]
        for (idx, p) in zip(indices, priorities):
            self.memory.update(idx, p)

    def _to_priority(self, error):
        return (error + self.epsilon) ** self.alpha

    def increase_beta(self):
        if self.beta < self.beta_end:
            self.beta = min(self.beta_end, self.beta + self.beta_increment)

    def __len__(self):
        return len(self.memory)
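The comment block in __init__ above describes epsilon, alpha, and beta in prose; the tiny numeric sketch below just evaluates those formulas on made-up numbers so the effect is visible (the class above plugs the tree capacity in where N appears here).

import numpy as np

errors = np.array([0.5, 0.1, 2.0])        # |TD errors| of three stored transitions (invented)
eps, alpha, beta, N = 0.01, 0.6, 0.4, 3   # N = number of transitions in memory

p = (errors + eps) ** alpha               # priorities, roughly [0.67, 0.27, 1.52]
P = p / p.sum()                           # sampling probabilities, sum to 1
w = (N * P) ** (-beta)                    # IS weights: rarely sampled transitions get larger weights
w /= w.max()                              # normalise so the largest weight is 1
print(P.round(3), w.round(3))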
class MemoryDB:  # stored as ( s, a, r, s_ ) in SumTree
    e = 0.01
    a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001
    capacity = 100000
    max_priority = 1

    def __init__(self, host_name, db_name, collection_name):
        self.host_name = host_name
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = MongoClient(host_name, 27017)
        self.db = self.client[db_name]
        self.replay_memory_collection = self.db[collection_name]
        self.sum_tree = SumTree(self.capacity)
        memory_priorities = self.replay_memory_collection.find({}, {"priority": 1})
        for memory_priority in memory_priorities:
            self.sum_tree.add(memory_priority["priority"],
                              {"_id": memory_priority["_id"]})

    def retrieve_by_id(self, id):
        db_experiences = self.replay_memory_collection.find({"_id": id})
        return {
            **_pickle.loads(db_experiences[0]['binary'], encoding='latin1'),
            "_id": id
        }

    def _get_priority(self, error):
        return (error + self.e) ** self.a

    def add(self, error, experience):
        p = self._get_priority(error)
        experience_to_save = {}
        experience_to_save["terminal"] = experience["terminal"]
        experience_to_save["action_index"] = experience["action_index"]
        experience_to_save["actual_reward"] = experience["actual_reward"]
        experience_to_save["priority"] = self.max_priority
        experience_to_save["binary"] = _pickle.dumps(experience)
        id = self.replay_memory_collection.insert(experience_to_save)
        self.sum_tree.add(p, {"_id": id})

    def add_batch(self, experiences):
        for experience in experiences:
            self.add(self.max_priority, experience)

    def update(self, index, error, experience):
        p = self._get_priority(error)
        self.replay_memory_collection.update_one({"_id": experience["_id"]},
                                                 {"$set": {"priority": p}})
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        return self.replay_memory_collection.count()

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.sum_tree.total() / n
        priorities = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            experience = self.retrieve_by_id(data["_id"])
            batch.append(experience)
            print(
                "action index: ",
                experience["action_index"],
                "reward: ",
                experience["actual_reward"],
                "priority: ",
                experience["priority"],
            )
            idxs.append(idx)
        sampling_probabilities = priorities / self.sum_tree.total()
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight