class Memory:  # stored as ( s, a, r, s_ ) in SumTree

    def __init__(self, capacity, alpha=0.6, beta=0.4, beta_anneal_step=0.001,
                 epsilon=0.00000001):
        # Round the capacity up to a power of two for the tree.
        tree_capacity = 1
        while tree_capacity < capacity:
            tree_capacity *= 2
        self.tree = SumTree(tree_capacity)
        self.capacity = capacity
        self.a = alpha
        self.beta = beta
        self.beta_increment_per_sampling = beta_anneal_step
        self.e = epsilon

    def _get_priority(self, error):
        # Direct proportional prioritization
        return (np.abs(error) + self.e) ** self.a

    def add(self, error, sample):
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            data = 0
            # Re-draw if the chosen leaf is still empty.
            while data == 0:
                s = random.uniform(a, b)
                (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = priorities / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight

    def step(self):
        # Anneal beta towards 1 over the course of training.
        self.beta = np.min(
            [1. - self.e, self.beta + self.beta_increment_per_sampling])

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)
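# None of the classes in this file define the SumTree they depend on. The
# sketch below is a minimal, illustrative version assuming the interface used
# above (add, get, update, total, n_entries); the actual trees these snippets
# were written against may differ in details such as the data layout.
import numpy as np


class SumTreeSketch:
    """Binary sum tree: leaves hold priorities, internal nodes hold partial sums."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)        # internal sums + priority leaves
        self.data = np.zeros(capacity, dtype=object)  # stored transitions
        self.write = 0                                # next leaf to overwrite
        self.n_entries = 0

    def total(self):
        return self.tree[0]  # the root holds the sum of all priorities

    def add(self, priority, sample):
        idx = self.write + self.capacity - 1
        self.data[self.write] = sample
        self.update(idx, priority)
        self.write = (self.write + 1) % self.capacity  # circular buffer
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, idx, priority):
        change = priority - self.tree[idx]
        self.tree[idx] = priority
        while idx != 0:              # propagate the change up to the root
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def get(self, s):
        idx = 0
        while idx < self.capacity - 1:           # descend until a leaf is reached
            left, right = 2 * idx + 1, 2 * idx + 2
            if s <= self.tree[left]:
                idx = left
            else:
                s -= self.tree[left]
                idx = right
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]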
class MemoryDB:  # stored as ( s, a, r, s_ ) in SumTree

    def __init__(self, e, a, beta, beta_increment_per_sampling, capacity,
                 max_priority):
        self.capacity = capacity
        self.e = e
        self.a = a
        self.beta = beta
        self.beta_increment_per_sampling = beta_increment_per_sampling
        self.max_priority = max_priority
        self.sum_tree = SumTree(self.capacity)

    def _get_priority(self, error):
        return min((self.max_priority, (error + self.e) ** self.a))

    def add(self, experience, error=None):
        # New experiences without a known error get the maximum priority.
        p = self._get_priority(error) if error is not None else self.max_priority
        self.sum_tree.add(p, experience)

    def add_batch(self, experiences):
        for experience in experiences:
            self.add(experience, self.max_priority)

    def update(self, index, error, experience):
        p = self._get_priority(error)
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        return self.sum_tree.getCount()

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.sum_tree.total() / n
        priorities = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = priorities / self.sum_tree.total()
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    e = 1e-10
    a = 0.5
    beta = 0.4
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        return (error + self.e) ** self.a

    def append(self, data):
        error, sample = data
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = priorities / (self.tree.total() + 1e-10)
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def __len__(self):
        return self.tree.n_entries
class PrioritizedReplayBuffer(ReplayBuffer):

    def __init__(self, buffer_size, alpha):
        self.capacity = buffer_size
        self.tree = SumTree(buffer_size)
        self.alpha = alpha
        self.max_priority = 1
        #self.beta_initial = ??
        #self.beta_steps = ??

    def add(self, experience):
        # New experiences are inserted with the maximum priority seen so far.
        self.tree.add(self.max_priority, experience)

    def update(self, index, experience, td_error):
        priority = (abs(td_error) + 0.0001) ** self.alpha
        self.tree.update(index, priority)
        if self.max_priority < priority:
            self.max_priority = priority

    def sample(self, batch_size):
        indexes = []
        batchs = []
        total = self.tree.total()
        section = total / batch_size
        for i in range(batch_size):
            r = section * i + np.random.random() * section
            (idx, priority, experience) = self.tree.get(r)
            indexes.append(idx)  # kept for the later priority update
            batchs.append(experience)
        return (indexes, batchs)
class Memory:
    e = 0.01
    a = 0.6

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _getPriority(self, error):
        return (error + self.e) ** self.a

    def add(self, error, sample):
        p = self._getPriority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        segment = self.tree.total() / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        p = self._getPriority(error)
        self.tree.update(idx, p)
class PriorityMemory:

    def __init__(self, capacity):
        """
        Instantiate a priority-based memory capable of holding `capacity`
        experiences. Memories are sampled with frequency based on their
        priority.
        """
        # Circular-buffer, array-based tree with priorities as node values.
        self.tree = SumTree(capacity)
        self.e = 0.01  # Small constant to ensure all priorities > 0
        self.a = 0.6   # Constant to control the weight of error on priority

    def _getPriority(self, error):
        """ Convert an error to a priority based on the constants "e" and "a". """
        return (error + self.e) ** self.a

    def add(self, experience, error):
        """ Add an experience to memory. """
        p = self._getPriority(error)
        self.tree.add(p, experience)

    def sample(self, n):
        """
        Sample n experiences from memory. Experience selection frequency is
        based on priority.

        Returns:
            - mini_batch: Sequence containing the experiences.
            - indices: The index of the node associated with each experience,
              so that its priority can be updated later.
        """
        mini_batch = []
        indices = []
        segment = self.tree.total() / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, _, experience) = self.tree.get(s)
            mini_batch.append(experience)
            indices.append(idx)
        return mini_batch, indices

    def update(self, idx, error):
        """ Update the priority associated with a memory. """
        p = self._getPriority(error)
        self.tree.update(idx, p)
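# A hedged usage sketch for PriorityMemory above, using dummy transitions; the
# (s, a, r, s', done) tuple layout and the fake TD errors are illustrative
# assumptions, not part of the original class.
import random

memory = PriorityMemory(capacity=1024)
for step in range(100):
    transition = (step, 0, 1.0, step + 1, False)    # dummy (s, a, r, s', done)
    memory.add(transition, error=random.random())   # stand-in for |TD error|

mini_batch, indices = memory.sample(8)
new_errors = [random.random() for _ in mini_batch]  # recomputed after a learning step
for idx, err in zip(indices, new_errors):
    memory.update(idx, err)                         # refresh the leaf priorities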
class PrioritizedMemory:
    e = 0.01
    a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        return (np.abs(error) + self.e) ** self.a

    def push(self, error, sample):
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = priorities / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)
class PrioritizeReplayBuffer(ReplayBuffer):
    # Based on https://github.com/y-kamiya/machine-learning-samples/blob/7b6792ce37cc69051e9053afeddc6d485ad34e79/python3/reinforcement/dqn/agent.py
    EPSILON = 0.0001
    ALPHA = 0.6
    BETA = 0.4
    size = 0

    def __init__(self, capacity):
        super().__init__(capacity=capacity)
        self.td_error_epsilon = 0.0001
        self.tree = SumTree(capacity)

    def __len__(self):
        return self.size

    def _getPriority(self, td_error):
        return (td_error + self.EPSILON) ** self.ALPHA

    def push(self, state, action, done, next_state, reward, p_index):
        self.size += 1
        transition = self.Transition(state, action, done, next_state, reward,
                                     p_index)
        # New transitions enter with the current maximum priority.
        priority = self.tree.max()
        if priority <= 0:
            priority = 1
        self.tree.add(priority, transition)

    def sample(self, batch_size, episode):
        batch = []
        indexes = []
        weights = np.empty(batch_size, dtype='float32')
        total = self.tree.total()
        # Anneal beta towards 1; clamp so the correction never overshoots.
        beta = self.BETA + (1 - self.BETA) * episode  #episode / self.config.num_episodes
        beta = min(1.0, beta)
        for i, rand in enumerate(np.random.uniform(0, total, batch_size)):
            (idx, priority, data) = self.tree.get(rand)
            batch.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total) ** (-beta)
        return (indexes, batch, weights / weights.max())

    def update(self, idx, td_error):
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)
class PERMemory:
    EPSILON = 0.0001
    ALPHA = 0.5
    BETA = 0.4
    size = 0

    def __init__(self, config, capacity):
        self.config = config
        self.capacity = capacity
        self.tree = SumTree(capacity)

    def _getPriority(self, td_error):
        return (td_error + self.EPSILON) ** self.ALPHA

    def push(self, transition):
        self.size += 1
        priority = self.tree.max()
        if priority <= 0:
            priority = 1
        self.tree.add(priority, transition)

    def sample(self, size, episode):
        batch = []
        indexes = []
        weights = np.empty(size, dtype='float32')
        total = self.tree.total()
        beta = self.BETA + (1 - self.BETA) * episode / self.config.num_episodes
        beta = min(1.0, beta)
        for i, rand in enumerate(np.random.uniform(0, total, size)):
            (idx, priority, data) = self.tree.get(rand)
            batch.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total) ** (-beta)
        return (indexes, batch, weights / weights.max())

    def update(self, idx, td_error):
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)

    def __len__(self):
        return self.size
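# Illustration of the beta schedule used by PERMemory.sample above: beta ramps
# linearly from BETA to 1.0 over training. num_episodes = 1000 is an assumed
# example value, not something defined by the class.
BETA, num_episodes = 0.4, 1000
for episode in (0, 250, 500, 1000):
    beta = min(1.0, BETA + (1 - BETA) * episode / num_episodes)
    print(episode, round(beta, 2))  # -> 0.4, 0.55, 0.7, 1.0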
class Memory(object):
    e = 0.05

    def __init__(self, capacity, pr_scale):
        self.capacity = capacity
        self.memory = ST(self.capacity)
        self.pr_scale = pr_scale
        self.max_pr = 0

    def get_priority(self, error):
        return (error + self.e) ** self.pr_scale

    def remember(self, sample, error):
        p = self.get_priority(error)
        # Track the running maximum priority and store the new sample with it,
        # so fresh experiences are likely to be replayed at least once.
        self.max_pr = max(self.max_pr, p)
        self.memory.add(self.max_pr, sample)

    def sample(self, n):
        sample_batch = []
        sample_batch_indices = []
        sample_batch_priorities = []
        segment = self.memory.total() / n
        for i in range(n):
            left = segment * i
            right = segment * (i + 1)
            s = random.uniform(left, right)
            idx, pr, data = self.memory.get(s)
            sample_batch.append((idx, data))
            sample_batch_indices.append(idx)
            sample_batch_priorities.append(pr)
        return [sample_batch, sample_batch_indices, sample_batch_priorities]

    def update(self, batch_indices, errors):
        for i in range(len(batch_indices)):
            p = self.get_priority(errors[i])
            self.memory.update(batch_indices[i], p)
class Replay_Memory:

    def __init__(self):
        global MEMORY_LEN
        self.tree = SumTree(MEMORY_LEN)

    def add(self, error, sample):
        global MEMORY_BIAS, MEMORY_POW
        priority = (error + MEMORY_BIAS) ** MEMORY_POW
        self.tree.add(priority, sample)

    def sample(self):
        """
        Get a sample batch of the replay memory

        Returns:
            batch: a batch with one sample from each segment of the memory
        """
        global BATCH_SIZE
        batch = []
        # We want one representative from each segment of the priority
        # distribution in the batch, e.g. for BATCH_SIZE=2 the batch contains
        # one sample from [min, median] and one from [median, max].
        segment = self.tree.total() / BATCH_SIZE
        for i in range(BATCH_SIZE):
            minimum = segment * i
            maximum = segment * (i + 1)
            s = random.uniform(minimum, maximum)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        """
        Updates one entry in the replay memory

        Args:
            idx: the position of the outdated transition in the memory
            error: the newly calculated error
        """
        priority = (error + MEMORY_BIAS) ** MEMORY_POW
        self.tree.update(idx, priority)
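# Worked example of the stratified sampling in Replay_Memory.sample above,
# with assumed values (total priority 10.0, BATCH_SIZE = 2): the priority mass
# is split into [0, 5) and [5, 10), and one uniform draw is taken from each,
# so a batch always covers both halves of the distribution.
import random

total, batch_size = 10.0, 2
segment = total / batch_size
draws = [random.uniform(segment * i, segment * (i + 1)) for i in range(batch_size)]
# e.g. draws == [2.7, 8.1]: one point from each half of the priority mass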
class ReplayMemory(object):

    def __init__(self, max_size, alpha, eps):
        self.max_size = max_size
        self.alpha = alpha
        self.eps = eps
        self.tree = SumTree(max_size)
        self.last_idxs = None
        self.size = 0

    def get_batch(self, batch_size):
        self.last_idxs = []
        ret = []
        for i in range(min(batch_size, self.size)):
            s = random.random() * self.tree.total()
            idx, _, data = self.tree.get(s)
            # Experiences are stored compressed; inflate them on the way out.
            ret.append(pickle.loads(zlib.decompress(data)))
            self.last_idxs.append(idx)
        return ret

    def update(self, losses):
        for i in range(len(self.last_idxs)):
            self.tree.update(self.last_idxs[i],
                             math.pow(losses[i] + self.eps, self.alpha))

    def add_element(self, new_el, loss):
        self.size = min(self.max_size, self.size + 1)
        p = math.pow(loss + self.eps, self.alpha)
        self.tree.add(p, zlib.compress(pickle.dumps(new_el)))

    def __len__(self):
        return self.size
class ReplayBuffer:

    def __init__(self, params):
        buffer_size = params['buffer_size']
        batch_size = params['batch_size']
        mode = params['mode']
        self.__buffer_size = buffer_size
        self.__batch_size = batch_size
        self.__mode = mode
        self.__experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.__memory = SumTree(buffer_size)
        self.__memory_buffer = []

    def get_batch_size(self):
        return self.__batch_size

    def is_ready(self):
        return len(self) >= self.__batch_size

    def add(self, state, action, reward, next_state, done):
        # New experiences are staged here and moved into the tree on sample().
        self.__memory_buffer.append(
            self.__experience(state, action, reward, next_state, done))

    def sample(self):
        buf_len = len(self.__memory_buffer)
        mem_len = self.__batch_size - buf_len
        experiences = []
        indices = []
        probs = []
        # if self.__mode['PER']:
        if mem_len:
            #segment = self.__memory.total() / mem_len
            for i in range(mem_len):
                #s = random.uniform(segment * i, segment * (i + 1))
                s = random.uniform(0, self.__memory.total())
                idx, p, e = self.__memory.get(s)
                experiences.append(e)
                indices.append(idx)
                probs.append(p / self.__memory.total())
        for e in self.__memory_buffer:
            # Add experience to the buffer and record its index
            experiences.append(e)
            #if self.__mode['PER']:
            idx = self.__memory.add(0.0, e)  # Default value for p is 0
            indices.append(idx)
            probs.append(1 / len(self))
        self.__memory_buffer.clear()
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences
                       if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences
                       if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences
                       if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences
                       if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences
                       if e is not None]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones, indices, probs

    def update(self, indices, p_values):
        for idx, p in zip(indices, p_values):
            self.__memory.update(idx, p)

    def __len__(self):
        return max(len(self.__memory), len(self.__memory_buffer))
class PrioritisedReplayBuffer:

    def __init__(self, action_size, buffer_size, batch_size, alpha, epsilon):
        self.action_size = action_size
        self.tree = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.alpha = alpha
        self.epsilon = epsilon

    def add(self, error, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        p = self._get_priority(error)
        self.tree.add(p, e)

    def sample(self, beta):
        # Split the priority mass into segments so we don't end up with
        # duplicate samples in the batch.
        segment = self.tree.total() / self.batch_size
        experiences = []
        priorities = []
        idxs = []
        for i in range(self.batch_size):
            start = segment * i
            end = segment * (i + 1)
            s = random.uniform(start, end)
            idx, p, e = self.tree.get(s)
            if e:
                priorities.append(p)
                experiences.append(e)
                idxs.append(idx)
        probs = priorities / self.tree.total()  # sampling probability P(i)
        weights = np.power(self.tree.n_entries * probs, -beta)
        weights /= weights.max()  # scale so the maximum weight is 1
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences
                       ]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(weights).float().to(device)
        return (states, actions, rewards, next_states, dones, weights, idxs)

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def _get_priority(self, error):
        return (np.abs(error) + self.epsilon) ** self.alpha

    def __len__(self):
        """Return the current size of internal memory."""
        return self.tree.n_entries
class PrioritisedReplayBuffer():
    """A prioritised replay buffer.

    Creates a sum tree and uses it to store a fixed number of experience
    tuples. When sampled, experiences with the highest absolute TD-error are
    returned with greater probability.
    """

    def __init__(self, buffer_size, alpha, beta_zero, beta_increment_size=0.001,
                 epsilon=0.1, max_priority=1., seed=None):
        """Priority replay buffer initialiser.

        Args:
            buffer_size (int): capacity of the replay buffer.
            alpha (float): priority scaling hyperparameter.
            beta_zero (float): importance sampling scaling hyperparameter.
            beta_increment_size (float): beta annealing rate.
            epsilon (float): base priority to ensure non-zero sampling probability.
            max_priority (float): initial maximum priority.
            seed (int): seed for random number generator.
        """
        random.seed(seed)
        self.sum_tree = SumTree(buffer_size)
        self.memory = {}
        self.experience = namedtuple(
            "experience", ["state", "action", "reward", "next_state", "done"])
        self.buffer_size = buffer_size
        self.beta_increment_size = beta_increment_size
        self.max_priority = max_priority ** alpha
        self.min_priority = max_priority ** alpha
        self.last_min_update = 0
        self.alpha = alpha
        self.beta = beta_zero
        self.epsilon = epsilon

    def add(self, state, action, reward, next_state, done):
        """Creates an experience tuple and adds it to the replay buffer."""
        experience = self.experience(state, action, reward, next_state, done)
        current_tree_idx = self.sum_tree.input_pointer
        self.memory[current_tree_idx] = experience
        self.sum_tree.add(self.max_priority)

    def sample(self, batch_size):
        """Returns a batch of experiences sampled according to their priority."""
        idx_list = []
        weights = []
        states = []
        actions = []
        rewards = []
        next_states = []
        done_list = []
        segment = self.sum_tree.total() / batch_size
        sample_list = [
            random.uniform(segment * i, segment * (i + 1))
            for i in range(batch_size)
        ]
        max_weight = self.min_priority ** (-self.beta)
        for s in sample_list:
            idx, priority = self.sum_tree.sample(s)
            idx_list.append(idx)
            weight = priority ** (-self.beta) / max_weight
            weights.append(weight)
            sample = self.memory[idx]
            state, action, reward, next_state, done = sample
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            done_list.append(done)
        return states, actions, rewards, next_states, done_list, idx_list, weights

    def update(self, idx_list, td_error):
        """Updates the priorities of the given experiences."""
        priority_list = (td_error + self.epsilon) ** self.alpha
        self.max_priority = max(self.max_priority, priority_list.max())
        list_min_priority = priority_list.min()
        if list_min_priority <= self.min_priority:
            self.min_priority = list_min_priority
            self.last_min_update = 0
        else:
            self.last_min_update += 1
            if self.last_min_update >= self.buffer_size:
                # The tracked minimum may be stale; recompute it from the leaves.
                self.min_priority = np.array([
                    node.val
                    for node in self.sum_tree.tree_array[-self.buffer_size:]
                ]).min()
                self.last_min_update = 0
        for i, idx in enumerate(idx_list):
            priority = min(self.max_priority, priority_list[i])
            self.sum_tree.update(idx, priority)
        self.beta = min(1, self.beta + self.beta_increment_size)

    def __len__(self):
        """Return the number of experiences in the replay buffer."""
        return len(self.memory)
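# Why PrioritisedReplayBuffer.sample above can weight by priority ** (-beta)
# alone: the full importance-sampling weight is (N * p_i / total) ** (-beta)
# normalised by the maximum weight, which belongs to the minimum priority, so
# the constant (N / total) ** (-beta) factor cancels. Illustrative check with
# assumed priorities:
import numpy as np

priorities = np.array([0.5, 1.0, 2.0])
N, total, beta = len(priorities), priorities.sum(), 0.4

full = (N * priorities / total) ** (-beta)
full /= full.max()                        # textbook normalisation

short = priorities ** (-beta)
short /= priorities.min() ** (-beta)      # normalisation used by the class above

assert np.allclose(full, short)           # both give identical weights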
class PriorityBuffer:
    # Inspired by implementation from: https://github.com/rlcode/per/blob/master/prioritized_memory.py

    def __init__(self, action_size, agent_config):
        """Initialize a PriorityBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            agent_config: configuration object supplying the fields below
                buffer_size (int): maximum size of buffer
                batch_size (int): size of each training batch
                seed (int): random seed
                buffer_epsilon (float): base priority so no transition has zero sampling probability
                alpha (float): amount of uniformity in the sampling (0 == uniform, 1. == priority only)
                beta_start (float): start of beta value for prioritised buffer
                beta_max_steps (int): max number of steps to reach the final beta value
        """
        self.action_size = action_size
        self.tree = SumTree(capacity=agent_config.buffer_size)
        self.batch_size = agent_config.batch_size
        # self.seed = random.seed(buffer_config.seed)
        self.epsilon = agent_config.buffer_epsilon  # keeps every priority > 0
        self.alpha = agent_config.alpha  # 0 = uniform sampling, 1 = pure priority sampling
        self.beta = agent_config.beta_start
        self.beta_start = agent_config.beta_start
        self.beta_end = agent_config.beta_end
        self.beta_increment_per_sampling = (self.beta_end - self.beta_start) / agent_config.beta_max_steps

    def add(self, sample, error):
        """Add a new experience to memory."""
        p = self._get_priority(error)
        state, action, reward, next_state, done = sample
        e = Experience(state, action, reward, next_state, done)
        self.tree.add(p, e)

    def _get_priority(self, error):
        return (abs(error) + self.epsilon) ** self.alpha

    def sample(self):
        experiences = []
        idxs = []
        segment = self.tree.total() / self.batch_size
        priorities = []
        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            if isinstance(data, Experience):
                priorities.append(p)
                experiences.append(data)
                idxs.append(idx)
            else:
                print("Warning: sampled an empty SumTree slot; skipping it")
        sampling_probabilities = priorities / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        self.beta = np.min([self.beta_end, self.beta + self.beta_increment_per_sampling])
        return (states, actions, rewards, next_states, dones), idxs, is_weight

    def update(self, idx, error):
        # Not required in a plain (non-prioritised) ReplayBuffer.
        self.tree.update(idx, self._get_priority(error))

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.tree)
class PrioritizedExperienceReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    alpha = 0.6
    beta = 0.4
    beta_increment_per_sample = 0.001
    epsilon = 1e-6

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def compute_priority(self, td_error):
        return (td_error + self.epsilon) ** self.alpha

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        experience = self.experience(state, action, reward, next_state, done)
        max_priority = np.max(self.memory.tree[-self.memory.capacity:])
        if max_priority == 0:
            max_priority = 1.
        self.memory.add(max_priority, experience)

    def update(self, index, td_error):
        priority = self.compute_priority(td_error)
        self.memory.update(index, priority)

    def sample(self):
        """
        :return: importance weights, indices of sampled experiences, and
            sampled batch of experiences
        """
        self.beta = np.minimum(1., self.beta + self.beta_increment_per_sample)
        segment = self.memory.total() / self.batch_size
        indexes = []
        priorities = []
        experiences = []
        for i in range(self.batch_size):
            # pick a segment
            a = segment * i
            b = segment * (i + 1)
            s = np.random.uniform(a, b)
            index, priority, experience = self.memory.get(s)
            indexes.append(index)
            priorities.append(priority)
            experiences.append(experience)
        sampling_probs = np.divide(priorities, self.memory.total())
        # importance sampling
        i_s_weights = (self.batch_size * sampling_probs) ** -self.beta
        i_s_weights /= np.max(i_s_weights)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        return i_s_weights, indexes, (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return self.memory.count
class MemoryDB:  # stored as ( s, a, r, s_ ) in SumTree
    e = 0.01
    a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001
    capacity = 100000
    max_priority = 1

    def __init__(self, host_name, db_name, collection_name):
        self.host_name = host_name
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = MongoClient(host_name, 27017)
        self.db = self.client[db_name]
        self.replay_memory_collection = self.db[collection_name]
        self.sum_tree = SumTree(self.capacity)
        # Rebuild the priority tree from the priorities persisted in MongoDB.
        memory_priorities = self.replay_memory_collection.find({}, {"priority": 1})
        for memory_priority in memory_priorities:
            self.sum_tree.add(memory_priority["priority"],
                              {"_id": memory_priority["_id"]})

    def retrieve_by_id(self, id):
        db_experiences = self.replay_memory_collection.find({"_id": id})
        return {
            **_pickle.loads(db_experiences[0]['binary'], encoding='latin1'),
            "_id": id
        }

    def _get_priority(self, error):
        return (error + self.e) ** self.a

    def add(self, error, experience):
        p = self._get_priority(error)
        experience_to_save = {}
        experience_to_save["terminal"] = experience["terminal"]
        experience_to_save["action_index"] = experience["action_index"]
        experience_to_save["actual_reward"] = experience["actual_reward"]
        experience_to_save["priority"] = self.max_priority
        experience_to_save["binary"] = _pickle.dumps(experience)
        id = self.replay_memory_collection.insert(experience_to_save)
        self.sum_tree.add(p, {"_id": id})

    def add_batch(self, experiences):
        for experience in experiences:
            self.add(self.max_priority, experience)

    def update(self, index, error, experience):
        p = self._get_priority(error)
        self.replay_memory_collection.update_one({"_id": experience["_id"]},
                                                 {"$set": {"priority": p}})
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        return self.replay_memory_collection.count()

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.sum_tree.total() / n
        priorities = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            # Only ids live in the tree; fetch the full experience from MongoDB.
            experience = self.retrieve_by_id(data["_id"])
            batch.append(experience)
            print(
                "action index: ", experience["action_index"],
                "reward: ", experience["actual_reward"],
                "priority: ", experience["priority"],
            )
            idxs.append(idx)
        sampling_probabilities = priorities / self.sum_tree.total()
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight