def __init__(self, buffer_size, alpha, beta_zero, beta_increment_size=0.001,
             epsilon=0.1, max_priority=1., seed=None):
    """Priority replay buffer initialiser.

    Args:
        buffer_size (int): capacity of the replay buffer.
        alpha (float): priority scaling hyperparameter.
        beta_zero (float): importance sampling scaling hyperparameter.
        beta_increment_size (float): beta annealing rate.
        epsilon (float): base priority to ensure non-zero sampling probability.
        max_priority (float): initial maximum priority.
        seed (int): seed for random number generator.
    """
    random.seed(seed)
    self.sum_tree = SumTree(buffer_size)
    self.memory = {}
    self.experience = namedtuple(
        "experience", ["state", "action", "reward", "next_state", "done"])
    self.buffer_size = buffer_size
    self.beta_increment_size = beta_increment_size
    self.max_priority = max_priority**alpha
    self.min_priority = max_priority**alpha
    self.last_min_update = 0
    self.alpha = alpha
    self.beta = beta_zero
    self.epsilon = epsilon
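For reference, a minimal sketch of how hyperparameters like these typically combine in proportional prioritised replay (the function name and defaults below are illustrative, not taken from the snippet above): priorities are p_i = (|delta_i| + epsilon)**alpha, sampling probabilities are P(i) = p_i / sum_k p_k, and importance-sampling weights are w_i = (N * P(i))**(-beta), normalised by the largest weight.

import numpy as np

def per_priorities_and_weights(td_errors, alpha=0.6, beta=0.4, epsilon=0.01):
    """Illustrative only: proportional-PER priorities and IS weights."""
    td_errors = np.asarray(td_errors, dtype=float)
    priorities = (np.abs(td_errors) + epsilon) ** alpha  # p_i = (|delta_i| + eps)^alpha
    probs = priorities / priorities.sum()                # P(i) = p_i / sum_k p_k
    weights = (len(td_errors) * probs) ** (-beta)        # w_i = (N * P(i))^-beta
    weights /= weights.max()                             # normalise so the largest weight is 1
    return priorities, probs, weights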
def __init__(self, batch_size, max_size, beta):
    self.batch_size = batch_size  # mini-batch size
    self.max_size = 2**math.floor(
        math.log2(max_size))  # keep the sum tree a complete binary tree
    self.beta = beta
    self._sum_tree = SumTree(max_size)
def __init__(self, buffer_size, seed):
    """Initialize a ReplayBuffer object.

    Params
    ======
        buffer_size (int): maximum size of buffer
        seed (int): random seed
    """
    self.memory = SumTree(buffer_size)
    self.experience = namedtuple("Experience",
                                 field_names=["state", "action", "reward", "next_state", "done"])
    self.seed = random.seed(seed)
    # epsilon: small amount to avoid zero priority.
    # alpha: in [0, 1], determines how much prioritization is used; with 0 we get the uniform case.
    # beta: controls importance-sampling compensation; it fully compensates for the non-uniform
    # probabilities when beta=1. The unbiased nature of the updates matters most near convergence
    # at the end of training, so we define a schedule on the exponent beta that starts from an
    # initial value and reaches 1 only at the end of learning.
    self.epsilon = 0.01
    self.alpha = 0.6
    beta_start = 0.4
    self.beta_end = 1.0
    self.beta = beta_start
    beta_increments = 200
    self.beta_increment = (self.beta_end - beta_start) / beta_increments
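The beta schedule sketched in the comments above (start from an initial value and reach 1.0 over a fixed number of increments) can be written as a simple linear anneal; this is an illustration of the idea, not code from the snippet:

def anneal_beta(beta, beta_increment, beta_end=1.0):
    """Illustrative linear beta schedule: one increment per sampling step, capped at beta_end."""
    return min(beta_end, beta + beta_increment)

# With beta_start = 0.4 and 200 increments, beta_increment = (1.0 - 0.4) / 200 = 0.003,
# so beta reaches 1.0 after 200 sampling steps.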
def __init__(self, max_size, alpha, eps):
    self.max_size = max_size
    self.alpha = alpha
    self.eps = eps
    self.tree = SumTree(max_size)
    self.last_idxs = None
    self.size = 0
def __init__(self, capacity, batch_size):
    self.capacity = capacity
    self.batch_size = batch_size
    self.tree = SumTree(capacity=capacity)
    self.alpha = 0.6
    self.beta = 0.4
    self.p_epsilon = 1e-4
def __init__(self, memory_size):
    self.memory_size = memory_size
    self.memory = SumTree(memory_size)
    self.epsilon = 0.0001  # small amount to avoid zero priority
    self.alpha = 0.6       # adj_pri = pri^alpha
    self.beta = 0.4        # importance sampling, from initial value increasing to 1
    self.beta_max = 1
    self.beta_increment_per_sampling = 0.001
    self.abs_err_upper = 1.  # clipped TD error
def __init__(self, action_size, buffer_size, batch_size, alpha, epsilon):
    self.action_size = action_size
    self.tree = SumTree(buffer_size)
    self.batch_size = batch_size
    self.experience = namedtuple(
        "Experience", field_names=["state", "action", "reward", "next_state", "done"])
    self.alpha = alpha
    self.epsilon = epsilon
def __init__(self, capacity):
    """
    Instantiate a priority-based memory capable of holding `capacity` experiences.
    Memories are sampled with frequency based on their priority.
    """
    # Circular-buffer, array-based tree with priorities as node values.
    self.tree = SumTree(capacity)
    self.e = 0.01  # Small constant to ensure all priorities > 0
    self.a = 0.6   # Constant to control the weight of error on priority
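None of the snippets here include the SumTree itself. The following is a minimal sketch of the array-based structure the docstring above describes, with the add/update/get/total/n_entries interface most of these buffers call; it is consistent with the two unit tests further down, but the exact implementation in any given repository may differ.

import numpy as np

class SumTree:
    """Illustrative array-based sum tree (an assumption about the interface the snippets use).

    Leaves hold priorities, internal nodes hold the sum of their children, and
    experiences are stored in a parallel circular buffer.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)        # internal nodes + leaves
        self.data = np.zeros(capacity, dtype=object)  # experiences, circular buffer
        self.write = 0
        self.n_entries = 0

    def _propagate(self, idx, change):
        # Push a priority change up to the root so every ancestor sum stays correct.
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        # Descend toward the leaf whose cumulative priority interval contains s.
        left, right = 2 * idx + 1, 2 * idx + 2
        if left >= len(self.tree):  # reached a leaf
            return idx
        if s <= self.tree[left]:
            return self._retrieve(left, s)
        return self._retrieve(right, s - self.tree[left])

    def total(self):
        return self.tree[0]  # the root holds the total priority mass

    def add(self, p, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, p)
        self.write = (self.write + 1) % self.capacity
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, idx, p):
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        data_idx = idx - self.capacity + 1
        return idx, self.tree[idx], self.data[data_idx]

    def __len__(self):
        return self.n_entries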
def __init__(self, e, a, beta, beta_increment_per_sampling, capacity, max_priority):
    self.capacity = capacity
    self.e = e
    self.a = a
    self.beta = beta
    self.beta_increment_per_sampling = beta_increment_per_sampling
    self.max_priority = max_priority
    self.sum_tree = SumTree(self.capacity)
def __init__(self):
    self.limit = MEMORY_CAPACITY
    self.err_tree = SumTree(MEMORY_CAPACITY)
    self.action_shape = (0, MEMORY_ACTION_CNT)
    self.reward_shape = (0, MEMORY_REWARD_CNT)
    self.terminal_shape = self.action_shape
    self.observation_shape = (0, MEMORY_CRITIC_FEATURE_NUM)
    self.store_times = 0
    self.Transition = namedtuple(
        'Transition', ('state', 'action', 'reward', 'next_state', 'terminal'))
def __init__(self, alpha, beta, beta_end, epsilon, num_steps, replay_size):
    self.alpha = alpha
    self.beta_start = beta
    self.beta_end = beta_end
    self.beta = beta
    self.epsilon = epsilon
    self.num_steps = num_steps
    self.memory = SumTree(replay_size)
    self.replay_size = replay_size
def __init__(self, tree_memory_length, error_multiplier=0.01, alpha=0.6, beta=0.4,
             beta_increment_per_sample=0.001):
    self.tree = SumTree(tree_memory_length)
    self.tree_memory_length = tree_memory_length
    self.error_multiplier = error_multiplier
    self.per_alpha = alpha
    self.per_beta_init = beta
    self.beta_increment_per_sample = beta_increment_per_sample
def __init__(self, capacity, alpha=0.6, beta=0.4, beta_anneal_step=0.001, epsilon=0.00000001):
    self.tree = SumTree(capacity)
    self.capacity = capacity
    self.a = alpha
    self.beta = beta
    self.beta_increment_per_sampling = beta_anneal_step
    self.e = epsilon
def __init__(self, host_name, db_name, collection_name, capacity):
    self.host_name = host_name
    self.db_name = db_name
    self.collection_name = collection_name
    self.capacity = capacity  # maximum number of experiences the sum tree can hold
    self.client = MongoClient(host_name, 27017)
    self.db = self.client[db_name]
    self.replay_memory_collection = self.db[collection_name]
    self.sum_tree = SumTree(self.capacity)
    memory_priorities = self.replay_memory_collection.find({}, {"priority": 1})
    for memory_priority in memory_priorities:
        self.sum_tree.add(memory_priority["priority"], {"_id": memory_priority["_id"]})
def test_len(self):
    instance = SumTree(4)
    instance.add(p=1, data=1)
    self.assertEqual(len(instance), 1)
    instance.add(p=2, data=2)
    self.assertEqual(len(instance), 2)
    instance.add(p=3, data=3)
    instance.add(p=4, data=4)
    instance.add(p=5, data=5)
    self.assertEqual(len(instance), 4)
def __init__(self, action_size, buffer_size, batch_size, seed):
    """Initialize a ReplayBuffer object.

    Params
    ======
        action_size (int): dimension of each action
        buffer_size (int): maximum size of buffer
        batch_size (int): size of each training batch
        seed (int): random seed
    """
    self.action_size = action_size
    self.memory = SumTree(buffer_size)
    self.batch_size = batch_size
    self.experience = namedtuple("Experience",
                                 field_names=["state", "action", "reward", "next_state", "done"])
    self.seed = random.seed(seed)
def __init__(self, params):
    buffer_size = params['buffer_size']
    batch_size = params['batch_size']
    mode = params['mode']
    self.__buffer_size = buffer_size
    self.__batch_size = batch_size
    self.__mode = mode
    self.__experience = namedtuple(
        "Experience", field_names=["state", "action", "reward", "next_state", "done"])
    self.__memory = SumTree(buffer_size)
    self.__memory_buffer = []
def __init__(self, device, memory_size, update_every=4, seed=0):
    """
    Initializes the data structure

    :param device: (torch.device) Object representing the device where to allocate tensors
    :param memory_size: (int) Maximum capacity of memory buffer
    :param update_every: (int) Number of steps between update operations
    :param seed: (int) Seed used for PRNG
    """
    self.device = device
    self.probability_weights = SumTree(capacity=memory_size, seed=seed)
    self.elements = deque(maxlen=memory_size)
    self.update_every = update_every
    self.step = 0
    self.experience = namedtuple("Experience",
                                 field_names=["state", "action", "reward", "next_state", "done"])
def __init__(self, capacity, state_size=37, epsilon=0.001, alpha=0.4, beta=0.3,
             beta_increment_per_sampling=0.001, abs_err_upper=1):
    self.tree = SumTree(capacity)
    self.epsilon = epsilon  # avoid zero priority, which would never be selected stochastically
    self.alpha = alpha      # trade priority off against randomness: alpha = 0 is pure uniform sampling, alpha = 1 is pure priority
    self.beta = beta        # importance-sampling exponent, annealed from small to 1 to weight corrections more towards the end of training
    self.beta_increment_per_sampling = beta_increment_per_sampling
    self.abs_err_upper = abs_err_upper  # clipped absolute error
    self.state_size = state_size
class PrioritizedReplayMemory:
    def __init__(self, capacity, alpha=0.6, eps=1e-2):
        self.tree = SumTree(capacity)
        self.alpha = alpha  # alpha determines how much prioritization is used
        self.eps = eps      # epsilon smooths priority: priority = (TD_error + eps) ** alpha

    def _get_priority(self, td_error):
        return (td_error + self.eps) ** self.alpha

    def current_length(self):
        return self.tree.current_length()

    def total_sum(self):
        return self.tree.total_sum()

    def push(self, event, td_error):
        priority = self._get_priority(td_error)
        self.tree.insert(event, priority)

    def sample(self, batch_sz):
        batch = []
        indices = []
        priorities = []
        segment = self.tree.total_sum() / batch_sz
        for i in range(batch_sz):
            l = segment * i
            r = segment * (i + 1)
            s = random.uniform(l, r)
            (idx, priority, data) = self.tree.get(s)
            batch.append(data)
            indices.append(idx)
            priorities.append(priority)
        samples = map(np.array, zip(*batch))
        return samples, indices, priorities

    def update(self, idx, td_error):
        if isinstance(idx, list):
            for i in range(len(idx)):
                priority = self._get_priority(td_error[i])
                self.tree.update(idx[i], priority)
        else:
            priority = self._get_priority(td_error)
            self.tree.update(idx, priority)
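A hypothetical usage sketch for the push/sample/update interface above, assuming a SumTree that provides the insert/total_sum/get/update methods this class calls; the placeholder transitions and TD errors are random stand-ins, not part of the original snippet:

import numpy as np

memory = PrioritizedReplayMemory(capacity=8)
for i in range(8):
    transition = (np.random.rand(4), i % 2, float(i), np.random.rand(4), False)
    memory.push(transition, td_error=np.random.rand())

# Draw a prioritized mini-batch: samples come back column-wise as arrays.
samples, indices, priorities = memory.sample(batch_sz=4)
states, actions, rewards, next_states, dones = samples

# After a learning step, refresh the priorities of the sampled transitions.
new_td_errors = np.random.rand(len(indices))  # stand-in for recomputed |TD errors|
memory.update(list(indices), list(new_td_errors))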
class MemoryDB:  # stored as ( s, a, r, s_ ) in SumTree
    def __init__(self, e, a, beta, beta_increment_per_sampling, capacity, max_priority):
        self.capacity = capacity
        self.e = e
        self.a = a
        self.beta = beta
        self.beta_increment_per_sampling = beta_increment_per_sampling
        self.max_priority = max_priority
        self.sum_tree = SumTree(self.capacity)

    def _get_priority(self, error):
        return min((self.max_priority, (error + self.e)**self.a))

    def add(self, experience, error=None):
        p = self._get_priority(error) if error is not None else self.max_priority
        self.sum_tree.add(p, experience)

    def add_batch(self, experiences):
        for experience in experiences:
            self.add(experience, self.max_priority)

    def update(self, index, error, experience):
        p = self._get_priority(error)
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        return self.sum_tree.getCount()

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.sum_tree.total() / n
        priorities = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = np.array(priorities) / self.sum_tree.total()
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight
class Memory(object):
    def __init__(self, batch_size, max_size, beta):
        self.batch_size = batch_size  # mini-batch size
        self.max_size = 2**math.floor(math.log2(max_size))  # keep the sum tree a complete binary tree
        self.beta = beta
        self._sum_tree = SumTree(max_size)

    def store_transition(self, s, a, r, s_, done):
        self._sum_tree.add((s, a, r, s_, done))

    def get_mini_batches(self):
        n_sample = self.batch_size if self._sum_tree.size >= self.batch_size else self._sum_tree.size
        total = self._sum_tree.get_total()
        step = total // n_sample
        points_transitions_probs = []
        for i in range(n_sample):
            v = np.random.uniform(i * step, (i + 1) * step - 1)
            t = self._sum_tree.sample(v)
            points_transitions_probs.append(t)
        points, transitions, probs = zip(*points_transitions_probs)

        # compute the importance-sampling ratios
        max_importance_ratio = (n_sample * self._sum_tree.get_min())**-self.beta
        importance_ratio = [(n_sample * probs[i])**-self.beta / max_importance_ratio
                            for i in range(len(probs))]
        return points, tuple(np.array(e) for e in zip(*transitions)), importance_ratio

    def update(self, points, td_error):
        for i in range(len(points)):
            self._sum_tree.update(points[i], td_error[i])
class PrioritisedMemory(object):
    def __init__(self, alpha, beta, beta_end, epsilon, num_steps, replay_size):
        self.alpha = alpha
        self.beta_start = beta
        self.beta_end = beta_end
        self.beta = beta
        self.epsilon = epsilon
        self.num_steps = num_steps
        self.memory = SumTree(replay_size)
        self.replay_size = replay_size

    def proportional_priority(self, td_error):
        return (np.abs(td_error) + self.epsilon)**self.alpha

    def add_memory(self, td_error, data):
        priority = self.proportional_priority(td_error)
        self.memory.add_memory(data, priority)
        self.beta = np.min([
            1.0, self.beta + (self.beta_end - self.beta_start) / self.num_steps
        ])

    def update_priority(self, index, td_error):
        new_priority = self.proportional_priority(td_error)
        self.memory.update_priority(index, new_priority)

    def minibatch_sample(self, minibatch_size):
        samples = []
        priorities = []
        priority_indexes = []
        interval = self.memory.priority_total() / minibatch_size
        for i in range(minibatch_size):
            sample = np.random.uniform(i * interval, (i + 1) * interval)
            priority_index, priority, data = self.memory.get(sample)
            samples.append(data)
            priorities.append(priority)
            priority_indexes.append(priority_index)
        sampling_probabilities = np.array(priorities) / self.memory.priority_total()
        importance_weights = np.power(self.replay_size * sampling_probabilities, -self.beta)
        importance_weights /= np.max(importance_weights)
        return priority_indexes, samples, importance_weights
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    def __init__(self, capacity, alpha=0.6, beta=0.4, beta_anneal_step=0.001, epsilon=0.00000001):
        tree_capacity = 1
        while tree_capacity < capacity:  # round capacity up to the next power of two
            tree_capacity *= 2
        self.tree = SumTree(tree_capacity)
        self.capacity = tree_capacity
        self.a = alpha
        self.beta = beta
        self.beta_increment_per_sampling = beta_anneal_step
        self.e = epsilon

    def _get_priority(self, error):
        # Direct proportional prioritization
        return (np.abs(error) + self.e)**self.a

    def add(self, error, sample):
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            data = 0
            while data == 0:
                s = random.uniform(a, b)
                (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
        sampling_probabilities = np.array(priorities) / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight

    def step(self):
        self.beta = np.min(
            [1. - self.e, self.beta + self.beta_increment_per_sampling])

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)
def __init__(self, observation_len: int, action_len: int, reward_len: int,
             capacity: int, alpha: float = 0.6):
    super(PriorityBuffer, self).__init__(observation_len, action_len, reward_len, capacity)
    self.sum_tree = SumTree(capacity)
    self.max_priority = alpha
    self.min_priority = alpha
    self.alpha = alpha
def load(self, lst_serializable):
    """
    Load picklable representation of Replay Buffer.
    Inverse function of serializable.
    """
    super().load(lst_serializable[0])
    self.max_priority = lst_serializable[1][0]
    self.min_priority = lst_serializable[1][1]
    self.alpha = lst_serializable[1][2]
    capacity = lst_serializable[1][3]
    tree_index = range(capacity)
    self.sum_tree = SumTree(capacity)
    self.sum_tree.update_values(tree_index, lst_serializable[1][4])
class Memory:
    e = 0.01
    a = 0.6

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _getPriority(self, error):
        return (error + self.e)**self.a

    def add(self, error, sample):
        p = self._getPriority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        segment = self.tree.total() / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        p = self._getPriority(error)
        self.tree.update(idx, p)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, buffer_size, alpha):
        self.capacity = buffer_size
        self.tree = SumTree(buffer_size)
        self.alpha = alpha
        self.max_priority = 1
        #self.beta_initial = ??
        #self.beta_steps = ??

    def add(self, experience):
        self.tree.add(self.max_priority, experience)

    def update(self, index, experience, td_error):
        priority = (abs(td_error) + 0.0001)**self.alpha
        self.tree.update(index, priority)
        if self.max_priority < priority:
            self.max_priority = priority

    def sample(self, batch_size):
        indexes = []
        batches = []
        total = self.tree.total()
        section = total / batch_size
        for i in range(batch_size):
            r = section * i + np.random.random() * section
            (idx, priority, experience) = self.tree.get(r)
            indexes.append(idx)  # kept for the later priority update
            batches.append(experience)
        return (indexes, batches)
def test_add(self):
    instance = SumTree(4)
    instance.add(p=1, data=1)
    np.testing.assert_array_equal([1, 1, 0, 1, 0, 0, 0], instance.tree)
    instance.add(p=2, data=2)
    np.testing.assert_array_equal([3, 3, 0, 1, 2, 0, 0], instance.tree)
def __init__(
    self,
    buffer_size,
    batch_size,
    seed,
    beta_start=0.4,
    delta_beta=1e-5,
    alpha=0.6,
    eps=1e-8,
):
    """Initialize PER.

    Args:
        buffer_size (int): Size of replay buffer. The actual size will be the
            first power of 2 greater than buffer_size.
        batch_size (int): Size of batches to draw.
        seed (float): Seed.
        beta_start (float): Initial value for beta (importance sampling exponent).
        delta_beta (float): Beta increment at each time step.
        alpha (float): Priority exponent.
        eps (float): Small positive number added to priorities so that zero-error
            transitions can still be sampled.
    """
    # Depth of sum tree
    depth = int(math.log2(buffer_size)) + 1
    super(PrioritizeReplayBuffer, self).__init__(2**depth, batch_size, seed)
    # Initialize sum tree to keep track of the sum of priorities
    self.priorities = SumTree(depth)
    # Current max priority
    self.max_p = 1.0
    # PER Parameters
    self.alpha = alpha
    self.eps = eps
    self.beta = beta_start
    self.delta_beta = delta_beta
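A small illustrative check of the sizing rule stated in the docstring above (the stored capacity is the first power of two strictly greater than buffer_size); the loop and values are only an example:

import math

for buffer_size in (100, 1000, 65536, 100000):
    depth = int(math.log2(buffer_size)) + 1
    print(buffer_size, "->", 2 ** depth)  # 100 -> 128, 1000 -> 1024, 65536 -> 131072, 100000 -> 131072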
class PERMemory:
    EPSILON = 0.0001
    ALPHA = 0.5
    BETA = 0.4
    size = 0

    def __init__(self, config, capacity):
        self.config = config
        self.capacity = capacity
        self.tree = SumTree(capacity)

    def _getPriority(self, td_error):
        return (td_error + self.EPSILON) ** self.ALPHA

    def push(self, transition):
        self.size += 1
        priority = self.tree.max()
        if priority <= 0:
            priority = 1
        self.tree.add(priority, transition)

    def sample(self, size, episode):
        transitions = []
        indexes = []
        weights = np.empty(size, dtype='float32')
        total = self.tree.total()
        beta = self.BETA + (1 - self.BETA) * episode / self.config.num_episodes
        beta = min(1.0, beta)
        for i, rand in enumerate(np.random.uniform(0, total, size)):
            (idx, priority, data) = self.tree.get(rand)
            transitions.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total) ** (-beta)
        return (indexes, transitions, weights / weights.max())

    def update(self, idx, td_error):
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)

    def __len__(self):
        return self.size
def __init__(self, config, capacity):
    self.config = config
    self.capacity = capacity
    self.tree = SumTree(capacity)