Example #1
class PERMemory:
    EPSILON = 0.0001
    ALPHA = 0.5
    BETA = 0.4
    size = 0

    def __init__(self, config, capacity):
        self.config = config
        self.capacity = capacity
        self.tree = SumTree(capacity)

    def _getPriority(self, td_error):
        # Take the absolute TD error so a negative error cannot yield an invalid priority.
        return (abs(td_error) + self.EPSILON) ** self.ALPHA

    def push(self, transition):
        self.size = min(self.size + 1, self.capacity)  # do not report more entries than the tree holds

        priority = self.tree.max()
        if priority <= 0:
            priority = 1

        self.tree.add(priority, transition)

    def sample(self, size, episode):
        batch = []
        indexes = []
        weights = np.empty(size, dtype='float32')
        total = self.tree.total()
        beta = self.BETA + (1 - self.BETA) * episode / self.config.num_episodes
        beta = min(1.0, beta)

        for i, rand in enumerate(np.random.uniform(0, total, size)):
            (idx, priority, data) = self.tree.get(rand)
            batch.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total) ** (-beta)

        return (indexes, batch, weights / weights.max())

    def update(self, idx, td_error):
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)

    def __len__(self):
        return self.size
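
All of these snippets depend on a SumTree class that the page does not show, and each repository ships its own variant. For orientation only, a minimal sketch of such a structure, assuming the add/get/update/total/max interface used in Example #1, could look like the following (an illustrative reconstruction, not the implementation any of the examples were written against):

import numpy as np


class SumTree:
    """Minimal binary sum tree: leaves hold priorities, internal nodes hold partial sums."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)     # internal nodes followed by `capacity` leaves
        self.data = np.zeros(capacity, dtype=object)
        self.write = 0                             # next leaf to overwrite (circular buffer)

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def total(self):
        return self.tree[0]                        # root holds the sum of all priorities

    def max(self):
        return self.tree[-self.capacity:].max()    # largest leaf priority

    def add(self, priority, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, priority)
        self.write = (self.write + 1) % self.capacity

    def update(self, idx, priority):
        change = priority - self.tree[idx]
        self.tree[idx] = priority
        self._propagate(idx, change)

    def get(self, s):
        """Return (tree index, priority, data) of the leaf whose cumulative range contains s."""
        idx = 0
        while 2 * idx + 1 < len(self.tree):        # descend until idx is a leaf
            left = 2 * idx + 1
            if s <= self.tree[left]:
                idx = left
            else:
                s -= self.tree[left]
                idx = left + 1
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]

Because the root stores the total priority mass, drawing a uniform value in [0, total()) and walking down with get() selects each stored transition with probability proportional to its priority.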
Example #2
class Memory(object):
    def __init__(self, batch_size, max_size, beta):
        self.batch_size = batch_size  # mini-batch size
        self.max_size = 2**math.floor(
            math.log2(max_size))  # ensure the sum tree is a complete binary tree
        self.beta = beta

        self._sum_tree = SumTree(self.max_size)  # use the rounded-down size so the tree stays complete

    def store_transition(self, s, a, r, s_, done):
        self._sum_tree.add((s, a, r, s_, done))

    def get_mini_batches(self):
        n_sample = self.batch_size if self._sum_tree.size >= self.batch_size else self._sum_tree.size
        total = self._sum_tree.get_total()

        step = total // n_sample
        points_transitions_probs = []
        for i in range(n_sample):
            v = np.random.uniform(i * step, (i + 1) * step - 1)
            t = self._sum_tree.sample(v)
            points_transitions_probs.append(t)

        points, transitions, probs = zip(*points_transitions_probs)

        # Compute the importance-sampling ratios
        # max_importance_ratio = (n_sample * self._sum_tree.get_min())**-self.beta
        max_importance_ratio = (n_sample *
                                (self._sum_tree.get_min() + 0.0001))
        max_importance_ratio = max_importance_ratio**-self.beta

        importance_ratio = [
            (n_sample * probs[i])**-self.beta / max_importance_ratio
            for i in range(len(probs))
        ]

        return points, tuple(np.array(e)
                             for e in zip(*transitions)), importance_ratio

    def update(self, points, td_error):
        for i in range(len(points)):
            self._sum_tree.update(points[i], td_error[i])
Example #3
    def test_len(self):
        instance = SumTree(4)

        instance.add(p=1, data=1)
        self.assertEqual(len(instance), 1)

        instance.add(p=2, data=2)
        self.assertEqual(len(instance), 2)

        instance.add(p=3, data=3)
        instance.add(p=4, data=4)
        instance.add(p=5, data=5)

        self.assertEqual(len(instance), 4)
Example #4
class Memory(object):
    e = 0.05

    def __init__(self, capacity, pr_scale):
        self.capacity = capacity
        self.memory = ST(self.capacity)
        self.pr_scale = pr_scale
        self.max_pr = 0

    def get_priority(self, error):
        return (error + self.e)**self.pr_scale

    def remember(self, sample, error):
        p = self.get_priority(error)

        self.max_pr = max(self.max_pr, p)  # keep track of the running maximum priority
        self.memory.add(self.max_pr, sample)

    def sample(self, n):
        sample_batch = []
        sample_batch_indices = []
        sample_batch_priorities = []
        segment = self.memory.total() / n  # width of each priority segment

        for i in range(n):
            left = segment * i
            right = segment * (i + 1)

            s = random.uniform(left, right)
            idx, pr, data = self.memory.get(s)
            sample_batch.append((idx, data))
            sample_batch_indices.append(idx)
            sample_batch_priorities.append(pr)

        return [sample_batch, sample_batch_indices, sample_batch_priorities]

    def update(self, batch_indices, errors):
        for i in range(len(batch_indices)):
            p = self.get_priority(errors[i])
            self.memory.update(batch_indices[i], p)
Example #5
class Replay_Memory:
    def __init__(self):
        global MEMORY_LEN
        self.tree = SumTree(MEMORY_LEN)

    def add(self, error, sample):
        global MEMORY_BIAS, MEMORY_POW
        priority = (error + MEMORY_BIAS)**MEMORY_POW
        self.tree.add(priority, sample)

    def sample(self):
        """
         Get a sample batch of the replay memory
        Returns:
         batch: a batch with one sample from each segment of the memory
        """
        global BATCH_SIZE
        batch = []
        # We want one representative of each distribution segment in the batch,
        # e.g. BATCH_SIZE=2: the batch contains one sample from [min, median]
        # and one from [median, max]; see the stratified-sampling sketch after this class.
        segment = self.tree.total() / BATCH_SIZE
        for i in range(BATCH_SIZE):
            minimum = segment * i
            maximum = segment * (i + 1)
            s = random.uniform(minimum, maximum)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        """
         Updates one entry in the replay memory
        Args:
         idx: the position of the outdated transition in the memory
         error: the newly calculated error
        """
        priority = (error + MEMORY_BIAS)**MEMORY_POW
        self.tree.update(idx, priority)
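
The segment trick referenced in the comment above is stratified sampling over the cumulative priority mass: [0, total) is split into BATCH_SIZE equal segments and one value is drawn uniformly from each, which spreads the batch across the distribution while still sampling proportionally to priority. A small self-contained illustration of just that idea, using a plain cumulative sum in place of the SumTree (toy priorities, not values from any of these repositories):

import random

import numpy as np

priorities = np.array([1.0, 4.0, 2.0, 3.0])      # hypothetical leaf priorities
cumulative = np.cumsum(priorities)               # [1, 5, 7, 10]
total, batch_size = cumulative[-1], 2

segment = total / batch_size                     # width of each segment, here 5.0
for i in range(batch_size):
    s = random.uniform(segment * i, segment * (i + 1))
    leaf = int(np.searchsorted(cumulative, s))   # the same lookup SumTree.get(s) performs in O(log n)
    print(i, round(s, 2), "-> leaf", leaf)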
Example #6
class ReplayMemory(object):
    def __init__(self, max_size, alpha, eps):
        self.max_size = max_size
        self.alpha = alpha
        self.eps = eps

        self.tree = SumTree(max_size)
        self.last_idxs = None
        self.size = 0

    def get_batch(self, batch_size):
        self.last_idxs = []

        ret = []
        for i in range(min(batch_size, self.size)):
            s = random.random() * self.tree.total()

            idx, _, data = self.tree.get(s)

            ret.append(pickle.loads(zlib.decompress(data)))
            self.last_idxs.append(idx)

        return ret

    def update(self, losses):
        for i in range(len(self.last_idxs)):
            self.tree.update(self.last_idxs[i],
                             math.pow(losses[i] + self.eps, self.alpha))

    def add_element(self, new_el, loss):
        self.size = min(self.max_size, self.size + 1)

        p = math.pow(loss + self.eps, self.alpha)
        self.tree.add(p, zlib.compress(pickle.dumps(new_el)))

    def __len__(self):
        return self.size
Example #7
    def create_tree(sample):
        tree = SumTree(len(sample))
        for e in sample:
            tree.add(p=e, data=e)

        return tree
Example #8
class PriorityMemory(SimpleMemory):
    PER_e = 0.01  # small constant that prevents any experience from having zero probability of being sampled
    PER_a = 0.6  # trades off sampling only high-priority experiences against sampling uniformly at random
    PER_b = 0.4  # importance-sampling exponent, annealed from its initial value towards 1
    PER_b_increment_per_sampling = 0.001
    absolute_error_upper = 1.  # clipped absolute error

    def __init__(self, obs_dim, act_dim, size, act_dtype):
        SimpleMemory.__init__(self, obs_dim, act_dim, size, act_dtype)
        self.tree = SumTree(size)
        self.tree_lock = Lock()

    def store(self, obs, act, rew, next_obs, done):
        # Find the max priority
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])

        # If the max priority is 0 we cannot use it as the new priority, since that experience
        # would never have a chance to be selected, so we fall back to a minimum priority.
        if max_priority == 0:
            max_priority = self.absolute_error_upper

        insertion_pos = super().store(obs, act, rew, next_obs, done)
        self.tree_lock.acquire()
        insertion_pos_tree = self.tree.add(
            max_priority)  # set the max p for new p
        self.tree_lock.release()
        assert insertion_pos == insertion_pos_tree

    def sample_batch(self, batch_size):
        #idxs = np.random.randint(0, self._size, size=batch_size)
        #return self.obs1_buf[idxs],self.acts_buf[idxs],self.rews_buf[idxs],self.obs2_buf[idxs],self.done_buf[idxs]

        mem_idxs, tree_idxs, b_ISWeights =\
            np.empty((batch_size,), dtype=np.int32),\
            np.empty((batch_size,), dtype=np.int32),\
            np.empty((batch_size, 1), dtype=np.float32)

        # Calculate the priority segment.
        # As explained in the paper, we divide the range [0, p_total] into batch_size segments.
        priority_segment = self.tree.total_priority / batch_size  # priority segment

        # Anneal PER_b towards 1 each time we sample a new minibatch.
        self.PER_b = np.min(
            [1., self.PER_b + self.PER_b_increment_per_sampling])  # max = 1

        # Calculating the max_weight
        #print('### pp: {}'.format(-self.tree.capacity))
        #print('### pp: {}'.format(self.tree.tree[-self.tree.capacity:]))
        #print('### pp: {}'.format(np.min(self.tree.tree[-self.tree.capacity:])))
        #p_min = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_priority
        p_min = self.tree.p_min
        assert p_min > 0
        max_weight = (p_min * batch_size)**(-self.PER_b)
        assert max_weight > 0

        for i in range(batch_size):
            """
            A value is uniformly sample from each range
            """
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)
            """
            Experience that correspond to each value is retrieved
            """
            assert self.tree.data_pointer > 0
            self.tree_lock.acquire()
            index, priority = self.tree.get_leaf(value)
            self.tree_lock.release()
            assert priority > 0, "### index {}".format(index)

            #P(j)
            sampling_probabilities = priority / self.tree.total_priority

            #  IS = (1/N * 1/P(i))**b /max wi == (N*P(i))**-b  /max wi
            b_ISWeights[i, 0] = batch_size * sampling_probabilities
            assert b_ISWeights[i, 0] > 0
            b_ISWeights[i, 0] = np.power(b_ISWeights[i, 0], -self.PER_b)
            b_ISWeights[i, 0] = b_ISWeights[i, 0] / max_weight

            mem_idxs[i] = index - self.max_size + 1
            tree_idxs[i] = index
            #assert b_idx[i] < self.max_size , "{} and {}".format(b_idx[i], self.max_size)
        return self.obs1_buf[mem_idxs],\
            self.acts_buf[mem_idxs],\
            self.rews_buf[mem_idxs],\
            self.obs2_buf[mem_idxs],\
            self.done_buf[mem_idxs],\
            tree_idxs,\
            b_ISWeights

    """
    Update the priorities on the tree
    """

    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.PER_e  # add a small constant so no priority is exactly zero
        clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
        ps = np.power(clipped_errors, self.PER_a)

        self.tree_lock.acquire()
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)
        self.tree_lock.release()
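
The importance-sampling weight written out in the comment inside the sampling loop, IS = (N * P(i))**(-b) / max_j w_j, can be checked with a few lines of NumPy. This is a toy illustration of the normalisation that sample_batch performs with p_min and max_weight; the priorities below are made up:

import numpy as np

priorities = np.array([0.5, 1.0, 2.0, 0.1])      # hypothetical leaf priorities p_i
probs = priorities / priorities.sum()            # P(i) = p_i / p_total
N, beta = len(priorities), 0.4

weights = (N * probs) ** (-beta)                 # (N * P(i))**(-beta)
max_weight = (N * probs.min()) ** (-beta)        # the largest weight comes from the smallest P(i)
print(weights / max_weight)                      # normalised so the largest weight is exactly 1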
Example #9
class ReplayBuffer:
    def __init__(self, params):

        buffer_size = params['buffer_size']
        batch_size = params['batch_size']
        mode = params['mode']

        self.__buffer_size = buffer_size
        self.__batch_size = batch_size
        self.__mode = mode

        self.__experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.__memory = SumTree(buffer_size)
        self.__memory_buffer = []

    def get_batch_size(self):
        return self.__batch_size

    def is_ready(self):
        return len(self) >= self.__batch_size

    def add(self, state, action, reward, next_state, done):
        self.__memory_buffer.append(
            self.__experience(state, action, reward, next_state, done))

    def sample(self):

        buf_len = len(self.__memory_buffer)
        mem_len = self.__batch_size - buf_len

        experiences = []
        indices = []
        probs = []

        # if self.__mode['PER']:
        if mem_len:
            #segment = self.__memory.total() / mem_len
            for i in range(mem_len):
                #s = random.uniform(segment * i, segment * (i + 1))
                s = random.uniform(0, self.__memory.total())
                idx, p, e = self.__memory.get(s)
                experiences.append(e)
                indices.append(idx)
                probs.append(p / self.__memory.total())

        for e in self.__memory_buffer:
            # Add experience to the buffer and record its index
            experiences.append(e)
            #if self.__mode['PER']:
            idx = self.__memory.add(0.0, e)  # new experiences start with priority 0 until updated
            indices.append(idx)
            probs.append(1 / len(self))

        self.__memory_buffer.clear()

        states = torch.from_numpy(
            np.vstack([e.state for e in experiences
                       if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences
                       if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences
                       if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences
                       if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences
                       if e is not None]).astype(np.uint8)).float().to(device)

        return states, actions, rewards, next_states, dones, indices, probs

    def update(self, indices, p_values):
        for idx, p in zip(indices, p_values):
            self.__memory.update(idx, p)

    def __len__(self):
        return max(len(self.__memory), len(self.__memory_buffer))
Example #10
class PriorityBuffer:
    # Inspired by implementation from: https://github.com/rlcode/per/blob/master/prioritized_memory.py

    def __init__(self, action_size, agent_config):
        """Initialize a PriorityBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            a (float): amount of prioritisation in the sampling (0 == uniform, 1. == priority only)
            beta_start (float): start of beta value for prioritised buffer
            beta_max_steps (int): max number of steps to reach beta value of 1.
        """
        self.action_size = action_size
        self.tree = SumTree(capacity=agent_config.buffer_size)
        self.batch_size = agent_config.batch_size
        # self.seed = random.seed(buffer_config.seed)
        self.epsilon = agent_config.buffer_epsilon
        # how much prioritisation is used: a = 0 is pure uniform sampling, a = 1 is priority only
        self.alpha = agent_config.alpha
        self.beta = agent_config.beta_start
        self.beta_start = agent_config.beta_start
        self.beta_end = agent_config.beta_end
        self.beta_increment_per_sampling = (self.beta_end - self.beta_start) / agent_config.beta_max_steps

    def add(self, sample, error):
        """Add a new experience to memory."""
        p = self._get_priority(error)
        state, action, reward, next_state, done = sample
        e = Experience(state, action, reward, next_state, done)
        self.tree.add(p, e)

    def _get_priority(self, error):
        return (abs(error) + self.epsilon) ** self.alpha

    def sample(self):
        experiences = []
        idxs = []
        segment = self.tree.total() / self.batch_size
        priorities = []


        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            if isinstance(data, Experience):
                priorities.append(p)
                experiences.append(data)
                idxs.append(idx)
            else:
                print("WHAT THE HECK !!!")

        sampling_probabilities = np.array(priorities) / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(
            device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(
            device)

        self.beta = np.min([self.beta_end, self.beta + self.beta_increment_per_sampling])
        return (states, actions, rewards, next_states, dones), idxs, is_weight

    def update(self, idx, error):
        # Not required in normal ReplayBuffer
        self.tree.update(idx, self._get_priority(error))

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.tree)
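
The beta schedule configured in __init__ above is a linear anneal from beta_start to beta_end over beta_max_steps sampling calls. A standalone sketch of that schedule with made-up numbers (the real values come from agent_config):

beta_start, beta_end, beta_max_steps = 0.4, 1.0, 1000
beta_increment_per_sampling = (beta_end - beta_start) / beta_max_steps

beta = beta_start
for _ in range(beta_max_steps):
    beta = min(beta_end, beta + beta_increment_per_sampling)

print(beta)  # approximately beta_end (1.0) after beta_max_steps sampling calls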
Example #11
class PrioritisedReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, alpha, epsilon):
        self.action_size = action_size
        self.tree = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.alpha = alpha
        self.epsilon = epsilon

    def add(self, error, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        p = self._get_priority(error)
        self.tree.add(p, e)

    def sample(self, beta):
        # Split the total priority mass into segments so samples are spread across it.
        segment = self.tree.total() / self.batch_size

        experiences = []
        priorities = []
        idxs = []

        for i in range(self.batch_size):
            start = segment * i
            end = segment * (i + 1)
            s = random.uniform(start, end)
            idx, p, e = self.tree.get(s)
            if e:
                priorities.append(p)
                experiences.append(e)
                idxs.append(idx)

        probs = np.array(priorities) / self.tree.total()  # sampling probabilities, "big P"
        weights = np.power(self.tree.n_entries * probs, -beta)
        weights /= weights.max()  # scale so max weight is 1

        states = torch.from_numpy(np.vstack([e.state for e in experiences
                                             ])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences
                                              ])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences
                                              ])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences
                       ]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(weights).float().to(device)

        return (states, actions, rewards, next_states, dones, weights, idxs)

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def _get_priority(self, error):
        return (np.abs(error) + self.epsilon)**self.alpha

    def __len__(self):
        """Return the current size of internal memory."""
        return self.tree.n_entries
Example #12
class Memory(object):
    """
    This SumTree code is a modified version of the original code from:
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    """
    beta = MEMORY_BETA

    def __init__(self):
        self.limit = MEMORY_CAPACITY
        self.err_tree = SumTree(MEMORY_CAPACITY)
        self.action_shape = (0, MEMORY_ACTION_CNT)
        self.reward_shape = (0, MEMORY_REWARD_CNT)
        self.terminal_shape = self.action_shape
        self.observation_shape = (0, MEMORY_CRITIC_FEATURE_NUM)
        self.store_times = 0
        self.Transition = namedtuple(
            'Transition',
            ('state', 'action', 'reward', 'next_state', 'terminal'))

    def size(self):
        return self.limit if self.store_times > self.limit else self.store_times

    def sample(self, batch_size):
        idxes = np.empty(self.reward_shape, dtype=np.int32)
        isw = np.empty(self.reward_shape, dtype=np.float32)
        obs0 = np.empty(self.observation_shape, dtype=np.float32)
        obs1 = np.empty(self.observation_shape, dtype=np.float32)
        actions = np.empty(self.action_shape, dtype=np.float32)
        rewards = np.empty(self.reward_shape, dtype=np.float32)
        terminals = np.empty(self.terminal_shape, dtype=bool)  # np.bool is removed in recent NumPy releases
        nan_state = np.array([np.nan] * self.observation_shape[1])

        self.beta = np.min([1., self.beta + MEMORY_BETA_INC_RATE])  # max = 1
        max_td_err = np.max(self.err_tree.tree[-self.err_tree.capacity:])
        idx_set = set()
        # Sample at most batch_size * 2 times to collect batch_size distinct transitions.
        for i in range(batch_size * 2):
            v = np.random.uniform(0, self.err_tree.total_p)
            idx, td_err, trans = self.err_tree.get_leaf(v)
            if batch_size == len(idx_set):
                break
            if idx not in idx_set:
                idx_set.add(idx)
            else:
                continue
            if (trans.state == 0).all():
                continue
            idxes = np.row_stack((idxes, np.array([idx])))
            isw = np.row_stack((isw,
                                np.array([
                                    np.power(
                                        self._getPriority(td_err) / max_td_err,
                                        -self.beta)
                                ])))
            obs0 = np.row_stack((obs0, trans.state))
            obs1 = np.row_stack(
                (obs1,
                 nan_state if trans.terminal.all() else trans.next_state))
            actions = np.row_stack((actions, trans.action))
            rewards = np.row_stack((rewards, trans.reward))
            terminals = np.row_stack((terminals, trans.terminal))

        result = {
            'obs0': array_min2d(obs0),
            'actions': array_min2d(actions),
            'rewards': array_min2d(rewards),
            'obs1': array_min2d(obs1),
            'terminals': array_min2d(terminals),
        }

        return idxes, result, isw

    def _getPriority(self, error):
        return (error + EPSILON)**MEMORY_ALPHA

    def append(self, obs0, action, reward, obs1, terminal, err, training=True):
        if not training:
            return
        trans = self.Transition(obs0, action, reward, obs1, terminal)
        self.err_tree.add(self._getPriority(err), trans)
        self.store_times += 1

    def batch_update(self, tree_idx, errs):
        errs = np.abs(errs) + EPSILON  # convert to abs and avoid 0
        ps = np.power(errs, MEMORY_ALPHA)
        for ti, p in zip(tree_idx, ps):
            self.err_tree.update(ti, p[0])

    @property
    def nb_entries(self):
        return self.store_times
Example #13
class MemoryDB:  # stored as ( s, a, r, s_ ) in SumTree
    e = 0.01
    a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001
    capacity = 100000
    max_priority = 1

    def __init__(self, host_name, db_name, collection_name):
        self.host_name = host_name
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = MongoClient(host_name, 27017)
        self.db = self.client[db_name]
        self.replay_memory_collection = self.db[collection_name]
        self.sum_tree = SumTree(self.capacity)
        memory_priorities = self.replay_memory_collection.find({},
                                                               {"priority": 1})
        for memory_priority in memory_priorities:
            self.sum_tree.add(memory_priority["priority"],
                              {"_id": memory_priority["_id"]})

    def retrieve_by_id(self, id):
        db_experiences = self.replay_memory_collection.find({"_id": id})
        return {
            **_pickle.loads(db_experiences[0]['binary'], encoding='latin1'),
            "_id": id,
        }

    def _get_priority(self, error):
        return (error + self.e)**self.a

    def add(self, error, experience):
        p = self._get_priority(error)
        experience_to_save = {}
        experience_to_save["terminal"] = experience["terminal"]
        experience_to_save["action_index"] = experience["action_index"]
        experience_to_save["actual_reward"] = experience["actual_reward"]
        experience_to_save["priority"] = self.max_priority
        experience_to_save["binary"] = _pickle.dumps(experience)
        id = self.replay_memory_collection.insert(experience_to_save)

        self.sum_tree.add(p, {"_id": id})

    def add_batch(self, experiences):
        for experience in experiences:
            self.add(self.max_priority, experience)

    def update(self, index, error, experience):
        p = self._get_priority(error)
        self.replay_memory_collection.update_one({"_id": experience["_id"]},
                                                 {"$set": {
                                                     "priority": p
                                                 }})
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        return self.replay_memory_collection.count()

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.sum_tree.total() / n
        priorities = []

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            experience = self.retrieve_by_id(data["_id"])
            batch.append(experience)
            print(
                "action index: ",
                experience["action_index"],
                "reward: ",
                experience["actual_reward"],
                "priority: ",
                experience["priority"],
            )
            idxs.append(idx)

        sampling_probabilities = np.array(priorities) / self.sum_tree.total()
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight
Example #14
class PrioritizedReplayBuffer:
    """
    Memory buffer responsible for Prioritized Experience Replay.

    This buffer stores up to memory_size experiences in a circular
    array-like data structure.  Each experience is also associated
    with a probability weight.

    Batches may be sampled (with replacement) from this implied
    probability distribution.  The provided weights should
    be non-negative, but are not required to add up to 1.
    """

    def __init__(self, device, memory_size, update_every=4, seed=0):
        """  Initializes the data structure

        :param device:  (torch.device) Object representing the device where to allocate tensors
        :param memory_size: (int) Maximum capacity of memory buffer
        :param update_every: (int) Number of steps between update operations
        :param seed:  (int) Seed used for PRNG
        """
        self.device = device
        self.probability_weights = SumTree(capacity=memory_size, seed=seed)
        self.elements = deque(maxlen=memory_size)
        self.update_every = update_every

        self.step = 0
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """  Adds a experience tuple (s, a, r, s', done) to memory

        :param state:  (array-like)  State value from experience tuple
        :param action:  (int)  Action value from experience tuple
        :param reward:  (float)  Reward value from experience tuple
        :param next_state:  (array-like)  Next state value from experience tuple
        :param done:  (bool)  Done flag from experience tuple
        """
        e = self.experience(state, action, reward, next_state, done)
        self.elements.append(e)
        self.step += 1

        # Add batch of experiences to memory, with max initial weight
        if self.step >= self.update_every:
            self.probability_weights.add(self.step)
            self.step = 0

    def sample(self, batch_size, alpha, beta):
        """  Samples a batch of examples with replacement from the buffer.

        :param batch_size:  (int)  Number of samples to sample
        :param alpha:  (float) PER probability hyperparameter
        :param beta:  (float) PER probability hyperparameter
        :return:
            states:  (list)  States from sampled experiences
            actions:  (list)  Actions from sampled experiences
            rewards:  (list)  Rewards from sampled experiences
            next_states:  (list)  Next states from sampled experiences
            dones:  (list)  Done flags from sampled experiences
            indexes:  (list)  Indexes of sampled experiences
        """
        indexes = self.probability_weights.sample(batch_size=batch_size, alpha=alpha, beta=beta)
        experiences = [self.elements[i] for i in indexes]

        # Copy experience tensors to device
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)

        return states, actions, rewards, next_states, dones, indexes

    def update(self, indexes, weights):
        """  Updates the probability weights associated with the provided indexes.

        :param indexes:  (array indexes) Indexes to have weights updated
        :param weights:  (list) New weights for the provided indexes
        """
        self.probability_weights.update(indexes, weights)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.probability_weights)
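
The class docstring above describes sampling batches with replacement from a distribution implied by non-negative, unnormalised weights. A rough sketch of that behaviour using numpy.random.choice in place of the SumTree the class actually delegates to (the weights are invented for illustration):

import numpy as np

weights = np.array([0.0, 2.0, 1.0, 3.0])             # unnormalised, non-negative
probs = weights / weights.sum()                      # normalise into a probability distribution
batch = np.random.choice(len(weights), size=4, replace=True, p=probs)
print(batch)                                         # index 0 can never appear; index 3 is most likely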
Example #15
class PrioritizedReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            seed (int): random seed
        """
        self.memory = SumTree(buffer_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

        # epsilon: small amount to avoid zero priority
        # alpha: [0~1] determines how much prioritization is used. with 0, we would get the uniform case
        # beta: Controls importance-sampling compensation. fully compensates for the non-uniform probabilities
        #   when beta=1. The unbiased nature of the updates is most important near convergence at the end of
        #   training, so we define a schedule on the exponent beta that starts from initial value and reaches 1
        #   only at the end of learning.

        self.epsilon = 0.01
        self.alpha = 0.6
        
        beta_start = 0.4
        self.beta_end = 1.0
        self.beta = beta_start
        beta_increments = 200
        self.beta_increment = (self.beta_end - beta_start)/beta_increments

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        experience = self.experience(state, action, reward, next_state, done)
        p = self.memory.max_p()
        if p == 0:
            p = 1.0
        self.memory.add(p=p, data=experience)

    def sample(self, n):
        """Randomly sample a batch of experiences from memory."""
        experiences = []
        indices = []
        priorities = []
        segment = self.memory.total_p() / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, experience) = self.memory.get(s)
            experiences.append(experience)
            indices.append(idx)
            priorities.append(p)
        priorities = np.array(priorities, dtype=np.float64)
        indices = np.array(indices, dtype=np.int32)

        # print(f"priorities: {priorities}")
        probs = priorities / self.memory.total_p()
        # print(f"probs: {probs}")
        # importance-sampling (IS) weights
        w_is = (self.memory.capacity * probs) ** (-self.beta)
        # print(f"w_IS: {w_IS}")
        w_is_normalized = w_is/w_is.max()
        # print(f"w_IS_normalized: {w_IS_normalized}")
        # w_is_normalized = torch.from_numpy(w_is_normalized).float().to(self.device)
        
        return experiences, indices, w_is_normalized

    def update_errors(self, indices, errors):
        priorities = [self._to_priority(e) for e in errors]
        for (idx, p) in zip(indices, priorities):
            self.memory.update(idx, p)

    def _to_priority(self, error):
        return (error + self.epsilon) ** self.alpha
    
    def increase_beta(self):
        if self.beta < self.beta_end:
            self.beta = min(self.beta_end, self.beta + self.beta_increment)

    def __len__(self):
        return len(self.memory)
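
The comments in __init__ above note that alpha controls how much prioritisation is used, with alpha = 0 reducing to the uniform case. A quick numeric check of the (error + epsilon)**alpha transform used by _to_priority (toy errors, with an explicit abs() added for safety, which _to_priority itself omits):

import numpy as np

errors = np.array([0.0, 0.5, 2.0])
epsilon = 0.01

for alpha in (0.0, 0.6, 1.0):
    priorities = (np.abs(errors) + epsilon) ** alpha
    print(alpha, priorities / priorities.sum())      # alpha = 0 yields equal sampling probabilities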
Example #16
class PrioritizedExperienceReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    alpha = 0.6
    beta = 0.4
    beta_increment_per_sample = 0.001
    epsilon = 1e-6

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def compute_priority(self, td_error):
        return (td_error + self.epsilon) ** self.alpha

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        experience = self.experience(state, action, reward, next_state, done)
        max_priority = np.max(self.memory.tree[-self.memory.capacity:])
        if max_priority == 0:
            max_priority = 1.

        self.memory.add(max_priority, experience)

    def update(self, index, td_error):
        priority = self.compute_priority(td_error)
        self.memory.update(index, priority)

    def sample(self):
        """

        :return: importance weights, indices of sampled experiences, and sampled batch of experiences
        """
        self.beta = np.minimum(1., self.beta + self.beta_increment_per_sample)
        segment = self.memory.total() / self.batch_size
        indexes = []
        priorities = []
        experiences = []

        for i in range(self.batch_size):
            # pick a segment
            a = segment * i
            b = segment * (i + 1)
            s = np.random.uniform(a, b)

            index, priority, experience = self.memory.get(s)
            indexes.append(index)
            priorities.append(priority)
            experiences.append(experience)

        sampling_probs = np.divide(priorities, self.memory.total())
        # importance sampling
        i_s_weights = (self.batch_size * sampling_probs) ** -self.beta
        i_s_weights /= np.max(i_s_weights)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(
            device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(
            device)

        return i_s_weights, indexes, (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return self.memory.count
Example #17
class ReplayMemory:
    def __init__(self, memory_size):
        self.memory_size = memory_size
        self.memory = SumTree(memory_size)
        self.epsilon = 0.0001  # small amount to avoid zero priority
        self.alpha = 0.6  # adj_pri = pri^alpha
        self.beta = 0.4  # importance-sampling, from initial value increasing to 1
        self.beta_max = 1
        self.beta_increment_per_sampling = 0.001
        self.abs_err_upper = 1.  # clipped td error

    def add(self, row):
        max_p = np.max(
            self.memory.tree[-self.memory.capacity:])  # max adj_pri of leaves
        if max_p == 0:
            max_p = self.abs_err_upper
        self.memory.add(max_p, row)  # set the max adj_pri for new adj_pri

    def get_batch(self, batch_size):
        leaf_idx = np.empty(batch_size, dtype=np.int32)
        batch_memory = np.empty(batch_size, dtype=object)
        ISWeights = np.empty(batch_size)
        pri_seg = self.memory.total_p / batch_size  # adj_pri segment
        self.beta = np.min(
            [self.beta_max,
             self.beta + self.beta_increment_per_sampling])  # max = 1

        # Pi = Prob(i) = adj_pri(i) / ∑_i(adj_pri(i))
        # ISWeight = (N*Pj)^(-beta) / max_i[(N*Pi)^(-beta)] = (Pj / min_i[Pi])^(-beta); see the numeric check after this class.
        min_prob = np.min(
            self.memory.tree[self.memory.capacity - 1:self.memory.capacity -
                             1 + self.memory.counter]) / self.memory.total_p
        for i in range(batch_size):
            # sample from each interval
            a, b = pri_seg * i, pri_seg * (i + 1)  # interval
            v = np.random.uniform(a, b)
            idx, p, data = self.memory.get_leaf(v)
            prob = p / self.memory.total_p
            ISWeights[i] = np.power(prob / min_prob, -self.beta)
            leaf_idx[i], batch_memory[i] = idx, data
        return leaf_idx, batch_memory, ISWeights

    def update_sum_tree(self, tree_idx, td_errors):
        for ti, td_error in zip(tree_idx, td_errors):
            p = self._calculate_priority(td_error)
            self.memory.update(ti, p)

    def _calculate_priority(self, td_error):
        priority = abs(td_error) + self.epsilon
        clipped_pri = np.minimum(priority, self.abs_err_upper)
        return np.power(clipped_pri, self.alpha)

    @property
    def length(self):
        return self.memory.counter

    def load_memory(self, memory):
        self.memory = memory

    def get_memory(self):
        return self.memory
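
The identity quoted in the comments inside get_batch, (N*Pj)^(-beta) / max_i[(N*Pi)^(-beta)] = (Pj / min_i[Pi])^(-beta), holds because the largest weight belongs to the smallest probability and the factors of N cancel. A quick numeric check with arbitrary probabilities:

import numpy as np

probs = np.array([0.1, 0.25, 0.4, 0.25])   # P_i, summing to 1
beta, N = 0.4, len(probs)

lhs = (N * probs) ** (-beta) / ((N * probs) ** (-beta)).max()
rhs = (probs / probs.min()) ** (-beta)
print(np.allclose(lhs, rhs))               # True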
Example #18
class Memory(object):
    def __init__(self,
                 capacity,
                 state_size=37,
                 epsilon=0.001,
                 alpha=0.4,
                 beta=0.3,
                 beta_increment_per_sampling=0.001,
                 abs_err_upper=1):
        self.tree = SumTree(capacity)
        self.epsilon = epsilon  # avoid zero priority, which would leave a transition with no chance of being selected
        self.alpha = alpha  # trade priority against randomness: alpha = 0 is pure uniform sampling, alpha = 1 is pure priority
        self.beta = beta  # importance-sampling exponent, annealed from a small value towards 1 late in training
        self.beta_increment_per_sampling = beta_increment_per_sampling
        self.abs_err_upper = abs_err_upper  # clipped abs error
        self.state_size = state_size

    # Save experience in memory
    def store(self, state, action, reward, next_state, done):
        transition = [state, action, reward, next_state, done]
        max_p = np.max(self.tree.tree[-self.tree.capacity:])

        # In case of no priority, we set abs error to 1
        if max_p == 0:
            max_p = self.abs_err_upper
        self.tree.add(max_p, transition)  # set the max p for new p

    # Sample n amount of experiences using prioritized experience replay
    def sample(self, n):
        b_idx = np.empty((n, ), dtype=np.int32)
        states = np.empty((n, self.state_size))
        actions = np.empty((n, ))
        rewards = np.empty((n, ))
        next_states = np.empty((n, self.state_size))
        dones = np.empty((n, ))
        ISWeights = np.empty((n, ))  # IS -> Importance Sampling

        pri_seg = self.tree.total_p / n  # priority segment
        self.beta = np.min([
            1., self.beta + self.beta_increment_per_sampling
        ])  # Increase the importance of the sampling for ISWeights

        # min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p  # for later calculate ISweight

        for i in range(n):
            a, b = pri_seg * i, pri_seg * (i + 1)
            v = np.random.uniform(a, b)
            idx, p, data = self.tree.get_leaf(v)
            prob = p / self.tree.total_p
            ISWeights[i] = np.power(prob, -self.beta)
            b_idx[i] = idx
            states[i, :] = data[0]
            actions[i] = data[1]
            rewards[i] = data[2]
            next_states[i, :] = data[3]
            dones[i] = data[4]

        states = torch.from_numpy(np.vstack(states)).float().to(device)
        actions = torch.from_numpy(np.vstack(actions)).long().to(device)
        rewards = torch.from_numpy(np.vstack(rewards)).float().to(device)
        next_states = torch.from_numpy(
            np.vstack(next_states)).float().to(device)
        dones = torch.from_numpy(np.vstack(dones).astype(
            np.uint8)).float().to(device)
        ISWeights = torch.from_numpy(np.vstack(ISWeights)).float().to(device)

        return b_idx, states, actions, rewards, next_states, dones, ISWeights

    # Update the priorities according to the new errors
    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.epsilon  # add epsilon so no priority is exactly zero
        clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

    def __len__(self):
        return self.tree.length()
Example #19
class PrioritizeReplayBuffer(ReplayBuffer):
    """Prioritize experience replay."""
    def __init__(
        self,
        buffer_size,
        batch_size,
        seed,
        beta_start=0.4,
        delta_beta=1e-5,
        alpha=0.6,
        eps=1e-8,
    ):
        """Initialize PER.

        Args:
            buffer_size (int): Size of replay buffer. The actual size will be the
                first power of 2 greater than buffer_size.
            batch_size (int): Size of batches to draw.
            seed (float): Seed.
            beta_start (float): Initial value for beta (importance sampling exponent)
            delta_beta (float): Beta increment at each time step.
            alpha (float): Priority exponent.
            eps (float): Small positive number to avoid unsampling 0 prioritized examples.
        """
        # Depth of sum tree
        depth = int(math.log2(buffer_size)) + 1
        super(PrioritizeReplayBuffer, self).__init__(2**depth, batch_size,
                                                     seed)

        # Initialize sum tree to keep track of the sum of priorities
        self.priorities = SumTree(depth)

        # Current max priority
        self.max_p = 1.0

        # PER Parameters
        self.alpha = alpha
        self.eps = eps
        self.beta = beta_start
        self.delta_beta = delta_beta

    def add(self, state, action, reward, next_state, done):
        """Add transition inside the Replay buffer."""
        # Add in the sum tree with current max priority
        self.priorities.add(self.max_p, self.index)
        super().add(state, action, reward, next_state, done)

    def sample(self):
        """Get sample."""
        # Get indices to sample from sum tree
        # Store these indices to compute importance sampling later
        self.last_indices = self.priorities.sample(self.batch_size)

        # Return transitions corresponding to this indices
        return [self.data[i] for i in self.last_indices]

    def update_priorities(self, td_error):
        """Update priorities."""
        # Compute new priorities
        new_priorities = (abs(td_error) + self.eps)**self.alpha

        # Update sum tree
        self.priorities.update(self.last_indices, new_priorities)

        # Update the current max priority
        self.max_p = max(self.max_p, max(new_priorities))

    def importance_sampling(self):
        """Compute importance sampling weights of last sample."""
        # Get probabilities
        probs = self.priorities.get(
            self.last_indices) / self.priorities.total_sum

        # Compute weights
        weights = (len(self) * probs)**(-self.beta)
        weights /= max(weights)

        # Update beta
        self.beta = min(self.beta + self.delta_beta, 1)

        # Return weights
        return weights
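
As the docstring notes, the buffer capacity is rounded up to a power of two so the sum tree is full. With the depth computation used in __init__ and a made-up buffer_size:

import math

buffer_size = 1000
depth = int(math.log2(buffer_size)) + 1   # 10
actual_size = 2 ** depth                  # 1024, the first power of two greater than buffer_size
print(depth, actual_size)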
Example #20
class PrioritisedReplayBuffer():
    """A prioritised replay buffer.

    Creates a sum tree and uses it to stores a fixed number of experience tuples. When sampled
    experiences are returned with greater priority given to those with the highest absolute TD-error.
    """
    def __init__(self,
                 buffer_size,
                 alpha,
                 beta_zero,
                 beta_increment_size=0.001,
                 epsilon=0.1,
                 max_priority=1.,
                 seed=None):
        """Priority replay buffer initialiser.

        Args:
            buffer_size (int): capacity of the replay buffer.
            alpha (float): priority scaling hyperparameter.
            beta_zero (float): importance sampling scaling hyperparameter.
            beta_increment_size (float): beta annealing rate.
            epsilon (float): base priority to ensure non-zero sampling probability.
            max_priority (float): initial maximum priority.
            seed (int): seed for random number generator
       """
        random.seed(seed)

        self.sum_tree = SumTree(buffer_size)
        self.memory = {}
        self.experience = namedtuple(
            "experience", ["state", "action", "reward", "next_state", "done"])
        self.buffer_size = buffer_size
        self.beta_increment_size = beta_increment_size
        self.max_priority = max_priority**alpha
        self.min_priority = max_priority**alpha
        self.last_min_update = 0

        self.alpha = alpha
        self.beta = beta_zero
        self.epsilon = epsilon

    def add(self, state, action, reward, next_state, done):
        """Creates experience tuple and adds it to the replay buffer."""
        experience = self.experience(state, action, reward, next_state, done)
        current_tree_idx = self.sum_tree.input_pointer
        self.memory[current_tree_idx] = experience
        self.sum_tree.add(self.max_priority)

    def sample(self, batch_size):
        """Returns a batch of experiences sampled according to their priority."""
        idx_list = []
        weights = []
        states = []
        actions = []
        rewards = []
        next_states = []
        done_list = []

        segment = self.sum_tree.total() / batch_size
        sample_list = [
            random.uniform(segment * i, segment * (i + 1))
            for i in range(batch_size)
        ]
        max_weight = self.min_priority**(-self.beta)

        for s in sample_list:
            idx, priority = self.sum_tree.sample(s)
            idx_list.append(idx)
            weight = priority**(-self.beta) / max_weight
            weights.append(weight)

            sample = self.memory[idx]
            state, action, reward, next_state, done = sample
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            done_list.append(done)

        return states, actions, rewards, next_states, done_list, idx_list, weights

    def update(self, idx_list, td_error):
        """Updates a specifics experience's priority."""
        priority_list = (td_error + self.epsilon)**self.alpha

        self.max_priority = max(self.max_priority, priority_list.max())
        list_min_priority = priority_list.min()

        if list_min_priority <= self.min_priority:
            self.min_priority = list_min_priority
            self.last_min_update = 0
        else:
            self.last_min_update += 1

        if self.last_min_update >= self.buffer_size:
            self.min_priority = np.array([
                node.val
                for node in self.sum_tree.tree_array[-self.buffer_size:]
            ]).min()
            self.last_min_update = 0

        for i, idx in enumerate(idx_list):
            priority = min(self.max_priority, priority_list[i])
            self.sum_tree.update(idx, priority)

        self.beta = min(1, self.beta + self.beta_increment_size)

    def __len__(self, ):
        """Return number of experiences in the replay buffer."""
        return len(self.memory)