Esempio n. 1
0
class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer that draws experiences in proportion to their priority.

    Priorities are kept in a ``SumTree``; a freshly added experience receives
    the largest priority recorded so far.
    """

    def __init__(self, buffer_size, alpha):
        """Set up the tree and the priority bookkeeping.

        Args:
            buffer_size: Capacity handed to the underlying ``SumTree``.
            alpha: Exponent applied to the shifted TD error in ``update``.
        """
        self.capacity = buffer_size
        self.tree = SumTree(buffer_size)
        self.alpha = alpha
        self.max_priority = 1
        # NOTE: no importance-sampling (beta) schedule is implemented here.

    def add(self, experience):
        """Store ``experience`` with the current maximum priority."""
        self.tree.add(self.max_priority, experience)

    def update(self, index, experience, td_error):
        """Re-prioritise the entry at ``index`` from its new TD error.

        ``experience`` is accepted but not used.
        """
        new_priority = (abs(td_error) + 0.0001) ** self.alpha
        self.tree.update(index, new_priority)
        # Track the running maximum so later additions start at the top.
        if new_priority > self.max_priority:
            self.max_priority = new_priority

    def sample(self, batch_size):
        """Draw one experience from each of ``batch_size`` equal slices.

        Returns:
            A ``(indexes, batchs)`` tuple of tree indexes and experiences.
        """
        indexes, batchs = [], []
        stride = self.tree.total() / batch_size
        for slot in range(batch_size):
            point = stride * slot + np.random.random() * stride
            leaf_index, _, item = self.tree.get(point)
            indexes.append(leaf_index)  # needed later to update the priority
            batchs.append(item)
        return (indexes, batchs)
Esempio n. 2
0
class Memory:
    """Proportional prioritized replay memory backed by a ``SumTree``."""

    e = 0.01  # added to every error so that no priority is exactly zero
    a = 0.6  # exponent applied to the shifted error

    def __init__(self, capacity):
        """Create an empty memory whose tree is built with ``capacity``."""
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _getPriority(self, error):
        """Return ``(|error| + e) ** a``.

        The magnitude is used because a negative base raised to a fractional
        power evaluates to a complex number in Python 3.
        """
        return (abs(error) + self.e) ** self.a

    def add(self, error, sample):
        """Store ``sample`` with the priority derived from ``error``."""
        p = self._getPriority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        """Draw ``n`` entries, one from each equal slice of the total priority.

        Returns:
            A list of ``(tree_index, data)`` tuples.
        """
        batch = []
        segment = self.tree.total() / n

        for i in range(n):
            low = segment * i
            high = segment * (i + 1)

            s = random.uniform(low, high)
            idx, _, data = self.tree.get(s)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        p = self._getPriority(error)
        self.tree.update(idx, p)
Esempio n. 3
0
class PrioritisedMemory(object):
    """Proportional prioritised replay memory with importance-sampling weights.

    Transitions and their priorities are held in a ``SumTree``; ``beta`` is
    increased a little every time a transition is stored.
    """

    def __init__(self, alpha, beta, beta_end, epsilon, num_steps, replay_size):
        """Store the hyper-parameters and build the tree.

        Args:
            alpha: Exponent applied to the shifted TD error.
            beta: Initial importance-sampling exponent.
            beta_end: Value used with ``beta`` and ``num_steps`` to size the
                per-insert beta increment.
            epsilon: Constant added to ``|td_error|`` before exponentiation.
            num_steps: Number of inserts over which beta moves from ``beta``
                to ``beta_end``.
            replay_size: Capacity handed to the underlying ``SumTree``.
        """
        self.alpha = alpha
        self.beta_start = beta
        self.beta_end = beta_end
        self.beta = beta
        self.epsilon = epsilon
        self.num_steps = num_steps

        self.memory = SumTree(replay_size)
        self.replay_size = replay_size

    def proprotional_priority(self, td_error):
        """Return ``(|td_error| + epsilon) ** alpha``."""
        return (np.abs(td_error) + self.epsilon) ** self.alpha

    def add_memory(self, td_error, data):
        """Store ``data`` with its priority and advance ``beta`` one step."""
        priority = self.proprotional_priority(td_error)
        self.memory.add_memory(data, priority)

        beta_step = (self.beta_end - self.beta_start) / self.num_steps
        # Capped at 1.0 regardless of beta_end.
        self.beta = np.min([1.0, self.beta + beta_step])

    def update_priority(self, index, td_error):
        """Recompute and store the priority of the entry at ``index``."""
        new_priority = self.proprotional_priority(td_error)
        self.memory.update_priority(index, new_priority)

    def minibatch_sample(self, minibatch_size):
        """Draw one transition from each equal slice of the total priority.

        Returns:
            ``(priority_indexes, samples, importance_weights)`` where the
            weights are a NumPy array scaled so that the largest is 1.
        """
        samples = []
        priorities = []
        priority_indexes = []

        total = self.memory.priority_total()
        interval = total / minibatch_size

        for i in range(minibatch_size):
            sample = np.random.uniform(i * interval, (i + 1) * interval)
            priority_index, priority, data = self.memory.get(sample)

            samples.append(data)
            priorities.append(priority)
            priority_indexes.append(priority_index)

        # The priorities are collected in a list; convert before dividing,
        # since a list cannot be divided by a float.
        sampling_probabilities = np.asarray(priorities) / total
        importance_weights = np.power(
            self.memory.replay_size * sampling_probabilities, -self.beta)
        importance_weights /= np.max(importance_weights)

        return priority_indexes, samples, importance_weights
Esempio n. 4
0
class PriorityMemory:

    def __init__(self, capacity):
        """
        Instantiate a priority based memory capable of holding
        capacity experiences. Memories are sampled with frequency
        based on their priority.
        """
        # Tree with priorities as node values.
        self.tree = SumTree(capacity)
        self.e = 0.01  # Small constant to ensure all priorities > 0
        self.a = 0.6  # Constant to control the weight of error on priority

    def _getPriority(self, error):
        """
        Convert error to a priority based on the constants "e" and "a".

        The magnitude of the error is used: a negative base raised to a
        fractional power would evaluate to a complex number.
        """
        return (abs(error) + self.e) ** self.a

    def add(self, experience, error):
        """
        Add an experience to memory
        """
        p = self._getPriority(error)
        self.tree.add(p, experience)

    def sample(self, n):
        """
        Sample n experiences from memory. Experiences selection
        frequency is based on priority.

        Returns:
            - mini_batch: Sequence containing the experiences.
            - indices: The index of the node associated with each experience
              so that its priority can be updated.
        """
        mini_batch = []
        indicies = []

        # One draw per equal-width slice of the total priority.
        segment = self.tree.total() / n

        for i in range(n):
            low = segment * i
            high = segment * (i + 1)

            s = random.uniform(low, high)
            idx, _, experience = self.tree.get(s)
            mini_batch.append(experience)
            indicies.append(idx)

        return mini_batch, indicies

    def update(self, idx, error):
        """
        Update the priority associated with a memory.
        """
        p = self._getPriority(error)
        self.tree.update(idx, p)
Esempio n. 5
0
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    """Proportional prioritized replay memory with importance weights."""

    def __init__(self,
                 capacity,
                 alpha=0.6,
                 beta=0.4,
                 beta_anneal_step=0.001,
                 epsilon=0.00000001):
        """Build the tree and store the hyper-parameters.

        Args:
            capacity: Capacity handed to the underlying ``SumTree``.
            alpha: Exponent applied to the shifted error.
            beta: Initial importance-sampling exponent.
            beta_anneal_step: Amount added to ``beta`` by each ``step()``.
            epsilon: Constant added to ``|error|`` before exponentiation.
        """
        # NOTE: a power-of-two rounding loop used to sit here; it referenced
        # an undefined name and its result was never used, so it is gone and
        # the tree is built with ``capacity`` exactly as before.
        self.tree = SumTree(capacity)
        self.capacity = capacity
        self.a = alpha
        self.beta = beta
        self.beta_increment_per_sampling = beta_anneal_step
        self.e = epsilon

    def _get_priority(self, error):
        # Direct proportional prioritization
        return (np.abs(error) + self.e)**self.a

    def add(self, error, sample):
        """Store ``sample`` with the priority derived from ``error``."""
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        """Draw ``n`` entries, one per equal slice of the total priority.

        Returns:
            ``(batch, idxs, is_weight)``; ``is_weight`` is a NumPy array
            scaled so that its largest element is 1.
        """
        batch = []
        idxs = []
        priorities = []
        total = self.tree.total()
        segment = total / n

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            data = 0

            # Redraw within the slice while the tree hands back the value 0.
            while data == 0:
                s = random.uniform(a, b)
                (idx, p, data) = self.tree.get(s)

            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # Convert before dividing: a list cannot be divided by a float.
        sampling_probabilities = np.asarray(priorities) / total
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def step(self):
        """Advance ``beta`` by one increment, capped at ``1 - e``."""
        self.beta = np.min(
            [1. - self.e, self.beta + self.beta_increment_per_sampling])

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        p = self._get_priority(error)
        self.tree.update(idx, p)
Esempio n. 6
0
class MemoryDB:  # stored as ( s, a, r, s_ ) in SumTree
    """Prioritized replay store with priorities clipped at ``max_priority``."""

    def __init__(self, e, a, beta, beta_increment_per_sampling, capacity,
                 max_priority):
        """Store the hyper-parameters and build the tree.

        Args:
            e: Constant added to ``|error|`` before exponentiation.
            a: Exponent applied to the shifted error.
            beta: Initial importance-sampling exponent.
            beta_increment_per_sampling: Added to ``beta`` on every
                ``sample()`` call (capped at 1).
            capacity: Capacity handed to the underlying ``SumTree``.
            max_priority: Upper bound for priorities, also used for entries
                added without an error.
        """
        self.capacity = capacity
        self.e = e
        self.a = a
        self.beta = beta
        self.beta_increment_per_sampling = beta_increment_per_sampling
        self.max_priority = max_priority
        self.sum_tree = SumTree(self.capacity)

    def _get_priority(self, error):
        """Return ``min(max_priority, (|error| + e) ** a)``."""
        # abs(): a negative base with a fractional exponent is complex.
        return min((self.max_priority, (abs(error) + self.e)**self.a))

    def add(self, experience, error=None):
        """Store ``experience``; without ``error`` it gets ``max_priority``."""
        p = self._get_priority(error) if error is not None else self.max_priority
        self.sum_tree.add(p, experience)

    def add_batch(self, experiences):
        """Store every experience with ``max_priority``."""
        for experience in experiences:
            # Go through the no-error path: ``max_priority`` is a priority,
            # not an error to be pushed through ``_get_priority``.
            self.add(experience)

    def update(self, index, error, experience):
        """Re-prioritise the entry at ``index``; ``experience`` is unused."""
        p = self._get_priority(error)
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        """Apply ``update`` to each ``(index, error, experience)`` triple."""
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        """Return the tree's ``getCount()``."""
        return self.sum_tree.getCount()

    def sample(self, n):
        """Draw ``n`` entries, one per equal slice of the total priority.

        Also advances ``beta`` by one increment (capped at 1).

        Returns:
            ``(batch, idxs, is_weight)``; ``is_weight`` is a NumPy array
            scaled so that its largest element is 1.
        """
        batch = []
        idxs = []
        priorities = []
        total = self.sum_tree.total()
        segment = total / n

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # Convert before dividing: a list cannot be divided by a float.
        sampling_probabilities = np.asarray(priorities) / total
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight
Esempio n. 7
0
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    """Proportional prioritized replay memory with importance weights."""

    e = 1e-10  # added to every error so that no priority is exactly zero
    a = 0.5  # exponent applied to the shifted error
    beta = 0.4  # initial importance-sampling exponent
    beta_increment_per_sampling = 0.001  # added to beta on every sample()

    def __init__(self, capacity):
        """Create an empty memory whose tree is built with ``capacity``."""
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        """Return ``(|error| + e) ** a``."""
        # abs(): a negative base with a fractional exponent is complex.
        return (abs(error) + self.e)**self.a

    def append(self, data):
        """Store ``sample`` from an ``(error, sample)`` pair."""
        error, sample = data
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        """Draw ``n`` entries, one per equal slice of the total priority.

        Also advances ``beta`` by one increment (capped at 1).

        Returns:
            ``(batch, idxs, is_weight)``; ``is_weight`` is a NumPy array
            scaled so that its largest element is 1.
        """
        batch = []
        idxs = []
        priorities = []
        total = self.tree.total()
        segment = total / n

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # Convert before dividing: a list cannot be divided by a float.
        # The small constant guards against a zero total.
        sampling_probabilities = np.asarray(priorities) / (total + 1e-10)
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def __len__(self):
        """Return the tree's ``n_entries``."""
        return self.tree.n_entries
Esempio n. 8
0
class PrioritizedReplayMemory:
    """Proportional prioritized replay memory backed by a ``SumTree``."""

    def __init__(self, capacity, alpha=0.6, eps=1e-2):
        """Build the tree and store the hyper-parameters.

        Args:
            capacity: Capacity handed to the underlying ``SumTree``.
            alpha: Determines how much prioritization is used.
            eps: Smooths the priority: ``(|TD_error| + eps) ** alpha``.
        """
        self.tree = SumTree(capacity)
        self.alpha = alpha
        self.eps = eps

    def _get_priority(self, td_error):
        """Return ``(|td_error| + eps) ** alpha``."""
        # abs(): a negative base with a fractional exponent is complex.
        return (abs(td_error) + self.eps) ** self.alpha

    def current_length(self):
        """Return the tree's ``current_length()``."""
        return self.tree.current_length()

    def total_sum(self):
        """Return the tree's ``total_sum()``."""
        return self.tree.total_sum()

    def push(self, event, td_error):
        """Insert ``event`` with the priority derived from ``td_error``."""
        priority = self._get_priority(td_error)
        self.tree.insert(event, priority)

    def sample(self, batch_sz):
        """Draw ``batch_sz`` events, one per equal slice of the total priority.

        Returns:
            ``(samples, indices, priorities)``. ``samples`` is a list with one
            NumPy array per event field (the events transposed), so it can be
            unpacked, measured and iterated more than once.
        """
        batch = []
        indices = []
        priorities = []
        segment = self.tree.total_sum() / batch_sz

        for i in range(batch_sz):
            low = segment * i
            high = segment * (i + 1)

            s = random.uniform(low, high)
            (idx, priority, data) = self.tree.get(s)
            batch.append(data)
            indices.append(idx)
            priorities.append(priority)

        # Materialised on purpose: a bare ``map`` is a one-shot iterator.
        samples = [np.array(column) for column in zip(*batch)]

        return samples, indices, priorities

    def update(self, idx, td_error):
        """Re-prioritise one entry, or several when ``idx`` is a list.

        With a list, ``td_error`` must be indexable in parallel with ``idx``.
        """
        if isinstance(idx, list):
            for i, leaf_idx in enumerate(idx):
                self.tree.update(leaf_idx, self._get_priority(td_error[i]))
        else:
            self.tree.update(idx, self._get_priority(td_error))
Esempio n. 9
0
class PERMemory:
    """Prioritized replay memory; new transitions enter at the tree's max."""

    EPSILON = 0.0001  # added to |td_error| so that no priority is zero
    ALPHA = 0.5  # exponent applied to the shifted error
    BETA = 0.4  # importance-sampling exponent at episode 0
    size = 0  # number of stored transitions, never more than capacity

    def __init__(self, config, capacity):
        """Build the tree.

        Args:
            config: Object whose ``num_episodes`` drives the beta schedule.
            capacity: Capacity handed to the underlying ``SumTree``.
        """
        self.config = config
        self.capacity = capacity
        self.tree = SumTree(capacity)

    def _getPriority(self, td_error):
        """Return ``(|td_error| + EPSILON) ** ALPHA``."""
        # abs(): a negative base with a fractional exponent is complex.
        return (abs(td_error) + self.EPSILON) ** self.ALPHA

    def push(self, transition):
        """Store ``transition`` with the tree's current maximum priority."""
        # Saturate at capacity so that len() never exceeds it.
        self.size = min(self.capacity, self.size + 1)

        priority = self.tree.max()
        if priority <= 0:
            priority = 1

        self.tree.add(priority, transition)

    def sample(self, size, episode):
        """Draw ``size`` transitions with probability proportional to priority.

        Beta moves linearly from ``BETA`` towards 1 as ``episode`` approaches
        ``config.num_episodes`` and is capped at 1.

        Returns:
            ``(indexes, transitions, weights)``; ``weights`` is a float32
            array scaled so that its largest element is 1.
        """
        batch = []
        indexes = []
        weights = np.empty(size, dtype='float32')
        total = self.tree.total()
        beta = self.BETA + (1 - self.BETA) * episode / self.config.num_episodes
        beta = min(1.0, beta)

        for i, rand in enumerate(np.random.uniform(0, total, size)):
            (idx, priority, data) = self.tree.get(rand)
            batch.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total) ** (-beta)

        return (indexes, batch, weights / weights.max())

    def update(self, idx, td_error):
        """Replace the priority stored at tree index ``idx``."""
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)

    def __len__(self):
        """Return the number of stored transitions."""
        return self.size
Esempio n. 10
0
class PrioritizeReplayBuffer(ReplayBuffer):
    """Prioritized replay buffer; new transitions enter at the tree's max."""

    # Based on https://github.com/y-kamiya/machine-learning-samples/blob/7b6792ce37cc69051e9053afeddc6d485ad34e79/python3/reinforcement/dqn/agent.py
    EPSILON = 0.0001  # added to |td_error| so that no priority is zero
    ALPHA = 0.6  # exponent applied to the shifted error
    BETA = 0.4  # importance-sampling exponent when ``episode`` is 0
    size = 0  # number of stored transitions, never more than capacity

    def __init__(self, capacity):
        """Initialise the base buffer and build the tree with ``capacity``."""
        super().__init__(capacity=capacity)
        self.td_error_epsilon = 0.0001
        self.tree = SumTree(capacity)

    def __len__(self):
        """Return the number of stored transitions."""
        return self.size

    def _getPriority(self, td_error):
        """Return ``(|td_error| + EPSILON) ** ALPHA``."""
        # abs(): a negative base with a fractional exponent is complex.
        return (abs(td_error) + self.EPSILON)**self.ALPHA

    def push(self, state, action, done, next_state, reward, p_index):
        """Store one transition with the tree's current maximum priority."""
        # Saturate at capacity so that len() never exceeds it.
        # presumably ``self.capacity`` is set by the base class; sample()
        # reads it as well -- TODO confirm.
        self.size = min(self.capacity, self.size + 1)
        transition = self.Transition(state, action, done, next_state, reward,
                                     p_index)
        priority = self.tree.max()
        if priority <= 0:
            priority = 1
        self.tree.add(priority, transition)

    def sample(self, batch_size, episode):
        """Draw ``batch_size`` transitions proportionally to priority.

        Args:
            batch_size: Number of transitions to draw.
            episode: Progress value scaling beta from ``BETA`` towards 1;
                the result is capped at 1.

        Returns:
            ``(indexes, transitions, weights)``; ``weights`` is a float32
            array scaled so that its largest element is 1.
        """
        batch = []
        indexes = []
        weights = np.empty(batch_size, dtype='float32')
        total = self.tree.total()
        # Cap at 1: ``episode`` is no longer divided by an episode count
        # here, so an unnormalised value would push beta past 1.
        beta = min(1.0, self.BETA + (1 - self.BETA) * episode)

        for i, rand in enumerate(np.random.uniform(0, total, batch_size)):
            (idx, priority, data) = self.tree.get(rand)
            batch.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total)**(-beta)
        return (indexes, batch, weights / weights.max())

    def update(self, idx, td_error):
        """Replace the priority stored at tree index ``idx``."""
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)
Esempio n. 11
0
class PrioritizedMemory:
    """Proportional prioritized replay memory with importance weights."""

    e = 0.01  # added to |error| so that no priority is exactly zero
    a = 0.6  # exponent applied to the shifted error
    beta = 0.4  # initial importance-sampling exponent
    beta_increment_per_sampling = 0.001  # added to beta on every sample()

    def __init__(self, capacity):
        """Create an empty memory whose tree is built with ``capacity``."""
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        """Return ``(|error| + e) ** a``."""
        return (np.abs(error) + self.e)**self.a

    def push(self, error, sample):
        """Store ``sample`` with the priority derived from ``error``."""
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        """Draw ``n`` entries, one per equal slice of the total priority.

        Also advances ``beta`` by one increment (capped at 1).

        Returns:
            ``(batch, idxs, is_weight)``; ``is_weight`` is a NumPy array
            scaled so that its largest element is 1.
        """
        batch = []
        idxs = []
        priorities = []
        total = self.tree.total()
        segment = total / n

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # Convert before dividing: a list cannot be divided by a float.
        sampling_probabilities = np.asarray(priorities) / total
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        p = self._get_priority(error)
        self.tree.update(idx, p)
Esempio n. 12
0
class Memory(object):
    """Proportional prioritized replay memory backed by an ``ST`` tree."""

    e = 0.05  # added to |error| so that no priority is exactly zero

    def __init__(self, capacity, pr_scale):
        """Build the tree.

        Args:
            capacity: Capacity handed to the underlying ``ST`` tree.
            pr_scale: Exponent applied to the shifted error.
        """
        self.capacity = capacity
        self.memory = ST(self.capacity)
        self.pr_scale = pr_scale
        self.max_pr = 0

    def get_priority(self, error):
        """Return ``(|error| + e) ** pr_scale``."""
        # abs(): a negative base with a fractional exponent is complex.
        return (abs(error) + self.e)**self.pr_scale

    def remember(self, sample, error):
        """Store ``sample`` with the larger of its priority and ``max_pr``."""
        p = self.get_priority(error)

        # NOTE(review): ``max_pr`` is never raised anywhere in this class, so
        # with its initial value of 0 this always stores ``p``. If a running
        # maximum was intended it must be assigned here -- confirm first.
        self.memory.add(max(self.max_pr, p), sample)

    def sample(self, n):
        """Draw ``n`` entries, one per equal slice of the total priority.

        Returns:
            ``[sample_batch, indices, priorities]`` where ``sample_batch``
            holds ``(index, data)`` tuples.
        """
        sample_batch = []
        sample_batch_indices = []
        sample_batch_priorities = []
        segment_width = self.memory.total() / n

        for i in range(n):
            left = segment_width * i
            right = segment_width * (i + 1)

            s = random.uniform(left, right)
            idx, pr, data = self.memory.get(s)
            sample_batch.append((idx, data))
            sample_batch_indices.append(idx)
            sample_batch_priorities.append(pr)

        return [sample_batch, sample_batch_indices, sample_batch_priorities]

    def update(self, batch_indices, errors):
        """Re-prioritise each index from the error at the same position."""
        for i, idx in enumerate(batch_indices):
            self.memory.update(idx, self.get_priority(errors[i]))
Esempio n. 13
0
class Replay_Memory:
    """Prioritized replay memory configured through module-level constants.

    Reads ``MEMORY_LEN``, ``MEMORY_BIAS``, ``MEMORY_POW`` and ``BATCH_SIZE``
    from module scope.
    """

    def __init__(self):
        self.tree = SumTree(MEMORY_LEN)

    def _priority(self, error):
        """Return ``(|error| + MEMORY_BIAS) ** MEMORY_POW``."""
        # abs(): a negative base with a fractional exponent is complex.
        return (abs(error) + MEMORY_BIAS)**MEMORY_POW

    def add(self, error, sample):
        """
         Adds one entry to the replay memory
        Args:
         error: the error the priority is derived from
         sample: the transition to store
        """
        self.tree.add(self._priority(error), sample)

    def sample(self):
        """
         Get a sample batch of the replay memory
        Returns:
         batch: a list of (idx, data) tuples with one sample from each
          segment of the memory
        """
        batch = []
        #we want one representative of all distribution-segments in the batch
        #e.g BATCH_SIZE=2: batch contains one sample from [min,median]
        #and from [median,max]
        segment = self.tree.total() / BATCH_SIZE
        for i in range(BATCH_SIZE):
            minimum = segment * i
            maximum = segment * (i + 1)
            s = random.uniform(minimum, maximum)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        """
         Updates one entry in the replay memory
        Args:
         idx: the position of the outdated transition in the memory
         error: the newly calculated error
        """
        self.tree.update(idx, self._priority(error))
Esempio n. 14
0
class ReplayMemory(object):
    """Prioritized replay memory that stores elements pickled and compressed."""

    def __init__(self, max_size, alpha, eps):
        """Build the tree and reset the bookkeeping.

        Args:
            max_size: Capacity handed to the ``SumTree``; also caps ``len()``.
            alpha: Exponent applied to ``loss + eps``.
            eps: Constant added to every loss before exponentiation.
        """
        self.tree = SumTree(max_size)
        self.max_size = max_size
        self.alpha = alpha
        self.eps = eps
        self.size = 0
        self.last_idxs = None

    def get_batch(self, batch_size):
        """Draw up to ``batch_size`` elements proportionally to priority.

        The tree indexes of the draw are remembered in ``last_idxs`` so that
        a following ``update`` call can re-prioritise them.
        """
        picked = self.last_idxs = []
        decoded = []
        draws = min(batch_size, self.size)

        for _ in range(draws):
            point = random.random() * self.tree.total()
            leaf_idx, _unused, blob = self.tree.get(point)
            decoded.append(pickle.loads(zlib.decompress(blob)))
            picked.append(leaf_idx)

        return decoded

    def update(self, losses):
        """Re-prioritise the last drawn batch from ``losses`` (same order)."""
        for pos, leaf_idx in enumerate(self.last_idxs):
            new_priority = math.pow(losses[pos] + self.eps, self.alpha)
            self.tree.update(leaf_idx, new_priority)

    def add_element(self, new_el, loss):
        """Store ``new_el`` (pickled + compressed) with its loss priority."""
        self.size = min(self.max_size, self.size + 1)

        blob = zlib.compress(pickle.dumps(new_el))
        self.tree.add(math.pow(loss + self.eps, self.alpha), blob)

    def __len__(self):
        """Return the number of stored elements, at most ``max_size``."""
        return self.size
Esempio n. 15
0
class PrioritisedReplayBuffer:
    """Prioritised replay buffer that returns batches as torch tensors."""

    def __init__(self, action_size, buffer_size, batch_size, alpha, epsilon):
        """Build the tree and store the hyper-parameters.

        Args:
            action_size: Stored as-is; not used by this class.
            buffer_size: Capacity handed to the underlying ``SumTree``.
            batch_size: Number of draws made by each ``sample()`` call.
            alpha: Exponent applied to the shifted error.
            epsilon: Constant added to ``|error|`` before exponentiation.
        """
        self.action_size = action_size
        self.tree = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.alpha = alpha
        self.epsilon = epsilon

    def add(self, error, state, action, reward, next_state, done):
        """Store one transition with the priority derived from ``error``."""
        e = self.experience(state, action, reward, next_state, done)
        p = self._get_priority(error)
        self.tree.add(p, e)

    def sample(self, beta):
        """Draw one experience per equal slice of the total priority.

        Draws for which the tree returns a falsy entry are dropped, so the
        batch can be shorter than ``batch_size``.

        Args:
            beta: Importance-sampling exponent.

        Returns:
            ``(states, actions, rewards, next_states, dones, weights, idxs)``;
            all but ``idxs`` are tensors moved to the module-level ``device``.
        """
        total = self.tree.total()
        # One draw per segment, so the draws are spread over the whole range.
        segment = total / self.batch_size

        experiences = []
        priorities = []
        idxs = []

        for i in range(self.batch_size):
            start = segment * i
            end = segment * (i + 1)
            s = random.uniform(start, end)
            idx, p, e = self.tree.get(s)
            if e:
                priorities.append(p)
                experiences.append(e)
                idxs.append(idx)

        # Convert before dividing: a list cannot be divided by a float.
        probs = np.asarray(priorities) / total  # big P
        weights = np.power(self.tree.n_entries * probs, -beta)
        weights /= weights.max()  # scale so max weight is 1

        states = torch.from_numpy(np.vstack([e.state for e in experiences
                                             ])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences
                                              ])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences
                                              ])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences
                       ]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(weights).float().to(device)

        return (states, actions, rewards, next_states, dones, weights, idxs)

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def _get_priority(self, error):
        """Return ``(|error| + epsilon) ** alpha``."""
        return (np.abs(error) + self.epsilon)**self.alpha

    def __len__(self):
        """Return the current size of internal memory."""
        return self.tree.n_entries
Esempio n. 16
0
class PriorityBuffer:
    # Inspired by implementation from: https://github.com/rlcode/per/blob/master/prioritized_memory.py

    def __init__(self, action_size, agent_config):
        """Initialize a PriorityBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            agent_config: configuration object; the attributes read are
                buffer_size: capacity handed to the SumTree
                batch_size: number of draws made by each sample() call
                buffer_epsilon: constant added to |error| in the priority
                alpha: priority exponent (0 == uniform, 1. == priority only)
                beta_start: initial importance-sampling exponent
                beta_end: final importance-sampling exponent
                beta_max_steps: number of sample() calls to go from
                    beta_start to beta_end
        """
        self.action_size = action_size
        self.tree = SumTree(capacity=agent_config.buffer_size)
        self.batch_size = agent_config.batch_size
        # self.seed = random.seed(buffer_config.seed)
        self.epsilon = agent_config.buffer_epsilon
        # how much randomness we require a = 0 (pure random) a = 1 (only priority)
        self.alpha = agent_config.alpha
        self.beta = agent_config.beta_start
        self.beta_start = agent_config.beta_start
        self.beta_end = agent_config.beta_end
        self.beta_increment_per_sampling = (self.beta_end - self.beta_start) / agent_config.beta_max_steps

    def add(self, sample, error):
        """Add a new experience to memory."""
        p = self._get_priority(error)
        state, action, reward, next_state, done = sample
        e = Experience(state, action, reward, next_state, done)
        self.tree.add(p, e)

    def _get_priority(self, error):
        """Return (|error| + epsilon) ** alpha."""
        return (abs(error) + self.epsilon) ** self.alpha

    def sample(self):
        """Draw one experience per equal slice of the total priority.

        Draws that do not yield an ``Experience`` are reported and dropped,
        so the batch can be shorter than ``batch_size``. Beta is advanced by
        one increment (capped at ``beta_end``) after the weights are computed.

        Returns:
            ``((states, actions, rewards, next_states, dones), idxs,
            is_weight)``; the five tensors are moved to the module-level
            ``device`` and ``is_weight`` is a NumPy array scaled so that its
            largest element is 1.
        """
        experiences = []
        idxs = []
        priorities = []
        total = self.tree.total()
        segment = total / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            if isinstance(data, Experience):
                priorities.append(p)
                experiences.append(data)
                idxs.append(idx)
            else:
                print("WHAT THE HECK !!!")

        # Convert before dividing: a list cannot be divided by a float.
        sampling_probabilities = np.asarray(priorities) / total
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()

        # ``experiences`` only ever holds Experience instances (see the
        # isinstance check above), so no None filtering is needed.
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)

        self.beta = np.min([self.beta_end, self.beta + self.beta_increment_per_sampling])
        return (states, actions, rewards, next_states, dones), idxs, is_weight

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        # Not required in normal ReplayBuffer
        self.tree.update(idx, self._get_priority(error))

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.tree)
Esempio n. 17
0
class ReplayBuffer:
    """Replay buffer that stages new experiences before moving them to a SumTree.

    ``add`` only appends to a pending list; the pending experiences are
    inserted into the tree (with priority 0.0) the next time ``sample`` is
    called, and are always part of the batch that call returns.
    """

    def __init__(self, params):
        """Read the configuration and create the empty storage.

        Args:
            params: Mapping with the keys ``'buffer_size'``, ``'batch_size'``
                and ``'mode'``. ``mode`` is stored but not read by any method
                of this class.
        """

        buffer_size = params['buffer_size']
        batch_size = params['batch_size']
        mode = params['mode']

        self.__buffer_size = buffer_size
        self.__batch_size = batch_size
        self.__mode = mode

        self.__experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        # Prioritised storage (tree) and the list of not-yet-inserted items.
        self.__memory = SumTree(buffer_size)
        self.__memory_buffer = []

    def get_batch_size(self):
        """Return the configured batch size."""
        return self.__batch_size

    def is_ready(self):
        """Return True once ``len(self)`` has reached the batch size."""
        return len(self) >= self.__batch_size

    def add(self, state, action, reward, next_state, done):
        """Queue one experience; it enters the tree on the next ``sample``."""
        self.__memory_buffer.append(
            self.__experience(state, action, reward, next_state, done))

    def sample(self):
        """Build a batch from tree draws plus every pending experience.

        Returns:
            ``(states, actions, rewards, next_states, dones, indices, probs)``.
            The first five are tensors moved to the module-level ``device``;
            ``indices`` and ``probs`` are lists with one entry per experience.
        """

        buf_len = len(self.__memory_buffer)
        # Number of draws needed from the tree to fill up the batch. A
        # negative value (more pending items than the batch size) is truthy,
        # but ``range`` of it is empty, so no draw is made in that case.
        mem_len = self.__batch_size - buf_len

        experiences = []
        indices = []
        probs = []

        # if self.__mode['PER']:
        if mem_len:
            #segment = self.__memory.total() / mem_len
            for i in range(mem_len):
                #s = random.uniform(segment * i, segment * (i + 1))
                # Unstratified draw over the whole priority range.
                s = random.uniform(0, self.__memory.total())
                idx, p, e = self.__memory.get(s)
                experiences.append(e)
                indices.append(idx)
                probs.append(p / self.__memory.total())

        for e in self.__memory_buffer:
            # Add experience to the buffer and record its index
            experiences.append(e)
            #if self.__mode['PER']:
            # assumes SumTree.add returns the index of the new entry -- TODO confirm
            idx = self.__memory.add(0.0, e)  # Default value for p is 0
            indices.append(idx)
            # len(self) is re-evaluated after each insertion above.
            probs.append(1 / len(self))

        self.__memory_buffer.clear()

        # None entries are skipped when stacking, but their index and
        # probability stay in ``indices`` / ``probs``.
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences
                       if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences
                       if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences
                       if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences
                       if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences
                       if e is not None]).astype(np.uint8)).float().to(device)

        return states, actions, rewards, next_states, dones, indices, probs

    def update(self, indices, p_values):
        """Write each value of ``p_values`` to the tree entry at the paired index."""
        for idx, p in zip(indices, p_values):
            self.__memory.update(idx, p)

    def __len__(self):
        """Return the larger of the tree's length and the pending-list length."""
        return max(len(self.__memory), len(self.__memory_buffer))
Esempio n. 18
0
class PrioritizeReplayBuffer(ReplayBuffer):
    """Replay buffer with prioritized sampling on top of ``ReplayBuffer``."""

    def __init__(
        self,
        buffer_size,
        batch_size,
        seed,
        beta_start=0.4,
        delta_beta=1e-5,
        alpha=0.6,
        eps=1e-8,
    ):
        """Set up the base buffer, the sum tree and the PER parameters.

        Args:
            buffer_size (int): Requested size of the replay buffer. The base
                buffer is created with ``2 ** depth`` slots, where ``depth``
                is ``int(log2(buffer_size)) + 1``.
            batch_size (int): Number of transitions per sampled batch.
            seed (float): Seed forwarded to the base class.
            beta_start (float): Starting importance-sampling exponent.
            delta_beta (float): Amount added to beta after each call to
                ``importance_sampling``.
            alpha (float): Exponent applied to the shifted TD error.
            eps (float): Small positive constant added to ``|td_error|``.
        """
        tree_depth = 1 + int(math.log2(buffer_size))
        super().__init__(2**tree_depth, batch_size, seed)

        # Sum tree holding the priorities, built from the depth.
        self.priorities = SumTree(tree_depth)

        # Largest priority seen so far; new transitions start with it.
        self.max_p = 1.0

        self.beta = beta_start
        self.delta_beta = delta_beta
        self.alpha = alpha
        self.eps = eps

    def add(self, state, action, reward, next_state, done):
        """Store a transition, registering it with the current max priority."""
        # The tree entry is written before the base class stores the data.
        self.priorities.add(self.max_p, self.index)
        super().add(state, action, reward, next_state, done)

    def sample(self):
        """Return ``batch_size`` transitions chosen by the sum tree.

        The chosen indices are remembered in ``last_indices`` for the later
        priority update and importance-sampling computation.
        """
        chosen = self.priorities.sample(self.batch_size)
        self.last_indices = chosen
        return [self.data[position] for position in chosen]

    def update_priorities(self, td_error):
        """Re-prioritise the last sampled transitions from their TD errors."""
        refreshed = (abs(td_error) + self.eps)**self.alpha
        self.priorities.update(self.last_indices, refreshed)

        # Keep the running maximum for transitions added from now on.
        self.max_p = max(self.max_p, max(refreshed))

    def importance_sampling(self):
        """Return the importance weights of the last sample, max-normalised.

        Beta is increased by ``delta_beta`` (capped at 1) afterwards.
        """
        last_priorities = self.priorities.get(self.last_indices)
        probs = last_priorities / self.priorities.total_sum

        weights = (len(self) * probs)**(-self.beta)
        weights /= max(weights)

        self.beta = min(self.beta + self.delta_beta, 1)

        return weights
Esempio n. 19
0
class PrioritizedExperienceReplayBuffer:
    """Fixed-size prioritized replay buffer backed by a ``SumTree``.

    Experiences are stored as ``Experience`` namedtuples in the tree together
    with a priority. ``sample`` draws a stratified batch proportionally to
    those priorities and returns importance-sampling weights alongside it.
    """

    alpha = 0.6  # exponent applied to |td_error| + epsilon in compute_priority
    beta = 0.4  # importance-sampling exponent; raised towards 1 on every sample()
    beta_increment_per_sample = 0.001  # amount beta grows per sample() call
    epsilon = 1e-6  # keeps a zero TD error from producing a zero priority

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = SumTree(buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so keep the seed value itself.
        # Note: sample() draws from NumPy's global RNG, which is not seeded here.
        random.seed(seed)
        self.seed = seed

    def compute_priority(self, td_error):
        """Map a TD error to a priority: ``(|td_error| + epsilon) ** alpha``.

        The magnitude is used because a negative base raised to a fractional
        power is not a real number (complex in pure Python, NaN in NumPy).
        """
        return (abs(td_error) + self.epsilon) ** self.alpha

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory.

        The experience gets the largest value found in the last ``capacity``
        entries of the tree array (presumably the leaf priorities -- depends
        on the SumTree layout), or 1.0 when that maximum is zero.
        """
        experience = self.experience(state, action, reward, next_state, done)
        max_priority = np.max(self.memory.tree[-self.memory.capacity:])
        if max_priority == 0:
            max_priority = 1.

        self.memory.add(max_priority, experience)

    def update(self, index, td_error):
        """Recompute the priority stored at tree ``index`` from ``td_error``."""
        priority = self.compute_priority(td_error)
        self.memory.update(index, priority)

    def sample(self):
        """Draw a stratified, priority-proportional batch.

        The total priority mass is cut into ``batch_size`` equal segments and
        one point is drawn uniformly from each. Every call also raises
        ``beta`` by ``beta_increment_per_sample`` (capped at 1).

        :return: importance weights, indices of sampled experiences, and sampled batch of experiences
        """
        self.beta = np.minimum(1., self.beta + self.beta_increment_per_sample)
        total = self.memory.total()
        segment = total / self.batch_size
        indexes = []
        priorities = []
        experiences = []

        for i in range(self.batch_size):
            # Draw one point from the i-th segment of the priority mass.
            a = segment * i
            b = segment * (i + 1)
            s = np.random.uniform(a, b)

            index, priority, experience = self.memory.get(s)
            indexes.append(index)
            priorities.append(priority)
            experiences.append(experience)

        sampling_probs = np.divide(priorities, total)
        # Importance-sampling weights, rescaled so the largest weight is 1.
        i_s_weights = (self.batch_size * sampling_probs) ** -self.beta
        i_s_weights /= np.max(i_s_weights)

        # NOTE(review): None entries are dropped from the tensors but not from
        # `indexes` / `i_s_weights`, so their lengths can diverge -- confirm
        # the SumTree never hands back None here.
        kept = [e for e in experiences if e is not None]

        def column(field):
            # Stack one namedtuple field of every kept experience row-wise.
            return np.vstack([getattr(e, field) for e in kept])

        # `device` is a module-level name defined outside this class.
        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        dones = torch.from_numpy(column("done").astype(np.uint8)).float().to(device)

        return i_s_weights, indexes, (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return self.memory.count
class PrioritizedReplayBuffer:
    """Fixed-size prioritized replay buffer backed by a ``SumTree``.

    Experiences are stored with a priority derived from their TD error and
    drawn proportionally to it; ``sample`` also returns the matching
    importance-sampling weights.
    """

    def __init__(self, buffer_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            buffer_size (int): capacity handed to the SumTree
            seed (int): random seed
        """
        self.memory = SumTree(buffer_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so keep the seed value itself.
        random.seed(seed)
        self.seed = seed

        # epsilon: small amount to avoid zero priority
        # alpha: [0~1] determines how much prioritization is used. with 0, we would get the uniform case
        # beta: Controls importance-sampling compensation. fully compensates for the non-uniform probabilities
        #   when beta=1. The unbiased nature of the updates is most important near convergence at the end of
        #   training, so we define a schedule on the exponent beta that starts from initial value and reaches 1
        #   only at the end of learning.

        self.epsilon = 0.01
        self.alpha = 0.6

        beta_start = 0.4
        self.beta_end = 1.0
        self.beta = beta_start
        beta_increments = 200  # number of increase_beta() calls to reach beta_end
        self.beta_increment = (self.beta_end - beta_start) / beta_increments

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory.

        The experience gets the tree's current ``max_p()``, or 1.0 when that
        is zero.
        """
        experience = self.experience(state, action, reward, next_state, done)
        p = self.memory.max_p()
        if p == 0:
            p = 1.0
        self.memory.add(p=p, data=experience)

    def sample(self, n):
        """Draw ``n`` experiences proportionally to their priorities.

        The total priority mass is cut into ``n`` equal segments and one
        point is drawn uniformly from each.

        Returns:
            tuple: ``(experiences, indices, weights)`` -- the list of drawn
            experiences, an int32 array of their tree indices, and the
            importance-sampling weights rescaled so the largest is 1.
        """
        experiences = []
        indices = []
        priorities = []
        total = self.memory.total_p()
        segment = total / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, experience) = self.memory.get(s)
            experiences.append(experience)
            indices.append(idx)
            priorities.append(p)
        priorities = np.array(priorities, dtype=np.float64)
        indices = np.array(indices, dtype=np.int32)

        probs = priorities / total
        # Importance-sampling (IS) weights. The constant factor (capacity)
        # cancels in the normalisation below; only the ratios between the
        # probabilities matter.
        w_is = (self.memory.capacity * probs) ** (-self.beta)
        w_is_normalized = w_is / w_is.max()

        return experiences, indices, w_is_normalized

    def update_errors(self, indices, errors):
        """Set the priority at each tree index from the matching TD error."""
        for idx, error in zip(indices, errors):
            self.memory.update(idx, self._to_priority(error))

    def _to_priority(self, error):
        """Map a TD error to a priority: ``(|error| + epsilon) ** alpha``.

        The magnitude is used because a negative base raised to a fractional
        power is not a real number (complex in pure Python, NaN in NumPy).
        """
        return (abs(error) + self.epsilon) ** self.alpha

    def increase_beta(self):
        """Advance beta by one increment, never past ``beta_end``."""
        if self.beta < self.beta_end:
            self.beta = min(self.beta_end, self.beta + self.beta_increment)

    def __len__(self):
        """Return ``len()`` of the underlying tree."""
        return len(self.memory)
Esempio n. 21
0
class MemoryDB:  # the SumTree holds {"_id": ...} handles; payloads live in MongoDB
    """Prioritized replay memory persisted in a MongoDB collection.

    Each experience is stored as a document carrying a few plain fields, its
    priority and a pickled copy of the whole experience. An in-memory
    ``SumTree`` holds the priorities together with the document ``_id`` and
    is rebuilt from the collection when the object is created.
    """

    e = 0.01  # added to |error| so that no priority is exactly zero
    a = 0.6  # exponent applied in _get_priority
    beta = 0.4  # importance-sampling exponent; raised towards 1 on every sample()
    beta_increment_per_sampling = 0.001
    capacity = 100000  # size handed to the SumTree
    max_priority = 1  # passed as the "error" for experiences added via add_batch

    def __init__(self, host_name, db_name, collection_name):
        """Connect to MongoDB (port 27017) and load stored priorities.

        Every document already in the collection is added to the sum tree
        with its persisted ``priority``.
        """
        self.host_name = host_name
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = MongoClient(host_name, 27017)
        self.db = self.client[db_name]
        self.replay_memory_collection = self.db[collection_name]
        self.sum_tree = SumTree(self.capacity)
        memory_priorities = self.replay_memory_collection.find({},
                                                               {"priority": 1})
        for memory_priority in memory_priorities:
            self.sum_tree.add(memory_priority["priority"],
                              {"_id": memory_priority["_id"]})

    def retrieve_by_id(self, id):
        """Load and unpickle the experience stored under ``id``.

        Returns:
            dict: the unpickled experience with an ``"_id"`` key added.
        """
        db_experiences = self.replay_memory_collection.find({"_id": id})
        # SECURITY NOTE: this unpickles bytes read from the database. Pickle
        # can execute arbitrary code, so the collection must only ever be
        # written by trusted code.
        stored = _pickle.loads(db_experiences[0]['binary'], encoding='latin1')
        return {**stored, "_id": id}

    def _get_priority(self, error):
        """Map an error to a priority: ``(|error| + e) ** a``.

        The magnitude is used because a negative base raised to a fractional
        power is not a real number (complex in pure Python, NaN in NumPy).
        """
        return (abs(error) + self.e)**self.a

    def add(self, error, experience):
        """Persist ``experience`` and register it in the sum tree.

        The same priority is written to the document and to the tree, so a
        tree rebuilt from the collection in ``__init__`` matches the live
        one (``update`` persists the priority the same way).
        """
        p = self._get_priority(error)
        experience_to_save = {
            "terminal": experience["terminal"],
            "action_index": experience["action_index"],
            "actual_reward": experience["actual_reward"],
            "priority": p,
            "binary": _pickle.dumps(experience),
        }
        # insert_one() belongs to the same PyMongo API generation as the
        # update_one() call used in update(); the legacy insert() is gone
        # from current PyMongo releases.
        inserted_id = self.replay_memory_collection.insert_one(
            experience_to_save).inserted_id

        self.sum_tree.add(p, {"_id": inserted_id})

    def add_batch(self, experiences):
        """Add every experience, using ``max_priority`` as its error."""
        for experience in experiences:
            self.add(self.max_priority, experience)

    def update(self, index, error, experience):
        """Store a new priority for ``experience`` in the DB and the tree."""
        p = self._get_priority(error)
        self.replay_memory_collection.update_one({"_id": experience["_id"]},
                                                 {"$set": {
                                                     "priority": p
                                                 }})
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        """Apply ``update`` to each (index, error, experience) triple."""
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        """Return the number of documents in the collection."""
        # NOTE(review): Collection.count() is a legacy PyMongo call -- confirm
        # it exists in the installed version.
        return self.replay_memory_collection.count()

    def sample(self, n):
        """Draw ``n`` experiences proportionally to their priorities.

        The total priority mass is cut into ``n`` equal segments and one
        point is drawn uniformly from each. Every call also raises ``beta``
        by ``beta_increment_per_sampling`` (capped at 1).

        Returns:
            tuple: ``(batch, idxs, is_weight)`` -- the loaded experiences,
            their tree indices, and the importance-sampling weights rescaled
            so the largest is 1.
        """
        batch = []
        idxs = []
        priorities = []
        total = self.sum_tree.total()
        segment = total / n

        self.beta = min(1., self.beta + self.beta_increment_per_sampling)

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            experience = self.retrieve_by_id(data["_id"])
            batch.append(experience)
            # NOTE(review): this reads "priority" from the unpickled
            # experience, not the tree priority `p` -- confirm that stored
            # experiences always carry that key.
            print(
                "action index: ",
                experience["action_index"],
                "reward: ",
                experience["actual_reward"],
                "priority: ",
                experience["priority"],
            )
            idxs.append(idx)

        # Convert explicitly: a plain list cannot be divided by a Python float.
        sampling_probabilities = np.asarray(priorities) / total
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight