Beispiel #1
0
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    """Prioritized replay memory that keeps its samples in a ``SumTree``.

    Priorities are computed as ``(|error| + epsilon) ** alpha``.  ``sample``
    draws one entry from each equal-width slice of the tree's total priority
    and returns importance-sampling weights normalised by their maximum.
    """

    def __init__(self,
                 capacity,
                 alpha=0.6,
                 beta=0.4,
                 beta_anneal_step=0.001,
                 epsilon=0.00000001):
        """Create the memory.

        Args:
            capacity: size handed to ``SumTree``.
            alpha: exponent applied to the shifted absolute error.
            beta: initial importance-sampling exponent.
            beta_anneal_step: amount added to ``beta`` by each ``step()``.
            epsilon: small constant added to ``|error|`` so that no priority
                is exactly zero.
        """
        # The earlier revision rounded an undefined name (`size`) up to a
        # power of two here, which raised NameError on every construction,
        # and then never used the result.  The dead computation is removed;
        # the tree is still built from `capacity`, exactly as before.
        self.tree = SumTree(capacity)
        self.capacity = capacity
        self.a = alpha
        self.beta = beta
        self.beta_increment_per_sampling = beta_anneal_step
        self.e = epsilon

    def _get_priority(self, error):
        """Return the priority for ``error`` (direct proportional scheme)."""
        return (np.abs(error) + self.e)**self.a

    def add(self, error, sample):
        """Insert ``sample`` with the priority derived from ``error``."""
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        """Draw ``n`` entries, one per equal slice of the total priority.

        Returns:
            ``(batch, idxs, is_weight)``: the sampled data, the tree indices
            needed for a later ``update``, and the importance-sampling
            weights as a NumPy array scaled so the largest weight is 1.
        """
        batch = []
        idxs = []
        priorities = []
        total = self.tree.total()
        segment = total / n

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            data = 0

            # Re-draw while the tree returns the integer 0 as data --
            # presumably an unfilled slot; confirm against SumTree.
            while data == 0:
                s = random.uniform(a, b)
                (idx, p, data) = self.tree.get(s)

            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # Convert explicitly: dividing a plain list only works when total()
        # happens to return a NumPy scalar.
        sampling_probabilities = np.asarray(priorities, dtype=float) / total
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def step(self):
        """Anneal ``beta`` by one increment, capped just below 1."""
        self.beta = np.min(
            [1. - self.e, self.beta + self.beta_increment_per_sampling])

    def update(self, idx, error):
        """Recompute the priority stored at tree index ``idx``."""
        p = self._get_priority(error)
        self.tree.update(idx, p)
Beispiel #2
0
class MemoryDB:  # stored as ( s, a, r, s_ ) in SumTree
    """Prioritized replay memory with a configurable priority ceiling.

    Priorities are ``min(max_priority, (error + e) ** a)`` and are stored in
    a ``SumTree``.  Entries added without an error get ``max_priority``.
    """

    def __init__(self, e, a, beta, beta_increment_per_sampling, capacity,
                 max_priority):
        """Store the hyper-parameters and build the tree.

        Args:
            e: constant added to the error before exponentiation.
            a: exponent applied to the shifted error.
            beta: initial importance-sampling exponent.
            beta_increment_per_sampling: added to ``beta`` on every
                ``sample`` call (capped at 1).
            capacity: size handed to ``SumTree``.
            max_priority: upper bound for any computed priority.
        """
        self.capacity = capacity
        self.e = e
        self.a = a
        self.beta = beta
        self.beta_increment_per_sampling = beta_increment_per_sampling
        self.max_priority = max_priority
        self.sum_tree = SumTree(self.capacity)

    def _get_priority(self, error):
        """Return ``(error + e) ** a`` clipped to ``max_priority``."""
        return min((self.max_priority, (error + self.e)**self.a))

    def add(self, experience, error=None):
        """Insert ``experience``; without ``error`` it gets ``max_priority``."""
        # Identity test: `!= None` is evaluated element-wise when `error`
        # is a NumPy array and is not a reliable "was it passed" check.
        p = self.max_priority if error is None else self._get_priority(error)
        self.sum_tree.add(p, experience)

    def add_batch(self, experiences):
        """Insert every experience, passing ``max_priority`` as its error."""
        # NOTE(review): max_priority is fed in as the *error*, so the stored
        # priority is min(max_priority, (max_priority + e) ** a), which is
        # not always max_priority itself -- confirm this is intended.
        for experience in experiences:
            self.add(experience, self.max_priority)

    def update(self, index, error, experience):
        """Recompute the priority at ``index``; ``experience`` is unused."""
        p = self._get_priority(error)
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        """Apply ``update`` to each (index, error, experience) triple."""
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        """Return whatever the tree's ``getCount()`` reports."""
        return self.sum_tree.getCount()

    def sample(self, n):
        """Draw ``n`` entries, one per equal slice of the total priority.

        ``beta`` is annealed (capped at 1) before drawing.

        Returns:
            ``(batch, idxs, is_weight)`` where ``is_weight`` is a NumPy array
            of importance-sampling weights scaled so the maximum is 1.
        """
        batch = []
        idxs = []
        priorities = []
        total = self.sum_tree.total()
        segment = total / n

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # Convert explicitly: dividing a plain list only works when total()
        # happens to return a NumPy scalar.
        sampling_probabilities = np.asarray(priorities, dtype=float) / total
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight
Beispiel #3
0
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    """Prioritized replay memory on top of a ``SumTree``.

    The hyper-parameters are class attributes; ``beta`` becomes an instance
    attribute the first time ``sample`` anneals it.
    """

    e = 1e-10
    a = 0.5
    beta = 0.4
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity):
        """Build the backing tree with the given capacity."""
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        """Return ``(error + e) ** a``."""
        shifted = error + self.e
        return shifted**self.a

    def append(self, data):
        """Insert an ``(error, sample)`` pair."""
        error, sample = data
        self.tree.add(self._get_priority(error), sample)

    def sample(self, n):
        """Draw ``n`` entries, one from each equal slice of total priority.

        Returns ``(batch, idxs, is_weight)``; the weights are divided by
        their maximum.
        """
        width = self.tree.total() / n
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        batch, idxs, priorities = [], [], []
        for k in range(n):
            point = random.uniform(width * k, width * (k + 1))
            idx, p, data = self.tree.get(point)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # The tiny constant keeps the division defined for a zero total.
        sampling_probabilities = priorities / (self.tree.total() + 1e-10)
        scaled = self.tree.n_entries * sampling_probabilities
        is_weight = np.power(scaled, -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        self.tree.update(idx, self._get_priority(error))

    def __len__(self):
        """Return the tree's ``n_entries``."""
        return self.tree.n_entries
Beispiel #4
0
class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer that samples experiences in proportion to priority.

    New experiences are inserted with the largest priority seen so far;
    ``update`` refreshes a priority from a TD error.  The base-class
    initialiser is not invoked here.
    """

    def __init__(self, buffer_size, alpha):
        """Create a ``SumTree`` of ``buffer_size`` and store ``alpha``."""
        self.capacity = buffer_size
        self.tree = SumTree(buffer_size)
        self.alpha = alpha
        self.max_priority = 1
        #self.beta_initial = ??
        #self.beta_steps = ??

    def add(self, experience):
        """Insert ``experience`` with the current maximum priority."""
        self.tree.add(self.max_priority, experience)

    def update(self, index, experience, td_error):
        """Set the priority at ``index`` from ``td_error``.

        ``experience`` is accepted but not used.
        """
        new_priority = (abs(td_error) + 0.0001)**self.alpha
        self.tree.update(index, new_priority)
        # Remember the largest priority so new experiences start there.
        if new_priority > self.max_priority:
            self.max_priority = new_priority

    def sample(self, batch_size):
        """Draw one experience per equal slice of the total priority.

        Returns ``(indexes, batchs)``.
        """
        section = self.tree.total() / batch_size
        indexes, batchs = [], []
        for slot in range(batch_size):
            r = section * slot + np.random.random() * section
            idx, _priority, experience = self.tree.get(r)
            indexes.append(idx)  # used later to update the priority
            batchs.append(experience)
        return (indexes, batchs)
Beispiel #5
0
class Memory:
    """Prioritized replay memory that stores samples in a ``SumTree``."""

    e = 0.01
    a = 0.6

    def __init__(self, capacity):
        """Build the backing tree with the given capacity."""
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _getPriority(self, error):
        """Return ``(error + e) ** a``."""
        shifted = error + self.e
        return shifted**self.a

    def add(self, error, sample):
        """Insert ``sample`` with the priority derived from ``error``."""
        self.tree.add(self._getPriority(error), sample)

    def sample(self, n):
        """Return ``n`` ``(idx, data)`` pairs, one per equal priority slice."""
        segment = self.tree.total() / n

        picked = []
        for k in range(n):
            point = random.uniform(segment * k, segment * (k + 1))
            idx, _priority, data = self.tree.get(point)
            picked.append((idx, data))
        return picked

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        self.tree.update(idx, self._getPriority(error))
Beispiel #6
0
class PriorityMemory:
    """Priority-based experience memory backed by a ``SumTree``."""

    def __init__(self, capacity):
        """
        Create a priority-based memory built on a ``SumTree`` of the given
        capacity. Experiences are sampled with a frequency based on their
        priority.
        """
        self.e = 0.01  # Small constant so that every priority is > 0
        self.a = 0.6   # Controls how strongly the error shapes the priority
        # Tree whose node values are the priorities.
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        """
        Turn an error into a priority using the constants "e" and "a".
        """
        shifted = error + self.e
        return shifted ** self.a

    def add(self, experience, error):
        """
        Store an experience with the priority derived from its error.
        """
        self.tree.add(self._getPriority(error), experience)

    def sample(self, n):
        """
        Draw n experiences, one from each equal slice of the total priority.

        Returns:
            - mini_batch: List of the sampled experiences.
            - indices: Tree index of each sampled experience, so that its
              priority can be updated afterwards.
        """
        slice_width = self.tree.total() / n

        mini_batch, indices = [], []
        for k in range(n):
            point = random.uniform(slice_width * k, slice_width * (k + 1))
            idx, _, experience = self.tree.get(point)
            mini_batch.append(experience)
            indices.append(idx)

        return mini_batch, indices

    def update(self, idx, error):
        """
        Replace the priority of the memory stored at tree index idx.
        """
        self.tree.update(idx, self._getPriority(error))
Beispiel #7
0
class PrioritizedMemory:
    """Prioritized replay memory on top of a ``SumTree``.

    The hyper-parameters are class attributes; ``beta`` becomes an instance
    attribute the first time ``sample`` anneals it.
    """

    e = 0.01
    a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity):
        """Build the backing tree with the given capacity."""
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        """Return ``(|error| + e) ** a``."""
        magnitude = np.abs(error) + self.e
        return magnitude**self.a

    def push(self, error, sample):
        """Insert ``sample`` with the priority derived from ``error``."""
        self.tree.add(self._get_priority(error), sample)

    def sample(self, n):
        """Draw ``n`` entries, one from each equal slice of total priority.

        ``beta`` is annealed (capped at 1) first.  Returns
        ``(batch, idxs, is_weight)`` with weights divided by their maximum.
        """
        segment = self.tree.total() / n
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        idxs, priorities, batch = [], [], []
        for k in range(n):
            point = random.uniform(segment * k, segment * (k + 1))
            idx, p, data = self.tree.get(point)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        sampling_probabilities = priorities / self.tree.total()
        scaled = self.tree.n_entries * sampling_probabilities
        is_weight = np.power(scaled, -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        self.tree.update(idx, self._get_priority(error))
Beispiel #8
0
class PrioritizeReplayBuffer(ReplayBuffer):
    # Based on https://github.com/y-kamiya/machine-learning-samples/blob/7b6792ce37cc69051e9053afeddc6d485ad34e79/python3/reinforcement/dqn/agent.py
    """Prioritized replay buffer that stores transitions in a ``SumTree``.

    New transitions enter with the tree's current maximum priority (or 1 if
    that is not positive).  ``sample`` returns importance-sampling weights
    normalised by their maximum.
    """

    EPSILON = 0.0001
    ALPHA = 0.6
    BETA = 0.4
    size = 0

    def __init__(self, capacity):
        """Initialise the base buffer and build a tree of ``capacity``."""
        super().__init__(capacity=capacity)
        # push() and sample() both read self.capacity; set it here so this
        # class does not depend on the base initialiser having stored it.
        self.capacity = capacity
        self.td_error_epsilon = 0.0001
        self.tree = SumTree(capacity)

    def __len__(self):
        """Return the number of stored transitions (at most ``capacity``)."""
        return self.size

    def _getPriority(self, td_error):
        """Return ``(td_error + EPSILON) ** ALPHA``."""
        return (td_error + self.EPSILON)**self.ALPHA

    def push(self, state, action, done, next_state, reward, p_index):
        """Store a transition with the current maximum priority."""
        # Cap at capacity: pushes beyond it are passed to the same
        # fixed-size tree, so len() must not keep growing.
        self.size = min(self.capacity, self.size + 1)
        transition = self.Transition(state, action, done, next_state, reward,
                                     p_index)
        priority = self.tree.max()
        if priority <= 0:
            priority = 1
        self.tree.add(priority, transition)

    def sample(self, batch_size, episode):
        """Draw ``batch_size`` transitions proportionally to priority.

        Args:
            batch_size: number of transitions to draw.
            episode: annealing progress; ``beta`` is
                ``BETA + (1 - BETA) * episode`` capped at 1.
                NOTE(review): presumably a fraction in [0, 1] -- confirm
                against the caller.

        Returns:
            ``(indexes, transitions, weights)`` with ``weights`` a float32
            array divided by its maximum.
        """
        samples = []
        indexes = []
        weights = np.empty(batch_size, dtype='float32')
        total = self.tree.total()
        # The importance-sampling exponent is defined on [0, 1]; clamp it
        # the same way the sibling PERMemory implementation does.
        beta = min(1.0, self.BETA + (1 - self.BETA) * episode)

        for i, rand in enumerate(np.random.uniform(0, total, batch_size)):
            (idx, priority, data) = self.tree.get(rand)
            samples.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total)**(-beta)
        return (indexes, samples, weights / weights.max())

    def update(self, idx, td_error):
        """Replace the priority stored at tree index ``idx``."""
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)
Beispiel #9
0
class PERMemory:
    """Prioritized experience replay memory backed by a ``SumTree``.

    New transitions enter with the tree's current maximum priority (or 1 if
    that is not positive).  ``sample`` returns importance-sampling weights
    normalised by their maximum.
    """

    EPSILON = 0.0001
    ALPHA = 0.5
    BETA = 0.4
    size = 0

    def __init__(self, config, capacity):
        """Store ``config`` (read for ``num_episodes``) and build the tree."""
        self.config = config
        self.capacity = capacity
        self.tree = SumTree(capacity)

    def _getPriority(self, td_error):
        """Return ``(td_error + EPSILON) ** ALPHA``."""
        return (td_error + self.EPSILON) ** self.ALPHA

    def push(self, transition):
        """Store ``transition`` with the current maximum priority."""
        # Cap at capacity: pushes beyond it are passed to the same
        # fixed-size tree, so len() must not keep growing.
        self.size = min(self.capacity, self.size + 1)

        priority = self.tree.max()
        if priority <= 0:
            priority = 1

        self.tree.add(priority, transition)

    def sample(self, size, episode):
        """Draw ``size`` transitions proportionally to priority.

        ``beta`` is annealed linearly from ``BETA`` towards 1 over
        ``config.num_episodes`` episodes and capped at 1.

        Returns:
            ``(indexes, transitions, weights)`` with ``weights`` a float32
            array divided by its maximum.
        """
        samples = []
        indexes = []
        weights = np.empty(size, dtype='float32')
        total = self.tree.total()
        beta = self.BETA + (1 - self.BETA) * episode / self.config.num_episodes
        beta = min(1.0, beta)

        for i, rand in enumerate(np.random.uniform(0, total, size)):
            (idx, priority, data) = self.tree.get(rand)
            samples.append(data)
            indexes.append(idx)
            weights[i] = (self.capacity * priority / total) ** (-beta)

        return (indexes, samples, weights / weights.max())

    def update(self, idx, td_error):
        """Replace the priority stored at tree index ``idx``."""
        priority = self._getPriority(td_error)
        self.tree.update(idx, priority)

    def __len__(self):
        """Return the number of stored transitions (at most ``capacity``)."""
        return self.size
Beispiel #10
0
class Memory(object):
    """Prioritized replay memory stored in an ``ST`` tree."""

    e = 0.05

    def __init__(self, capacity, pr_scale):
        """Build the tree; ``pr_scale`` is the priority exponent."""
        self.capacity = capacity
        self.memory = ST(self.capacity)
        self.pr_scale = pr_scale
        self.max_pr = 0

    def get_priority(self, error):
        """Return ``(error + e) ** pr_scale``."""
        shifted = error + self.e
        return shifted**self.pr_scale

    def remember(self, sample, error):
        """Insert ``sample`` with the larger of ``max_pr`` and its priority.

        ``max_pr`` is never reassigned in this class.
        """
        priority = self.get_priority(error)
        self.memory.add(max(self.max_pr, priority), sample)

    def sample(self, n):
        """Draw ``n`` entries, one from each equal slice of total priority.

        Returns a list ``[pairs, indices, priorities]`` where ``pairs`` holds
        ``(idx, data)`` tuples.
        """
        # Width of one slice of the total priority.
        slice_width = self.memory.total() / n

        pairs, indices, priorities = [], [], []
        for k in range(n):
            point = random.uniform(slice_width * k, slice_width * (k + 1))
            idx, pr, data = self.memory.get(point)
            pairs.append((idx, data))
            indices.append(idx)
            priorities.append(pr)

        return [pairs, indices, priorities]

    def update(self, batch_indices, errors):
        """Recompute the priority of every index from the matching error."""
        for pos, tree_idx in enumerate(batch_indices):
            self.memory.update(tree_idx, self.get_priority(errors[pos]))
Beispiel #11
0
class Replay_Memory:
    """Prioritized replay memory configured through module-level constants.

    Reads ``MEMORY_LEN``, ``MEMORY_BIAS``, ``MEMORY_POW`` and ``BATCH_SIZE``
    from the module namespace.
    """

    def __init__(self):
        """Build a ``SumTree`` of ``MEMORY_LEN`` entries."""
        self.tree = SumTree(MEMORY_LEN)

    def add(self, error, sample):
        """
         Stores a sample with a priority derived from its error
        Args:
         error: error used to compute the priority
         sample: the data to store
        """
        self.tree.add((error + MEMORY_BIAS)**MEMORY_POW, sample)

    def sample(self):
        """
         Draws a batch from the replay memory
        Returns:
         batch: list of (idx, data) pairs, one drawn from each of the
                BATCH_SIZE equal slices of the total priority
        """
        # One representative per slice of the priority mass, e.g. with
        # BATCH_SIZE=2 one draw from the lower half and one from the upper.
        slice_width = self.tree.total() / BATCH_SIZE
        batch = []
        for k in range(BATCH_SIZE):
            point = random.uniform(slice_width * k, slice_width * (k + 1))
            idx, _priority, data = self.tree.get(point)
            batch.append((idx, data))
        return batch

    def update(self, idx, error):
        """
         Updates one entry in the replay memory
        Args:
         idx: the position of the outdated transition in the memory
         error: the newly calculated error
        """
        self.tree.update(idx, (error + MEMORY_BIAS)**MEMORY_POW)
Beispiel #12
0
class ReplayMemory(object):
    """Prioritized replay memory that stores zlib-compressed pickles.

    Elements are pickled and compressed on insertion and restored on
    retrieval.  The tree indices of the most recent batch are kept in
    ``last_idxs`` so that ``update`` can refresh their priorities.
    """

    def __init__(self, max_size, alpha, eps):
        """Store the hyper-parameters and build a tree of ``max_size``."""
        self.max_size = max_size
        self.alpha = alpha
        self.eps = eps

        self.tree = SumTree(max_size)
        self.last_idxs = None
        self.size = 0

    def get_batch(self, batch_size):
        """Return up to ``batch_size`` decoded elements drawn by priority.

        At most ``size`` elements are drawn; their tree indices are recorded
        in ``last_idxs``.
        """
        self.last_idxs = []

        decoded = []
        draws = min(batch_size, self.size)
        for _ in range(draws):
            point = random.random() * self.tree.total()
            idx, _priority, blob = self.tree.get(point)

            decoded.append(pickle.loads(zlib.decompress(blob)))
            self.last_idxs.append(idx)

        return decoded

    def update(self, losses):
        """Refresh the priorities of the last batch from ``losses``."""
        for pos, tree_idx in enumerate(self.last_idxs):
            priority = math.pow(losses[pos] + self.eps, self.alpha)
            self.tree.update(tree_idx, priority)

    def add_element(self, new_el, loss):
        """Insert ``new_el`` with priority ``(loss + eps) ** alpha``."""
        self.size = min(self.max_size, self.size + 1)

        blob = zlib.compress(pickle.dumps(new_el))
        self.tree.add(math.pow(loss + self.eps, self.alpha), blob)

    def __len__(self):
        """Return the number of stored elements, capped at ``max_size``."""
        return self.size
Beispiel #13
0
class ReplayBuffer:
    """Replay buffer that mixes stored experiences with freshly added ones.

    New experiences are held in a pending list until the next ``sample``
    call, which returns them together with experiences drawn from the
    ``SumTree`` and only then moves them into the tree.
    """

    def __init__(self, params):
        """Read ``buffer_size``, ``batch_size`` and ``mode`` from ``params``."""
        self.__buffer_size = params['buffer_size']
        self.__batch_size = params['batch_size']
        self.__mode = params['mode']

        self.__experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.__memory = SumTree(self.__buffer_size)
        self.__memory_buffer = []

    def get_batch_size(self):
        """Return the configured batch size."""
        return self.__batch_size

    def is_ready(self):
        """Return True once ``len(self)`` reaches the batch size."""
        return len(self) >= self.__batch_size

    def add(self, state, action, reward, next_state, done):
        """Queue a new experience in the pending list."""
        record = self.__experience(state, action, reward, next_state, done)
        self.__memory_buffer.append(record)

    def sample(self):
        """Return a batch of tensors plus tree indices and probabilities.

        The batch is filled with draws from the tree until, together with
        the pending experiences, it reaches the batch size.  Every pending
        experience is then appended and inserted into the tree with
        priority 0.0.

        Returns:
            ``(states, actions, rewards, next_states, dones, indices, probs)``
        """
        pending_count = len(self.__memory_buffer)
        from_tree = self.__batch_size - pending_count

        experiences, indices, probs = [], [], []

        # if self.__mode['PER']:
        # Draws are uniform over the total priority (no segmenting);
        # a non-positive count simply yields no draws.
        for _ in range(from_tree):
            point = random.uniform(0, self.__memory.total())
            idx, p, e = self.__memory.get(point)
            experiences.append(e)
            indices.append(idx)
            probs.append(p / self.__memory.total())

        for e in self.__memory_buffer:
            # Hand the experience to the tree and record the index it gets.
            experiences.append(e)
            #if self.__mode['PER']:
            idx = self.__memory.add(0.0, e)  # Default value for p is 0
            indices.append(idx)
            probs.append(1 / len(self))

        self.__memory_buffer.clear()

        def column(field):
            # Stack one field of every non-None experience row-wise.
            return np.vstack(
                [getattr(e, field) for e in experiences if e is not None])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(
            column("next_state")).float().to(device)
        dones = torch.from_numpy(
            column("done").astype(np.uint8)).float().to(device)

        return states, actions, rewards, next_states, dones, indices, probs

    def update(self, indices, p_values):
        """Write each priority in ``p_values`` to the matching tree index."""
        for tree_idx, priority in zip(indices, p_values):
            self.__memory.update(tree_idx, priority)

    def __len__(self):
        """Return the larger of the tree length and the pending count."""
        return max(len(self.__memory), len(self.__memory_buffer))
Beispiel #14
0
class PrioritisedReplayBuffer:
    """Prioritised replay buffer that returns torch tensors from ``sample``."""

    def __init__(self, action_size, buffer_size, batch_size, alpha, epsilon):
        """Store the settings and build a ``SumTree`` of ``buffer_size``."""
        self.action_size = action_size
        self.batch_size = batch_size
        self.alpha = alpha
        self.epsilon = epsilon
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.tree = SumTree(buffer_size)

    def add(self, error, state, action, reward, next_state, done):
        """Store one experience with the priority derived from ``error``."""
        record = self.experience(state, action, reward, next_state, done)
        self.tree.add(self._get_priority(error), record)

    def sample(self, beta):
        """Draw a batch, one experience per equal slice of total priority.

        Falsy entries returned by the tree are skipped, so the batch can be
        shorter than ``batch_size``.

        Returns:
            ``(states, actions, rewards, next_states, dones, weights, idxs)``
            where everything but ``idxs`` is a tensor on ``device``.
        """
        # Slicing the priority range spreads the draws across the tree.
        segment = self.tree.total() / self.batch_size

        experiences, priorities, idxs = [], [], []
        for k in range(self.batch_size):
            point = random.uniform(segment * k, segment * (k + 1))
            idx, p, e = self.tree.get(point)
            if not e:
                continue
            priorities.append(p)
            experiences.append(e)
            idxs.append(idx)

        probs = priorities / self.tree.total()  # sampling probability P(i)
        weights = np.power(self.tree.n_entries * probs, -beta)
        weights /= weights.max()  # scale so the largest weight is 1

        def column(field):
            # Stack one field of every sampled experience row-wise.
            return np.vstack([getattr(e, field) for e in experiences])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(
            column("next_state")).float().to(device)
        dones = torch.from_numpy(
            column("done").astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(weights).float().to(device)

        return (states, actions, rewards, next_states, dones, weights, idxs)

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        self.tree.update(idx, self._get_priority(error))

    def _get_priority(self, error):
        """Return ``(|error| + epsilon) ** alpha``."""
        magnitude = np.abs(error) + self.epsilon
        return magnitude**self.alpha

    def __len__(self):
        """Return the tree's ``n_entries``."""
        return self.tree.n_entries
class PrioritisedReplayBuffer():
    """A prioritised replay buffer.

    Priorities live in a sum tree while the experience tuples themselves are
    kept in a dict keyed by tree index. Sampling favours experiences with a
    higher stored priority.
    """
    def __init__(self,
                 buffer_size,
                 alpha,
                 beta_zero,
                 beta_increment_size=0.001,
                 epsilon=0.1,
                 max_priority=1.,
                 seed=None):
        """Set up the sum tree, the experience store and the hyperparameters.

        Args:
            buffer_size (int): capacity of the replay buffer.
            alpha (float): exponent applied to priorities.
            beta_zero (float): starting importance-sampling exponent.
            beta_increment_size (float): amount added to beta per update.
            epsilon (float): constant added to the TD error before scaling.
            max_priority (float): initial maximum priority (before alpha).
            seed (int): seed for the ``random`` module.
        """
        random.seed(seed)

        self.buffer_size = buffer_size
        self.alpha = alpha
        self.beta = beta_zero
        self.beta_increment_size = beta_increment_size
        self.epsilon = epsilon

        # Both running bounds start from the same scaled value.
        self.max_priority = max_priority**alpha
        self.min_priority = max_priority**alpha
        self.last_min_update = 0

        self.sum_tree = SumTree(buffer_size)
        self.memory = {}
        self.experience = namedtuple(
            "experience", ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store an experience at the tree's current input position."""
        slot = self.sum_tree.input_pointer
        self.memory[slot] = self.experience(state, action, reward, next_state,
                                            done)
        self.sum_tree.add(self.max_priority)

    def sample(self, batch_size):
        """Return a batch of experiences sampled according to priority.

        Returns:
            ``(states, actions, rewards, next_states, done_list, idx_list,
            weights)`` -- seven parallel lists.
        """
        segment = self.sum_tree.total() / batch_size
        # All random draws are made up front, one per priority slice.
        draws = [
            random.uniform(segment * k, segment * (k + 1))
            for k in range(batch_size)
        ]
        max_weight = self.min_priority**(-self.beta)

        states, actions, rewards, next_states, done_list = [], [], [], [], []
        idx_list, weights = [], []

        for point in draws:
            idx, priority = self.sum_tree.sample(point)
            idx_list.append(idx)
            weights.append(priority**(-self.beta) / max_weight)

            state, action, reward, next_state, done = self.memory[idx]
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            done_list.append(done)

        return states, actions, rewards, next_states, done_list, idx_list, weights

    def update(self, idx_list, td_error):
        """Update the priorities of the given experiences and anneal beta."""
        priority_list = (td_error + self.epsilon)**self.alpha

        self.max_priority = max(self.max_priority, priority_list.max())
        batch_min = priority_list.min()

        if batch_min <= self.min_priority:
            self.min_priority = batch_min
            self.last_min_update = 0
        else:
            self.last_min_update += 1

        # After buffer_size updates without a new minimum, recompute it from
        # the last buffer_size nodes of the tree array.
        if self.last_min_update >= self.buffer_size:
            tail = self.sum_tree.tree_array[-self.buffer_size:]
            self.min_priority = np.array([node.val for node in tail]).min()
            self.last_min_update = 0

        for pos, tree_idx in enumerate(idx_list):
            capped = min(self.max_priority, priority_list[pos])
            self.sum_tree.update(tree_idx, capped)

        self.beta = min(1, self.beta + self.beta_increment_size)

    def __len__(self, ):
        """Return number of experiences in the replay buffer."""
        return len(self.memory)
Beispiel #16
0
class PriorityBuffer:
    # Inspired by implementation from: https://github.com/rlcode/per/blob/master/prioritized_memory.py

    def __init__(self, action_size, agent_config):
        """Initialize a PriorityBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            agent_config: object providing ``buffer_size``, ``batch_size``,
                ``buffer_epsilon``, ``alpha``, ``beta_start``, ``beta_end``
                and ``beta_max_steps``
        """
        self.action_size = action_size
        self.batch_size = agent_config.batch_size
        self.tree = SumTree(capacity=agent_config.buffer_size)
        # self.seed = random.seed(buffer_config.seed)
        self.epsilon = agent_config.buffer_epsilon
        # amount of prioritisation: 0 means uniform, 1 means priority only
        self.alpha = agent_config.alpha
        self.beta_start = agent_config.beta_start
        self.beta_end = agent_config.beta_end
        self.beta = agent_config.beta_start
        # Step that takes beta from beta_start to beta_end in beta_max_steps samples.
        beta_range = self.beta_end - self.beta_start
        self.beta_increment_per_sampling = beta_range / agent_config.beta_max_steps

    def add(self, sample, error):
        """Add a new experience to memory."""
        priority = self._get_priority(error)
        state, action, reward, next_state, done = sample
        self.tree.add(priority, Experience(state, action, reward, next_state, done))

    def _get_priority(self, error):
        """Return ``(|error| + epsilon) ** alpha``."""
        magnitude = abs(error) + self.epsilon
        return magnitude ** self.alpha

    def sample(self):
        """Draw a batch, one experience per equal slice of total priority.

        Entries that are not ``Experience`` instances are reported and
        skipped. ``beta`` is annealed after the weights are computed.

        Returns:
            ``((states, actions, rewards, next_states, dones), idxs, is_weight)``
        """
        segment = self.tree.total() / self.batch_size

        experiences, idxs, priorities = [], [], []
        for k in range(self.batch_size):
            point = random.uniform(segment * k, segment * (k + 1))
            idx, p, data = self.tree.get(point)
            if not isinstance(data, Experience):
                print("WHAT THE HECK !!!")
                continue
            priorities.append(p)
            experiences.append(data)
            idxs.append(idx)

        sampling_probabilities = priorities / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()

        def column(field):
            # Stack one field of every non-None experience row-wise.
            return np.vstack([getattr(e, field) for e in experiences if e is not None])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        dones = torch.from_numpy(column("done").astype(np.uint8)).float().to(device)

        self.beta = np.min([self.beta_end, self.beta + self.beta_increment_per_sampling])
        return (states, actions, rewards, next_states, dones), idxs, is_weight

    def update(self, idx, error):
        """Replace the priority stored at tree index ``idx``."""
        # Not required in normal ReplayBuffer
        priority = self._get_priority(error)
        self.tree.update(idx, priority)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.tree)
Beispiel #17
0
class PrioritizedExperienceReplayBuffer:
    """Fixed-size prioritized buffer of experience tuples."""

    alpha = 0.6
    beta = 0.4
    beta_increment_per_sample = 0.001
    epsilon = 1e-6

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize the buffer.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.batch_size = batch_size
        self.memory = SumTree(buffer_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so this attribute only records the call.
        self.seed = random.seed(seed)

    def compute_priority(self, td_error):
        """Return ``(td_error + epsilon) ** alpha``."""
        shifted = td_error + self.epsilon
        return shifted ** self.alpha

    def add(self, state, action, reward, next_state, done):
        """Add a new experience with the largest priority currently stored."""
        record = self.experience(state, action, reward, next_state, done)
        # The last `capacity` entries of the tree array are scanned for the
        # maximum; a zero maximum is replaced by 1.
        top = np.max(self.memory.tree[-self.memory.capacity:])
        if top == 0:
            top = 1.

        self.memory.add(top, record)

    def update(self, index, td_error):
        """Replace the priority stored at tree index ``index``."""
        self.memory.update(index, self.compute_priority(td_error))

    def sample(self):
        """Draw a batch, one experience per equal slice of total priority.

        :return: importance weights, indices of sampled experiences, and sampled batch of experiences
        """
        self.beta = np.minimum(1., self.beta + self.beta_increment_per_sample)
        segment = self.memory.total() / self.batch_size

        indexes, priorities, experiences = [], [], []
        for k in range(self.batch_size):
            # one draw from the k-th slice
            point = np.random.uniform(segment * k, segment * (k + 1))

            index, priority, experience = self.memory.get(point)
            indexes.append(index)
            priorities.append(priority)
            experiences.append(experience)

        sampling_probs = np.divide(priorities, self.memory.total())
        # importance-sampling weights, scaled so the largest is 1
        i_s_weights = (self.batch_size * sampling_probs) ** -self.beta
        i_s_weights /= np.max(i_s_weights)

        def column(field):
            # Stack one field of every non-None experience row-wise.
            return np.vstack([getattr(e, field) for e in experiences if e is not None])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        dones = torch.from_numpy(column("done").astype(np.uint8)).float().to(device)

        return i_s_weights, indexes, (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return self.memory.count
Beispiel #18
0
class MemoryDB:  # stored as ( s, a, r, s_ ) in SumTree
    """Prioritised replay memory persisted in MongoDB and indexed by a SumTree.

    Each experience is stored as one MongoDB document (a few plain fields plus
    the pickled experience under ``"binary"``). The SumTree only holds the
    priority and ``{"_id": ...}`` of each document; it is rebuilt from the
    persisted ``"priority"`` fields when the object is constructed.
    """

    # Offset added to |error| before exponentiation in _get_priority.
    e = 0.01
    # Exponent applied to (|error| + e) in _get_priority.
    a = 0.6
    # Importance-sampling exponent; sample() raises it towards 1.
    beta = 0.4
    # Amount added to beta on every sample() call.
    beta_increment_per_sampling = 0.001
    # Capacity handed to the SumTree.
    capacity = 100000
    # Error value used for experiences inserted through add_batch().
    max_priority = 1

    def __init__(self, host_name, db_name, collection_name, port=27017):
        """Connect to MongoDB and rebuild the SumTree from stored priorities.

        :param host_name: MongoDB host passed to ``MongoClient``
        :param db_name: database holding the replay collection
        :param collection_name: collection holding one document per experience
        :param port: MongoDB port (defaults to the previously hard-coded 27017)
        """
        self.host_name = host_name
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = MongoClient(host_name, port)
        self.db = self.client[db_name]
        self.replay_memory_collection = self.db[collection_name]
        self.sum_tree = SumTree(self.capacity)
        # Only the priority (and the implicit _id) is needed to rebuild the tree.
        memory_priorities = self.replay_memory_collection.find({},
                                                               {"priority": 1})
        for memory_priority in memory_priorities:
            self.sum_tree.add(memory_priority["priority"],
                              {"_id": memory_priority["_id"]})

    def retrieve_by_id(self, id):
        """Load one experience from the collection.

        :param id: ``_id`` of the document to load
        :return: the unpickled experience dict with ``"_id"`` set to ``id``
        """
        db_experiences = self.replay_memory_collection.find({"_id": id})
        # SECURITY: unpickling executes arbitrary code; this is only safe while
        # the collection is written exclusively by trusted code.
        stored = _pickle.loads(db_experiences[0]['binary'], encoding='latin1')
        return {**stored, "_id": id}

    def _get_priority(self, error):
        """Map an error to a priority: ``(|error| + e) ** a``.

        The absolute value matters: a negative error raised to the fractional
        power ``a`` would otherwise produce a complex (or NaN) priority.
        """
        return (abs(error) + self.e)**self.a

    def add(self, error, experience):
        """Persist one experience and register it in the SumTree.

        :param error: error used to derive the priority
        :param experience: dict with at least ``"terminal"``,
            ``"action_index"`` and ``"actual_reward"``; pickled as a whole
        """
        p = self._get_priority(error)
        experience_to_save = {
            "terminal": experience["terminal"],
            "action_index": experience["action_index"],
            "actual_reward": experience["actual_reward"],
            # Persist the same priority the tree receives so that the tree
            # rebuilt in __init__ matches the in-memory one.
            "priority": p,
            "binary": _pickle.dumps(experience),
        }
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        inserted_id = self.replay_memory_collection.insert_one(
            experience_to_save).inserted_id

        self.sum_tree.add(p, {"_id": inserted_id})

    def add_batch(self, experiences):
        """Add every experience in ``experiences`` using ``max_priority`` as error."""
        for experience in experiences:
            self.add(self.max_priority, experience)

    def update(self, index, error, experience):
        """Re-prioritise one experience in both MongoDB and the SumTree.

        :param index: tree index as returned by :meth:`sample`
        :param error: new error used to derive the priority
        :param experience: dict carrying the document ``"_id"``
        """
        p = self._get_priority(error)
        self.replay_memory_collection.update_one({"_id": experience["_id"]},
                                                 {"$set": {
                                                     "priority": p
                                                 }})
        self.sum_tree.update(index, p)

    def update_batch(self, indexes, errors, experiences):
        """Apply :meth:`update` to each ``(index, error, experience)`` triple."""
        for index, error, experience in zip(indexes, errors, experiences):
            self.update(index, error, experience)

    def get_experiences_size(self):
        """Return the number of documents in the replay collection."""
        # NOTE(review): Collection.count() was removed in PyMongo 4; switch to
        # count_documents({}) once the minimum driver version allows it.
        return self.replay_memory_collection.count()

    def sample(self, n):
        """Draw ``n`` experiences with probability proportional to priority.

        The total priority is split into ``n`` equal segments and one point is
        drawn uniformly from each. ``beta`` is increased by
        ``beta_increment_per_sampling`` (capped at 1) on every call.

        :param n: number of experiences to draw
        :return: ``(batch, idxs, is_weight)`` - the loaded experiences, their
            tree indices, and importance-sampling weights scaled so the
            largest weight is 1
        """
        batch = []
        idxs = []
        priorities = []
        total = self.sum_tree.total()
        segment = total / n

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.sum_tree.get(s)
            priorities.append(p)
            experience = self.retrieve_by_id(data["_id"])
            batch.append(experience)
            print(
                "action index: ",
                experience["action_index"],
                "reward: ",
                experience["actual_reward"],
                "priority: ",
                experience["priority"],
            )
            idxs.append(idx)

        # Convert explicitly: a plain list cannot be divided by a Python float.
        sampling_probabilities = np.asarray(priorities, dtype=float) / total
        is_weight = np.power(self.sum_tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight