Example #1
class PriorityReplayMemory(ReplayMemory):
    def __init__(self, size, alpha):
        super().__init__(size)
        self.alpha = alpha

        tree_cap = 1
        while tree_cap < size:
            tree_cap *= 2

        self.sum_tree = SumSegmentTree(tree_cap)
        self.max_priority = 1.0

    def add_mem(self, *args, **kwargs):
        idx = self.next_idx
        super().add_mem(*args, **kwargs)
        prio = self.max_priority**self.alpha
        self.sum_tree[idx] = prio

    def _sample_idxs(self, size):
        idxs = []
        for _ in range(size):
            mass = random.random() * self.sum_tree.sum(0, len(self.memory) - 1)
            idx = self.sum_tree.find_prefixsum_idx(mass)
            idxs.append(idx)
        return idxs

    def get_batch(self, size, beta):
        assert beta > 0
        idxs = self._sample_idxs(size)

        # importance-sampling weights: w_i = (N * P(i))^(-beta),
        # normalized by the largest weight in the batch
        weights = []
        tot_sum = self.sum_tree.sum()

        for idx in idxs:
            p_sample = self.sum_tree[idx] / tot_sum
            weight = (p_sample * len(self.memory))**(-beta)
            weights.append(weight)

        weights = np.array(weights)
        weights /= max(weights)

        batch = [self.memory[idx] for idx in idxs]
        return batch, weights, idxs

    def update_priorities(self, idxs, priorities):
        for idx, prio in zip(idxs, priorities):
            assert prio > 0
            prio_a = prio**self.alpha
            self.sum_tree[idx] = prio_a

            self.max_priority = max(self.max_priority, prio)
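A minimal usage sketch for the buffer above, assuming the base ReplayMemory stores (state, action, reward, next_state, done) tuples and that a hypothetical compute_td_errors helper and the usual loop variables (num_steps, warmup, etc.) exist; absolute TD errors plus a small constant become the new priorities:

import numpy as np

memory = PriorityReplayMemory(size=100_000, alpha=0.6)
PRIORITY_EPS = 1e-6  # keeps priorities strictly positive

for step in range(num_steps):
    # ... interact with the environment, then store the transition ...
    memory.add_mem(state, action, reward, next_state, done)

    if step > warmup:
        batch, weights, idxs = memory.get_batch(size=32, beta=0.4)
        td_errors = compute_td_errors(batch)       # hypothetical helper
        loss = np.mean(weights * td_errors ** 2)   # IS-weighted loss
        # ... backpropagate `loss` with whatever framework is in use ...
        memory.update_priorities(idxs, np.abs(td_errors) + PRIORITY_EPS)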
Example #2
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
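Every example in this collection assumes the SumSegmentTree/MinSegmentTree pair from OpenAI baselines. A stripped-down sketch of those trees is below; it keeps the power-of-two capacity requirement and the find_prefixsum_idx walk, but the range queries simply scan the leaves for brevity rather than using the O(log n) recursion of the real implementation:

import operator


class SegmentTree:
    """Fixed-capacity binary segment tree; leaves live at [capacity, 2*capacity)."""

    def __init__(self, capacity, operation, neutral):
        assert capacity > 0 and capacity & (capacity - 1) == 0, \
            "capacity must be a positive power of two"
        self._capacity = capacity
        self._op = operation
        self._tree = [neutral] * (2 * capacity)

    def __setitem__(self, idx, val):
        idx += self._capacity          # position of the leaf
        self._tree[idx] = val
        idx //= 2
        while idx >= 1:                # propagate the change up to the root
            self._tree[idx] = self._op(self._tree[2 * idx], self._tree[2 * idx + 1])
            idx //= 2

    def __getitem__(self, idx):
        return self._tree[self._capacity + idx]


class SumSegmentTree(SegmentTree):
    def __init__(self, capacity):
        super().__init__(capacity, operator.add, 0.0)

    def sum(self, start=0, end=None):
        """Sum of leaves start..end inclusive (simplified linear scan)."""
        if end is None:
            end = self._capacity - 1
        return sum(self._tree[self._capacity + start:self._capacity + end + 1])

    def find_prefixsum_idx(self, prefixsum):
        """Smallest leaf index whose inclusive prefix sum exceeds `prefixsum`."""
        idx = 1
        while idx < self._capacity:    # walk down from the root
            if self._tree[2 * idx] > prefixsum:
                idx = 2 * idx
            else:
                prefixsum -= self._tree[2 * idx]
                idx = 2 * idx + 1
        return idx - self._capacity


class MinSegmentTree(SegmentTree):
    def __init__(self, capacity):
        super().__init__(capacity, min, float("inf"))

    def min(self, start=0, end=None):
        """Minimum of leaves start..end inclusive (simplified linear scan)."""
        if end is None:
            end = self._capacity - 1
        return min(self._tree[self._capacity + start:self._capacity + end + 1])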
Example #3
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, memory_size=1000000, alpha=0.5, seed=None):
        '''
        Prioritized replay buffer from https://arxiv.org/pdf/1511.05952.pdf
        This implementation is based on the OpenAI sum-tree implementation, which can be found here
        https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
        
        memory_size: int
            maximum number of experiences to store
            
        alpha: float, [0.0, 1.0]
            hyperparameter that controls the amount of prioritization, with 0.0 being 
            no prioritization (the uniform case)
            
        seed: None or int
            random seed for the replay buffer
        '''
        super().__init__(memory_size=memory_size, seed=seed)
        self.alpha = alpha

        it_capacity = 1
        while it_capacity < self._memory_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, experience):
        '''
        Add an experience to the replay buffer
        
        experience: object, usually a tuple
            the experience to store in the replay buffer; this implementation
            does not specify a form for the experience, as that is handled by
            the DQN agent
        '''
        index = self._next_index
        super().add(experience)
        self._it_sum[index] = self._max_priority**self.alpha
        self._it_min[index] = self._max_priority**self.alpha

    def _sample_proportional(self, batch_size):
        '''
        Sample indices from the replay buffer with proportional prioritization.
        All code here follows the OpenAI implementation to correctly make use of the
        sum-tree data structure.

        batch_size: int
            the number of experiences to sample

        res: list
            list of indices of the experiences sampled from the replay buffer
        '''
        res = []
        p_total = self._it_sum.sum(0, len(self._memory) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            index = self._it_sum.find_prefixsum_idx(mass)
            res.append(index)

        return res

    def sample(self, batch_size, beta=1.0):
        '''
        Sample from the replay buffer with proportional prioritization
        
        batch_size: int
            the number of experiences to sample
            
        samples: list of 3-tuples
            list of sampled experiences, importance sampling weights for each experience, and
            the indices of the experiences (used to update priorities) in the form
            (experience, is_weights, indices)
        '''
        indices = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._memory))**(-beta)

        samples = []
        for i in indices:
            p_sample = self._it_sum[i] / self._it_sum.sum()
            is_weight = ((p_sample * len(self._memory))**(-beta)) / max_weight
            experience = self._memory[i]

            sample = (experience, is_weight, i)
            samples.append(sample)

        return samples

    def update_priorities(self, indices, priorities):
        '''
        Update the priorities for the experiences corresponding to the given indices
        
        indices: list-like
            list of indices for the experiences/priorities to update
            
        priorities: list-like
            list of new priorities corresponding to the given indices
        '''
        for i, priority in zip(indices, priorities):
            self._it_sum[i] = priority**self.alpha
            self._it_min[i] = priority**self.alpha
            self._max_priority = max(self._max_priority, priority)
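Example #3 defaults beta to 1.0, but the PER paper anneals the importance-sampling exponent from a small starting value toward 1.0 over training. A sketch of a linear schedule that could drive the beta argument of sample above (the constants are illustrative, not prescribed by this implementation):

def beta_by_step(step, beta_start=0.4, beta_steps=100_000):
    """Linearly anneal beta from beta_start to 1.0 over beta_steps steps."""
    return min(1.0, beta_start + step * (1.0 - beta_start) / beta_steps)

# e.g. samples = buffer.sample(batch_size=32, beta=beta_by_step(step))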
Example #4
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * self._it_sum.sum(0,
                                                      len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
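A quick numeric check of the importance-sampling weight used in sample above, w_i = (N * P(i))^(-beta) normalized by the largest possible weight, on a toy set of four priorities:

import numpy as np

p_alpha = np.array([4.0, 1.0, 1.0, 2.0])       # priorities after the alpha exponent
probs = p_alpha / p_alpha.sum()                # P(i) = [0.5, 0.125, 0.125, 0.25]
beta, n = 0.5, len(p_alpha)

weights = (probs * n) ** (-beta)               # (N * P(i))^-beta
max_weight = (probs.min() * n) ** (-beta)      # weight of the rarest transition
print(weights / max_weight)                    # [0.5, 1.0, 1.0, ~0.707]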
Example #5
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """
        Prioritied Experience Replay 
      
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        # round the capacity up to the next power of two, as required by the
        # segment trees used for sampling
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        idx = self._idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._buffer) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._buffer))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._buffer))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """
        set priority of transition at index idxes[i] in buffer to priorities[i]
        """

        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._buffer)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
Example #6
class ProportionalReplay(ExperienceReplay):
    def __init__(self, size, alpha):
        super(ProportionalReplay, self).__init__(size)
        assert alpha >= 0
        self.alpha = alpha

        self.tree_size = 1
        while self.tree_size < self.maxsize:
            self.tree_size *= 2

        self.min_tree = MinSegmentTree(self.tree_size)  # for calculating the maximum IS weight
        self.sum_tree = SumSegmentTree(self.tree_size)  # for proportional sampling
        self.max_priority = 1.0  # maximum priority seen so far; updated as new priorities arrive

    def add(self, experience):
        idx = self.next_idx  # save idx before it's changed in super call
        super().add(experience)  # put experience data (s,a,r,s',done) in the buffer

        # give new experience max priority to ensure it's replayed at least once
        self.min_tree[idx] = self.max_priority**self.alpha
        self.sum_tree[idx] = self.max_priority**self.alpha

    # To sample a minibatch of size k, the range [0, p_total] is divided equally into k ranges.
    # Next, a value is uniformly sampled from each range.
    def sample_proportional(self, batch_size):
        idxs = []
        # sum of the priorities of all experience in the buffer
        p_total = self.sum_tree.sum(0, len(self.buffer) - 1)
        # length of every range over [0, p_total] (batch_size = k)
        every_range_len = p_total / batch_size
        for i in range(batch_size):  # for each range
            # uniformly sample a probability mass from this range
            mass = self.np_random.uniform() * every_range_len + i * every_range_len
            # get the smallest experience index s.t. cumulative dist F(idx) >= mass
            idx = self.sum_tree.find_prefixsum_idx(mass)
            idxs.append(idx)
        return idxs

    # sample batch of experiences along with their weights and indices
    def sample(self, batch_size, beta):
        assert beta > 0
        idxs = self.sample_proportional(batch_size)  # sampled experience indices

        weights = []
        # minimum possible normalized priority of a transition
        p_min = self.min_tree.min() / self.sum_tree.sum()
        # (p_uniform / p_min)^beta is the maximum possible IS weight
        max_weight = (p_min * len(self.buffer))**(-beta)

        # get IS weights for the sampled experience
        for idx in idxs:
            # normalize the sampled priority
            p_sample = self.sum_tree[idx] / self.sum_tree.sum()
            # (p_uniform / p_sample)^beta is the IS weight
            weight = (p_sample * len(self.buffer))**(-beta)
            # weights are normalized by the max so that they only scale the update downwards
            weights.append(weight / max_weight)
        weights = np.array(weights)

        # collect experience at the given indices
        encoded_sample = self.encode_samples(idxs)
        return tuple(list(encoded_sample) + [weights, idxs])

    # set the priorities of experiences at given indices
    def update_priorities(self, idxs, priorities):
        assert len(idxs) == len(priorities)
        for idx, priority in zip(idxs, priorities):
            assert priority > 0
            assert 0 <= idx < len(self.buffer)
            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha

            self.max_priority = max(self.max_priority, priority)
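Across these examples two ways of drawing the priority mass appear: a single uniform draw over the whole cumulative range (as in Example #1) and the stratified per-segment draw used above. Side by side, as standalone sketches over any SumSegmentTree-like object:

import random

def sample_uniform_mass(sum_tree, n_items, batch_size):
    """One uniform draw over the whole cumulative range [0, p_total) per sample."""
    p_total = sum_tree.sum(0, n_items - 1)
    return [sum_tree.find_prefixsum_idx(random.random() * p_total)
            for _ in range(batch_size)]

def sample_stratified_mass(sum_tree, n_items, batch_size):
    """Split [0, p_total) into batch_size equal segments and draw one mass per
    segment, which spreads the samples across the whole priority range."""
    p_total = sum_tree.sum(0, n_items - 1)
    segment = p_total / batch_size
    return [sum_tree.find_prefixsum_idx(i * segment + random.random() * segment)
            for i in range(batch_size)]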
Example #7
class PrioritizedReplayBuffer(ReplayBuffer):
    """Fixed-size prioritized buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6, beta=0.5, device="cpu"):
        """Initialize a PrioritizedReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            alpha (float): how much prioritization is used (0 - no prioritization, 1 - full prioritization)
            beta (float): To what degree to use importance weights (0 - no corrections, 1 - full correction)
        """
        super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size, batch_size, seed, device=device)

        self.alpha = alpha
        self.beta = beta
        self._eps = 0.00000001

        it_capacity = 1
        while it_capacity < buffer_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        idx = self._next_idx
        super().add(state, action, reward, next_state, done)

        self._it_sum[idx] = self._max_priority ** self.alpha
        self._it_min[idx] = self._max_priority ** self.alpha

    def _sample_proportional(self):
        res = []
        p_total = self._it_sum.sum(0, len(self.memory) - 1)
        every_range_len = p_total / self.batch_size
        for i in range(self.batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self):
        idxes = self._sample_proportional()

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self.memory) + self._eps) ** (-self.beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self.memory) + self._eps) ** (-self.beta)
            weights.append(weight / max_weight)

        weights = torch.tensor(weights, device=self.device, dtype=torch.float)

        states = torch.from_numpy(np.vstack([self.memory[i].state for i in idxes])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([self.memory[i].action for i in idxes])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([self.memory[i].reward for i in idxes])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([self.memory[i].next_state for i in idxes])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([self.memory[i].done for i in idxes]).astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones, idxes, weights)

    def update_priorities(self, indexes, priorities):
        """Update priorities of sampled transitions.
        sets priority of transition at index indexes[i] in buffer
        to priorities[i].
        Parameters
        ----------
        indexes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        for idx, priority in zip(indexes, priorities):
            self._it_sum[idx] = priority ** self.alpha
            self._it_min[idx] = priority ** self.alpha

            self._max_priority = max(self._max_priority, priority)
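A hedged sketch of how the tensors returned by this buffer's sample() would typically feed a DQN update; buffer is an instance constructed as above, while q_net, target_net, optimizer, and gamma are assumed to exist and are not part of the buffer:

import torch
import torch.nn.functional as F

states, actions, rewards, next_states, dones, idxes, weights = buffer.sample()

q_values = q_net(states).gather(1, actions)                    # Q(s, a)
with torch.no_grad():
    next_q = target_net(next_states).max(1, keepdim=True)[0]   # max_a' Q_target(s', a')
    targets = rewards + gamma * next_q * (1.0 - dones)

elementwise = F.smooth_l1_loss(q_values, targets, reduction="none").squeeze(1)
loss = (weights * elementwise).mean()                          # IS-weighted loss

optimizer.zero_grad()
loss.backward()
optimizer.step()

# new priorities: |TD error| plus a small constant to keep them positive
new_priorities = (targets - q_values).abs().detach().cpu().numpy().flatten() + 1e-8
buffer.update_priorities(idxes, new_priorities)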
Example #8
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self,
                 size,
                 state_shape,
                 alpha,
                 n_batch_trajectories,
                 n_trajectory_steps,
                 n_emus=1):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        dsize: int
            Max number of demonstration transitions. These are retained in the 
            buffer permanently.
            https://arxiv.org/abs/1704.03732
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size,
                                                      state_shape,
                                                      n_batch_trajectories,
                                                      n_trajectory_steps,
                                                      n_emus=n_emus)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < self._size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.add_effect"""
        idx = super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * self._it_sum.sum(0, self._size - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return np.array(res)

    def _compute_weights(self, idxes, beta):
        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * self._size)**(-beta)

        for idx in idxes:
            if idx < 0:
                weights.append(0.0)
                continue
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * self._size)**(-beta)
            weights.append(weight / max_weight)

        return np.array(weights)

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample
        it also returns importance weights and idxes
        of sampled experiences.


        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)
        batch_samples = self._retrieve_samples(idxes)
        weights = self._compute_weights(idxes, beta)
        return tuple(list(batch_samples) + [weights, idxes])

    def sample_nstep(self, beta):
        """Sample a (self.n_batch_trajectories, self.n_trajectory_steps, n_s) batch of states, where n_s is the dimension of the 
         state vector

        Compared to ReplayBuffer.sample
        it also returns importance weights and idxes
        of sampled experiences.


        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)
        n_step: int
            How many steps to look into the future

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        n_step_rewards_batch: np.array
            n-step rewards vector batch
        tpn_obs_batch: np.array
            tpn set of observations
        n_tpn_step_batch: np.array
            n in n-step indicator to indicate if trajectory sampled 
            is unfinished or done -- trajectory is unfinished if 
            there are no more transitions to cover all n steps
        n_step_done_mask: np.array
            n_step_done_mask[i] = 1 if trajectory sampled reaches 
            the end of an episode, and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        traj_idxes = self._sample_proportional(self.n_batch_trajectories)
        batched_trajectories, idxes = self._retrieve_n_step_trajectories(
            traj_idxes)

        weights = self._compute_weights(idxes, beta)
        return batched_trajectories + (weights, idxes)

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            if idx < 0:
                continue
            assert priority > 0
            assert 0 <= idx < self._size
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha
            self._max_priority = max(self._max_priority, priority)
Example #9
class PriorityBuffer(BufferBase):
    def __init__(self, capacity, gamma=0.99, n_steps=2, alpha=0.5):
        super(PriorityBuffer, self).__init__(capacity, gamma, n_steps)
        self.buffer = []
        self.position = 0
        self.alpha = alpha
        it_cap = 1
        while it_cap < capacity:
            it_cap *= 2
        self._it_sum = SumSegmentTree(it_cap)
        self._it_min = MinSegmentTree(it_cap)
        self._max_priority = 1.0

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        position = self.position
        if len(self.buffer) < self.capacity:
            self.buffer.append(experience)
        else:
            self.buffer[position] = experience
        self.position = (position + 1) % self.capacity
        self._it_sum[position] = self._max_priority**self.alpha
        self._it_min[position] = self._max_priority**self.alpha

    def _sample_proportional(self, batch_size):
        total = self._it_sum.sum(0, len(self.buffer) - (1 + self.n_steps))
        mass = np.random.random(size=batch_size) * total
        idx = self._it_sum.find_prefix_sum_idx(mass)
        return idx

    def sample(self, batch_size, beta=0.4):
        assert beta > 0
        indices = self._sample_proportional(batch_size)

        states = []
        actions = []
        rewards = []
        dones = []
        next_states = []
        for index in indices:
            current_buffer = self.buffer[index]
            current_state = current_buffer[0]
            current_action = current_buffer[1]
            current_done = current_buffer[3]
            reward = current_buffer[2]
            next_state = self.buffer[index + self.n_steps][0]
            for sub_index in range(1, self.n_steps):
                reward += self.buffer[index + sub_index][2] * (self.gamma**
                                                               sub_index)
                if self.buffer[index + sub_index][3]:
                    break
            states.append(ar(current_state, dtype=np.float32))
            actions.append(current_action)
            rewards.append(reward)
            dones.append(current_done)
            next_states.append(ar(next_state, dtype=np.float32))

        sm = self._it_sum.sum()
        p_min = self._it_min.min() / sm
        max_weight = (p_min * len(self.buffer))**(-beta)
        p_sample = self._it_sum[indices] / sm
        weights = (p_sample * len(self.buffer))**(-beta) / max_weight

        states_np = np.stack(states, 0) / 255.0
        next_states_np = np.stack(next_states, 0) / 255.0

        return states_np, ar(actions), ar(rewards, dtype=np.float32), ar(dones, dtype=np.uint8), next_states_np, \
               ar(weights, dtype=np.float32), indices

    def update_weights(self, batch_indices, batch_priorities):
        assert len(batch_indices) == len(batch_priorities)
        assert np.min(batch_priorities) > 0
        assert np.min(batch_indices) >= 0
        for idx, prio in zip(batch_indices, batch_priorities):
            idx = int(idx)
            self._it_sum[idx] = prio**self.alpha
            self._it_min[idx] = prio**self.alpha
        self._max_priority = max(self._max_priority, np.max(batch_priorities))
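The n-step return that PriorityBuffer.sample accumulates inline can be factored out as a standalone helper for readability (a sketch; transitions are indexed as (state, action, reward, done, ...) tuples, matching the indexing used in the class above):

def n_step_return(buffer, index, n_steps, gamma):
    """Discounted sum of up to n_steps rewards starting at `index`, stopping
    early at an episode boundary, mirroring the loop in PriorityBuffer.sample."""
    g = buffer[index][2]                      # immediate reward
    for k in range(1, n_steps):
        g += buffer[index + k][2] * gamma ** k
        if buffer[index + k][3]:              # done flag ends the trajectory
            break
    return g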
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6):
        super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size, batch_size, seed)
        
        #capacity must be positive and a power of 2
        tree_capacity = 1
        while tree_capacity < self.buffer_size:
            tree_capacity *= 2
        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha
        
    def add(self, state, action, reward, next_state, done):
        
        self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha
        self.min_tree[self.tree_ptr] = self.max_priority**self.alpha
        super().add(state, action, reward, next_state, done)
        self.tree_ptr = (self.tree_ptr + 1) % self.buffer_size
        
    def sample(self, beta=0.4):
        indices = self._sample_proportional()
        
        indices = [index for index in indices if index<len(self.memory)]
        states = torch.from_numpy(np.vstack([self.memory[index].state for index in indices])).float().to(device)
        actions = torch.from_numpy(np.vstack([self.memory[index].action for index in indices])).long().to(device)
        rewards = torch.from_numpy(np.vstack([self.memory[index].reward for index in indices])).float().to(device)
        next_states = torch.from_numpy(np.vstack([self.memory[index].next_state for index in indices])).float().to(device)
        dones = torch.from_numpy(np.vstack([self.memory[index].done for index in indices]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(np.vstack([self._cal_weight(index, beta) for index in indices])).float().to(device)
         
        return (states, actions, rewards, next_states, dones, weights, indices)
        
    def update_priority(self, indices, loss_for_prior):
        for idx, priority in zip(indices, loss_for_prior):
            self.sum_tree[idx] = priority ** self.alpha
            self.min_tree[idx] = priority ** self.alpha
            
            self.max_priority = max(self.max_priority, priority)
        
    def _sample_proportional(self):
        indices = []
        p_total = self.sum_tree.sum() #sum(0, len(self.memory)-1)
        segment = p_total / self.batch_size
        
        for i in range(self.batch_size):
            start = segment * i
            end = start + segment
            upper = random.uniform(start, end)
            index = self.sum_tree.retrieve(upper)
            indices.append(index)
        return indices
    
    def _cal_weight(self, index, beta):
        # Equivalent to the full IS-weight form
        # (N * p_i / p_total)^(-beta) / (N * p_min / p_total)^(-beta):
        # the buffer length and the total priority cancel, leaving (p_min / p_i)^beta.
        min_priority = self.min_tree.min()
        current_priority = self.sum_tree[index]
        return (min_priority / current_priority) ** beta
Example #11
class PERBuffer:

    # Simple class that holds the different types of memory
    class Memory:
        # Expects all shapes to be tuples, size to be an integer
        def __init__(self, state_shape, action_shape, size):
            self.states = np.zeros((size, ) + state_shape)
            self.actions = np.zeros((size, ) + action_shape)
            self.rewards = np.zeros(size)
            self.next_states = np.zeros((size, ) + state_shape)
            self.dones = np.zeros(size)
            self.size = size

        # memory[i] will return a tuple of the entire memory @ i
        def __getitem__(self, key):
            return (self.states[key], self.actions[key], self.rewards[key],
                    self.next_states[key], self.dones[key])

        # Provides a quick way of updating multiple
        # parts of memory at a specific index
        def update(self,
                   indx,
                   state=None,
                   action=None,
                   reward=None,
                   next_state=None,
                   done=None):
            self.states[indx] = state
            self.actions[indx] = action
            self.rewards[indx] = reward
            self.next_states[indx] = next_state
            self.dones[indx] = done

        # An alternative to __getitem__, returns dict instead
        def get(self, key):
            rtn = {
                "states": self.states[key],
                "actions": self.actions[key],
                "rewards": self.rewards[key],
                "next_states": self.next_states[key],
                "dones": self.dones[key]
            }
            return rtn

    # Creates the replay buffer
    def __init__(self,
                 state_shape,
                 action_shape,
                 size,
                 alpha=0.6,
                 beta=0.4,
                 beta_delta=0.001,
                 epsilon=0.01):
        self.memory = self.Memory(state_shape, action_shape, size)
        self.counter = 0
        self.size = self.memory.size
        # Segment trees (capacity rounded up to the next power of two, as the
        # baselines-style segment trees require)
        tree_capacity = 1
        while tree_capacity < self.size:
            tree_capacity *= 2
        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)
        # P.E.R. hyperparameters
        self.alpha = alpha
        self.beta = beta
        self.beta_delta = beta_delta
        self.epsilon = epsilon
        self.max_priority = 1.0

    # Samples indexes from memory in accordance with their priority
    # (stratified proportional sampling: one draw per equal-width segment)
    def sample_indexes(self, batch_size, max_memory):
        sample_indexes = np.zeros(shape=batch_size)
        # Length of each of the batch_size segments over the used priority mass
        segment_len = self.sum_tree.sum(0, max_memory - 1) / batch_size
        # Draw one prefix-sum value per segment and map it to a buffer index
        for i in range(batch_size):
            val = random.random() * segment_len + i * segment_len
            indx = self.sum_tree.find_prefixsum_idx(val)
            sample_indexes[i] = indx
        return sample_indexes

    # Stores new memory at looping index
    def store(self, state, action, reward, next_state, done):
        indx = self.counter % self.size
        self.memory.update(indx, state, action, reward, next_state, done)
        # Gets the priority alpha for the newly added sample
        priority_alpha = self.max_priority**self.alpha
        # Adds this to the sum and min trees
        self.sum_tree[indx] = priority_alpha
        self.min_tree[indx] = priority_alpha
        # Updates the counter
        self.counter += 1

    # Samples the memory from filled parts of the buffer
    # Returns a tuple (states, actions, rewards, next_states, dones, weights)
    def miniBatch(self, batch_size):
        max_memory = min(self.counter, self.size)
        # Samples the indexes according to their importance
        batch_indxs = self.sample_indexes(batch_size, max_memory)
        batch_indxs = np.int_(batch_indxs)
        # Gets the weights
        weights = np.zeros(shape=batch_size)
        prob_min = self.min_tree.min() / (self.sum_tree.sum() + self.epsilon)
        max_weight = (prob_min * max_memory)**(-self.beta)
        for i in range(0, len(batch_indxs)):
            prob = self.sum_tree[batch_indxs[i]] / \
                (self.sum_tree.sum() + self.epsilon)
            weight = (prob * max_memory)**(-self.beta)
            weight_norm = weight / (max_weight + self.epsilon)
            weights[i] = weight_norm
        # Updates beta
        self.beta = min(1.0, self.beta + self.beta_delta)
        # Returns memory and weights and idxs
        return self.memory[batch_indxs] + (weights, ) + (batch_indxs, )

    # For the given indexes and priorities, updates the trees and the running max
    def update_priorities(self, indxs, priorities):
        for indx, priority in zip(indxs, priorities):
            priority = float(priority)  # accept scalars or length-1 arrays
            priority_alpha = priority**self.alpha
            self.sum_tree[indx] = priority_alpha
            self.min_tree[indx] = priority_alpha
            # Keep the running maximum priority up to date
            self.max_priority = max(self.max_priority, priority)
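A quick numeric illustration of what the alpha exponent used throughout these buffers does: alpha = 0 collapses the priorities to a uniform sampling distribution, alpha = 1 uses them as-is, and values in between (such as the 0.6 default above) interpolate:

import numpy as np

priorities = np.array([8.0, 2.0, 1.0, 1.0])
for alpha in (0.0, 0.5, 1.0):
    p_alpha = priorities ** alpha
    print(alpha, p_alpha / p_alpha.sum())
# 0.0 -> [0.25  0.25  0.25  0.25 ]
# 0.5 -> [0.453 0.227 0.160 0.160]
# 1.0 -> [0.667 0.167 0.083 0.083]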
Example #12
class ReplayMemory:
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha
        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        # index of the slot this experience is written to
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize
        # new experiences get the maximum priority so they are sampled at least once
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha


    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        # note: this implementation renormalizes the weights by their sum
        # rather than by the maximum weight
        weights /= np.sum(weights)
        ret = []
        for i in range(batch_size):
            ret.append(self._storage[idxes[i]])
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            #print priority
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha

            self._max_priority = max(self._max_priority, priority)
Example #13
class PrioritizedReplay(Replay):
    def __init__(self, learner_config, env_config, session_config):
        """
        Create Prioritized Replay buffer.
        :param size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        :param alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)
        """
        super(PrioritizedReplay, self).__init__(learner_config=learner_config,
                                                env_config=env_config,
                                                session_config=session_config)

        self._alpha = self.replay_config.alpha
        assert self._alpha > 0

        self._memory = []
        self.memory_size = self.replay_config.memory_size
        it_capacity = 1
        while it_capacity < self.memory_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def default_config(self):
        conf = super().default_config()
        conf.update({
            'memory_size': '_int_',
            'sampling_start_size': '_int_',
            'alpha': '_float_',
        })
        return conf

    def insert(self, exp_dict):
        """
        Adds experience to the replay buffer as usual, but also
        initializes the priority of the new experience.
        """
        with self.insert_time.time():
            # index of the slot this experience is written to
            idx = self._next_idx
            if self._next_idx >= len(self._memory):
                self._memory.append(exp_dict)
            else:
                self._memory[self._next_idx] = exp_dict
            self._next_idx = (self._next_idx + 1) % self.memory_size

            self._it_sum[idx] = self._max_priority**self._alpha
            self._it_min[idx] = self._max_priority**self._alpha

    def sample(self, batch_size, beta=0):
        """
        WARNING: This function does not make deep copies of the tuple experiences.
                 This means that if any objects in the experiences are modified,
                 the contents of the replay buffer memory will also be modified,
                 so be careful!!!
        Sample a batch of experiences, along with their importance weights, and the 
        indices of the sampled experiences in the buffer.
        :param batch_size: int
            How many transitions to sample.
        :param beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)
        :return experience_batch: List
            List of tuples, length batch_size, corresponding to the experiences sampled.
        :return weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        :return indices: np.array
            Array of shape (batch_size,) and dtype np.int32
            indices in buffer of sampled experiences
        """
        with self.sample_time.time():
            assert beta >= 0

            # sample the experiences proportional to their priorities
            indices = self._sample_proportional(batch_size)
            response = [self._memory[idx] for idx in indices]

            # compute importance weights for the experiences to correct for distribution shift
            weights = []
            p_min = self._it_min.min() / self._it_sum.sum()
            max_weight = (p_min * len(self._memory))**(-beta)

            for idx in indices:
                p_sample = self._it_sum[idx] / self._it_sum.sum()
                weight = (p_sample * len(self._memory))**(-beta)
                weights.append(weight / max_weight)
            weights = np.array(weights)

        return response, weights, indices

    def _sample_proportional(self, batch_size):
        """
        This is a helper function to sample experiences with probabilities
        proportional to their priorities.
        Returns a list of indices.
        """
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._memory) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def update_priorities(self, indices, priorities):
        """
        Update priorities of sampled transitions.
        sets priority of transition at index indices[i] in buffer
        to priorities[i].
        :param indices: [int]
            List of indices of sampled transitions
        :param priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled indices denoted by
            variable `indices`.
        """
        assert len(indices) == len(priorities)
        for idx, priority in zip(indices, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._memory)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)

    def evict(self):
        raise NotImplementedError  # TODO
        # if evict_size > len(self._memory):
        #     evicted = self._memory
        #     self._memory = []
        #     self._next_idx = 0
        #     return evicted
        # forward_space = len(self._memory) - self._next_idx
        # if evict_size < forward_space:
        #     evicted = self._memory[self._next_idx:self._next_idx+evict_size]
        #     del self._memory[self._next_idx:self._next_idx+evict_size]
        # else:
        #     evicted = self._memory[self._next_idx:]
        #     evict_from_left = evict_size - forward_space
        #     evicted += self._memory[:evict_from_left]
        #     del self._memory[self._next_idx:]
        #     del self._memory[:evict_from_left]
        #     self._next_idx -= evict_from_left
        # assert len(evicted) == evict_size
        # return evicted

    def start_sample_condition(self):
        return len(self) > self.replay_config.sampling_start_size

    def __len__(self):
        return len(self._memory)
class PrioritizedReplayBuffer(SimpleReplayBuffer):
    def __init__(self, MAX_LEN, alpha: float = 0.6):
        """
        Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        SimpleReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(MAX_LEN)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < MAX_LEN:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, experience: Experience):
        """See SimpleReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(experience)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size: int) -> List[int]:
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(
        self,
        batch_size: int,
        beta: float = 0.4
    ) -> Tuple[List[Experience], np.ndarray, List[int]]:  # type: ignore
        """Sample a batch of experiences.

        compared to SimpleReplayBuffer.sample
        it also returns importance weights and idxes
        of sampled experiences.


        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        experiences: List[Experience]
            batch of experiences
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return (encoded_sample, weights, idxes)

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample
        it also returns importance weights and idxes
        of sampled experiences.


        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha

            self._max_priority = max(self._max_priority, priority)
Example #16
class ReplayMemory(ReplayBuffer):
    def __init__(self, size, alpha):
        super().__init__(size, N_Step_Transition)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def put(self, transitions, priorities):
        idxes = []
        for transition in transitions:
            idx = self.next_idx
            super().put(transition)
            idxes.append(idx)
        self.update_priorities(idxes, priorities)

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self.buffer) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def _encode_sample(self, idxes):
        s_lst, a_lst, r_lst, next_state_lst, done_mask_lst = [], [], [], [], []

        n_step_transitions = N_Step_Transition(*zip(
            *[self.buffer[index] for index in idxes]))

        # 'S_t', 'A_t', 'R_ttpB', 'Gamma_ttpB', 'qS_t', 'S_tpn', 'qS_tpn', 'key'
        S_t = np.array(n_step_transitions.S_t)
        S_tpn = np.array(n_step_transitions.S_tpn)
        R_ttpB = np.array(n_step_transitions.R_ttpB)
        gamma_ttpB = np.array(n_step_transitions.Gamma_ttpB)
        qS_tpn = np.array(n_step_transitions.qS_tpn)
        A_t = np.array(n_step_transitions.A_t, dtype=np.int64)
        qS_t = np.array(n_step_transitions.qS_t)
        key = np.array(n_step_transitions.key)

        return S_t, A_t, R_ttpB, S_tpn, gamma_ttpB, qS_tpn, qS_t, key

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample, this also returns the importance
        weights and idxes of the sampled experiences.


        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in the buffer of the sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self.buffer))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self.buffer))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.
        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self.buffer)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)

    def remove_old_experience(self):
        if self.size() > self.maxsize:
            num_excess = self.size() - self.maxsize

            # FIFO
            del self.buffer[:num_excess]
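
An Ape-X-style usage sketch for the memory above, where the actor supplies one initial priority per n-step transition; the transition and TD-error variables here are illustrative assumptions only:

# Sketch only: actor_transitions, initial_td_errors, and updated_td_errors_abs
# are placeholders produced elsewhere (e.g. by an actor process).
memory = ReplayMemory(size=2 ** 20, alpha=0.6)

priorities = [abs(e) + 1e-6 for e in initial_td_errors]  # must be > 0
memory.put(actor_transitions, priorities)

# sample() returns the 8 encoded fields followed by weights and idxes
*fields, weights, idxes = memory.sample(batch_size=64, beta=0.4)
S_t, A_t, R_ttpB, S_tpn, gamma_ttpB, qS_tpn, qS_t, key = fields

memory.update_priorities(idxes, updated_td_errors_abs + 1e-6)
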
Example #17
0
class PrioritizedReplayMemory:
    def __init__(self, size, alpha=0.6, beta_start=0.4, beta_frames=100000):
        super(PrioritizedReplayMemory, self).__init__()
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.frame = 1

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def beta_by_frame(self, frame_idx):
        return min(
            1.0, self.beta_start + frame_idx *
            (1.0 - self.beta_start) / self.beta_frames)

    def push(self, state, action, reward, next_state, done):
        idx = self._next_idx
        exp = self.experience(state, action, reward, next_state, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(exp)
        else:
            self._storage[self._next_idx] = exp
        self._next_idx = (self._next_idx + 1) % self._maxsize

        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _encode_sample(self, idxes):
        states = torch.from_numpy(
            np.array([self._storage[i].state
                      for i in idxes])).float().to(device)
        actions = torch.from_numpy(
            np.array([self._storage[i].action
                      for i in idxes])).float().to(device)
        rewards = torch.from_numpy(
            np.array([self._storage[i].reward
                      for i in idxes])).float().to(device)
        next_states = torch.from_numpy(
            np.array([self._storage[i].next_state
                      for i in idxes])).float().to(device)
        dones = torch.from_numpy(
            np.array([self._storage[i].done
                      for i in idxes]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0,
                                                      len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size):
        idxes = self._sample_proportional(batch_size)

        weights = []

        #find smallest sampling prob: p_min = smallest priority^alpha / sum of priorities^alpha
        p_min = self._it_min.min() / self._it_sum.sum()

        beta = self.beta_by_frame(self.frame)
        self.frame += 1

        #max_weight given to smallest prob
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = torch.tensor(weights, device=device, dtype=torch.float)
        encoded_sample = self._encode_sample(idxes)
        return encoded_sample, idxes, weights

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = (priority + 1e-5)**self._alpha
            self._it_min[idx] = (priority + 1e-5)**self._alpha

            self._max_priority = max(self._max_priority, (priority + 1e-5))
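
A rough sketch of driving the variant above, which anneals beta internally and adds its own 1e-5 when priorities are updated, so plain absolute TD errors can be passed back; the network, device, and TD-error helper are placeholders:

# Sketch only: `device`, the transition variables, and compute_td_errors are
# assumed to exist elsewhere.
memory = PrioritizedReplayMemory(size=100000, alpha=0.6,
                                 beta_start=0.4, beta_frames=100000)
memory.push(state, action, reward, next_state, done)  # one transition per step

(states, actions, rewards, next_states, dones), idxes, weights = memory.sample(32)
td_errors = compute_td_errors(states, actions, rewards, next_states, dones)
# update_priorities adds 1e-5 itself, so raw |TD errors| are acceptable here
memory.update_priorities(idxes, td_errors.abs().detach().cpu().numpy())
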
Example #18
0
class ReplayMemory:
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha
        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        #new_data = []
        #for i in data:
        #    i.wait_to_read()
        #    new_data.append(copyto(i))

        # Record the slot being written before the write pointer advances so
        # the initial priority is attached to the transition just added.
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0,
                                                      len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        #print self._it_min.min(), weights
        weights = np.array(weights)
        weights /= np.sum(weights)
        ret = []
        for i in range(batch_size):
            ret.append(self._storage[idxes[i]])
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        #print priorities, np.sum(priorities)
        for idx, priority in zip(idxes, priorities):
            #print priority
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
Example #19
0
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, buffer_size, input_dim, batch_size, alpha):

        super(PrioritizedReplayBuffer, self).__init__(buffer_size, input_dim,
                                                      batch_size)

        # For PER. Parameter settings.
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        tree_capacity = 1
        while tree_capacity < self.buffer_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)

    def store(self, state: np.ndarray, action: int, reward: float,
              next_state: np.ndarray, done: int):

        super().store(state, action, reward, next_state, done)

        self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha
        self.min_tree[self.tree_ptr] = self.max_priority**self.alpha
        self.tree_ptr = (self.tree_ptr + 1) % self.buffer_size

    def batch_load(self, beta):

        # Fetching the indices could be parallelized; the same call could also return the weights.
        indices = self._sample_proportional_indices()

        weights = np.array(
            [self._calculate_weight(idx, beta) for idx in indices])

        return dict(states=self.state_buffer[indices],
                    actions=self.action_buffer[indices],
                    rewards=self.reward_buffer[indices],
                    next_states=self.next_state_buffer[indices],
                    dones=self.done_buffer[indices],
                    weights=weights,
                    indices=indices)

    def update_priorities(self, indices, priorities):

        # This loop could also be parallelized.
        for idx, priority in zip(indices, priorities):

            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha

            self.max_priority = max(self.max_priority, priority)

    def _sample_proportional_indices(self):

        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        # Could be parallelized, e.g. with multiprocessing.
        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            sample = np.random.uniform(a, b)
            idx = self.sum_tree.retrieve(sample)  # tree index for this sampled mass
            indices.append(idx)

        return indices

    def _calculate_weight(self, idx, beta):

        # This could be computed once per batch instead of once per index.
        p_min = self.min_tree.min() / self.sum_tree.sum()
        max_weight = (p_min * len(self))**(-beta)

        p_sample = self.sum_tree[idx] / self.sum_tree.sum()
        weight = (p_sample * len(self))**(-beta)
        weight /= max_weight
        return weight
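
A brief sketch of the dict-returning API above; the constructor arguments and the TD-error helper are assumptions for illustration, not values from the source:

import numpy as np

# Sketch only: the parent ReplayBuffer, compute_td_errors, and the transition
# variables are assumed to be defined elsewhere.
per_buffer = PrioritizedReplayBuffer(buffer_size=100000, input_dim=4,
                                     batch_size=32, alpha=0.6)
per_buffer.store(state, action, reward, next_state, done)

batch = per_buffer.batch_load(beta=0.4)
td_errors = compute_td_errors(batch['states'], batch['actions'], batch['rewards'],
                              batch['next_states'], batch['dones'])
# weight the per-sample loss by batch['weights'], then refresh priorities
per_buffer.update_priorities(batch['indices'], np.abs(td_errors) + 1e-6)
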
Example #20
0
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""
    def __init__(self, action_size, buffer_size, batch_size, alpha):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            alpha (float): PER prioritization exponent alpha
        """
        self.max_priority = 1.0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        self.tree_capacity = 1
        while self.tree_capacity < buffer_size:
            self.tree_capacity *= 2

        self.sum_tree = SumSegmentTree(self.tree_capacity)
        self.min_tree = MinSegmentTree(self.tree_capacity)

        self.action_size = action_size
        self.memory = []
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, t, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)

        idx = t % self.tree_capacity
        if t >= self.tree_capacity:
            self.memory[idx] = e
        else:
            self.memory.append(e)

        # insert experience index in priority tree
        self.sum_tree[idx] = self.max_priority**self.alpha
        self.min_tree[idx] = self.max_priority**self.alpha

    def sample(self, beta):
        """Sampling a batch of relevant experiences from memory."""
        indices = self.relevant_sample_indx()

        idxs = np.vstack(indices).astype(np.int64)
        states = torch.from_numpy(
            np.vstack([self.memory[i].state
                       for i in indices])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([self.memory[i].action
                       for i in indices])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([self.memory[i].reward
                       for i in indices])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([self.memory[i].next_state
                       for i in indices])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([self.memory[i].done
                       for i in indices]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(
            np.array([self.isw(i, beta) for i in indices])).float().to(device)

        return (idxs, states, actions, rewards, next_states, dones, weights)

    def relevant_sample_indx(self):
        """Selecting most informative sample indices."""
        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            upperbound = random.uniform(a, b)
            idx = self.sum_tree.retrieve(upperbound)
            indices.append(idx)

        return indices

    def update_priorities(self, indices, priorities):
        """Update priorities of sampled transitions."""
        assert indices.shape[0] == priorities.shape[0]

        for idx, priority in zip(indices.flatten(), priorities.flatten()):
            assert priority > 0
            assert 0 <= idx < len(self)

            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha

            self.max_priority = max(self.max_priority, priority)

    def isw(self, idx, beta):
        """Compute Importance Sample Weight."""
        # get max weight
        p_min = self.min_tree.min() / self.sum_tree.sum()
        max_weight = (p_min * len(self))**(-beta)

        # calculate weights
        p_sample = self.sum_tree[idx] / self.sum_tree.sum()
        weight = (p_sample * len(self))**(-beta)
        is_weight = weight / max_weight

        return is_weight

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
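
A short usage sketch for the buffer above; note that add takes the global step counter t, so the caller tracks where each experience lands. Everything outside the buffer calls is a placeholder:

# Sketch only: experience_stream, compute_td_errors, and `device` are placeholders.
buffer = ReplayBuffer(action_size=4, buffer_size=2 ** 17, batch_size=64, alpha=0.6)

for t, (state, action, reward, next_state, done) in enumerate(experience_stream):
    buffer.add(t, state, action, reward, next_state, done)

idxs, states, actions, rewards, next_states, dones, weights = buffer.sample(beta=0.4)
td_errors = compute_td_errors(states, actions, rewards, next_states, dones)
# update_priorities expects arrays with matching leading dimension
new_prios = (td_errors.abs() + 1e-6).detach().cpu().numpy().reshape(idxs.shape)
buffer.update_priorities(idxs, new_prios)
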
Example #21
0
class PrioritizedReplayBuffer(object):
    def __init__(self, size, alpha=0.6):
        """Create Prioritized Replay buffer.
        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)
        See Also
        --------
        ReplayBuffer.__init__
        """
        #super(PrioritizedReplayBuffer, self).__init__(size)

        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        # Use double the soft capacity of the PER for the segment trees so any
        # overflow past the soft limit can be held before samples are removed.
        self.it_capacity = 1
        while self.it_capacity < size * 2:
            self.it_capacity *= 2

        self._it_sum = SumSegmentTree(self.it_capacity)
        self._it_min = MinSegmentTree(self.it_capacity)
        self._max_priority = 1.0

    def _add(self, obs_t, action, reward, obs_tp1, done): # self, state, policy_output, reward, last_state, done
        data = (obs_t, action, reward, obs_tp1, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        
        self._next_idx = (self._next_idx + 1) % self._maxsize
    
    def _remove(self, num_samples):
        del self._storage[:num_samples]
        self._next_idx = len(self._storage)

    def _encode_sample(self, idxes):
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for i in idxes:
            data = self._storage[i]
            obs_t, action, reward, obs_tp1, done = data
            obses_t.append(np.array(obs_t, copy=False))
            actions.append(action)
            rewards.append(reward)
            obses_tp1.append(np.array(obs_tp1, copy=False))
            dones.append(done)

        return [np.array(obses_t), actions, np.array(rewards),\
                np.array(obses_tp1), np.array(dones)]

    def add(self, state, policy_output, reward, last_state, done): # self, state, policy_output, reward, last_state, done
        idx = self._next_idx
        #assert idx < self.it_capacity, "Number of samples in replay memory exceeds capacity of segment trees. Please increase capacity of segment trees or increase the frequency at which samples are removed from the replay memory"

        self._add(state, policy_output, reward, last_state, done)
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def remove(self, num_samples):
        self._remove(num_samples)
        self._it_sum.remove_items(num_samples)
        self._it_min.remove_items(num_samples)

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def miniBatch(self, batch_size, beta=0.4, epsilon=1e-8):
        """Sample a batch of experiences.
        Compared to ReplayBuffer.sample, this also returns the importance
        weights and idxes of the sampled experiences.
        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)
        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        gammas: np.array
            product of gammas for N-step returns
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in the buffer of the sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / (self._it_sum.sum() + epsilon)
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / (self._it_sum.sum() + epsilon)
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / (max_weight + epsilon))
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)

        return encoded_sample, idxes, weights

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.
        sets priority of transition at index idxes[i] in buffer
        to priorities[i].
        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha

            self._max_priority = max(self._max_priority, priority)
    
    def get_size(self):
        return len(self._storage)
Example #22
0
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample, this also returns the importance
        weights and idxes of the sampled experiences.


        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in the buffer of the sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
Example #23
0
class PrioritizedReplayMemory(object):
    def __init__(self,
                 capacity=100000,
                 priority_fraction=0.0,
                 discount_gamma_game_reward=1.0,
                 discount_gamma_graph_reward=1.0,
                 discount_gamma_count_reward=1.0,
                 accumulate_reward_from_final=False):
        # prioritized replay memory
        self._storage = []
        self.capacity = capacity
        self._next_idx = 0

        assert priority_fraction >= 0
        self._alpha = priority_fraction

        it_capacity = 1
        while it_capacity < capacity:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self.discount_gamma_game_reward = discount_gamma_game_reward
        self.discount_gamma_graph_reward = discount_gamma_graph_reward
        self.discount_gamma_count_reward = discount_gamma_count_reward
        self.accumulate_reward_from_final = accumulate_reward_from_final

    def __len__(self):
        return len(self._storage)

    @property
    def storage(self):
        """[(np.ndarray, float, float, np.ndarray, bool)]: content of the replay buffer"""
        return self._storage

    @property
    def buffer_size(self):
        """float: Max capacity of the buffer"""
        return self.capacity

    def can_sample(self, n_samples):
        """
        Check if n_samples samples can be sampled
        from the buffer.
        :param n_samples: (int)
        :return: (bool)
        """
        return len(self) >= n_samples

    def is_full(self):
        """
        Check whether the replay buffer is full or not.
        :return: (bool)
        """
        return len(self) == self.buffer_size

    def add(self, *args):
        """
        add a new transition to the buffer
        """
        idx = self._next_idx
        data = Transition(*args)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self.capacity
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def get_next_final_pos(self, which_memory, head):
        i = head
        while True:
            if i >= len(self._storage):
                return None
            if self._storage[i].is_final:
                return i
            i += 1
        return None

    def _get_single_transition(self, idx, n):
        assert n > 0
        head = idx
        # if n is 1, then head can't be is_final
        if n == 1:
            if self._storage[head].is_final:
                return None
        #  if n > 1, then all except tail can't be is_final
        else:
            if np.any([item.is_final
                       for item in self._storage[head:head + n]]):
                return None

        next_final = self.get_next_final_pos(self._storage, head)
        if next_final is None:
            return None

        # all good
        obs = self._storage[head].observation_list
        prev_action = self._storage[head].prev_action_list
        candidate = self._storage[head].action_candidate_list
        chosen_indices = self._storage[head].chosen_indices
        graph_triplets = self._storage[head].graph_triplets

        next_obs = self._storage[head + n].observation_list
        next_prev_action = self._storage[head + n].prev_action_list
        next_candidate = self._storage[head + n].action_candidate_list
        next_graph_triplets = self._storage[head + n].graph_triplets

        tmp = next_final - head + 1 if self.accumulate_reward_from_final else n + 1

        rewards_up_to_next_final = [
            self.discount_gamma_game_reward**i * self._storage[head + i].reward
            for i in range(tmp)
        ]
        reward = torch.sum(torch.stack(rewards_up_to_next_final))

        graph_rewards_up_to_next_final = [
            self.discount_gamma_graph_reward**i *
            self._storage[head + i].graph_reward for i in range(tmp)
        ]
        graph_reward = torch.sum(torch.stack(graph_rewards_up_to_next_final))

        count_rewards_up_to_next_final = [
            self.discount_gamma_count_reward**i *
            self._storage[head + i].count_reward for i in range(tmp)
        ]
        count_reward = torch.sum(torch.stack(count_rewards_up_to_next_final))

        return (obs, prev_action, candidate, chosen_indices, graph_triplets,
                reward + graph_reward + count_reward, next_obs,
                next_prev_action, next_candidate, next_graph_triplets)

    def _encode_sample(self, idxes, ns):
        actual_indices, actual_ns = [], []
        obs, prev_action, candidate, chosen_indices, graph_triplets = [], [], [], [], []
        reward, next_obs, next_prev_action, next_candidate, next_graph_triplets = [], [], [], [], []
        for i, n in zip(idxes, ns):
            t = self._get_single_transition(i, n)
            if t is None:
                continue
            actual_indices.append(i)
            actual_ns.append(n)
            obs.append(t[0])
            prev_action.append(t[1])
            candidate.append(t[2])
            chosen_indices.append(t[3])
            graph_triplets.append(t[4])
            reward.append(t[5])
            next_obs.append(t[6])
            next_prev_action.append(t[7])
            next_candidate.append(t[8])
            next_graph_triplets.append(t[9])
        if len(actual_indices) == 0:
            return None
        chosen_indices = np.array(chosen_indices)  # batch
        reward = torch.stack(reward, 0)  # batch
        actual_ns = np.array(actual_ns)

        return [
            obs, prev_action, candidate, chosen_indices, graph_triplets,
            reward, next_obs, next_prev_action, next_candidate,
            next_graph_triplets, actual_indices, actual_ns
        ]

    def sample(self, batch_size, beta=0, multi_step=1):

        assert beta > 0

        idxes = self._sample_proportional(batch_size)
        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        # sample n
        ns = np.random.randint(1, multi_step + 1, size=batch_size)
        encoded_sample = self._encode_sample(idxes, ns)
        if encoded_sample is None:
            return None
        actual_indices = encoded_sample[-2]
        for idx in actual_indices:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)

        return encoded_sample + [weights]

    def _get_single_sequence_transition(self, idx, sample_history_length):
        assert sample_history_length > 0
        head = idx
        # if n is 1, then head can't be is_final
        if sample_history_length == 1:
            if self._storage[head].is_final:
                return None
        #  if n > 1, then all except tail can't be is_final
        else:
            if np.any([
                    item.is_final
                    for item in self._storage[head:head +
                                              sample_history_length]
            ]):
                return None

        next_final = self.get_next_final_pos(self._storage, head)
        if next_final is None:
            return None

        # all good
        res = []
        for m in range(sample_history_length):
            obs = self._storage[head + m].observation_list
            candidate = self._storage[head + m].action_candidate_list
            chosen_indices = self._storage[head + m].chosen_indices
            graph_triplets = self._storage[head + m].graph_triplets

            next_obs = self._storage[head + m + 1].observation_list
            next_candidate = self._storage[head + m + 1].action_candidate_list
            next_graph_triplets = self._storage[head + m + 1].graph_triplets

            tmp = next_final - (
                head + m) + 1 if self.accumulate_reward_from_final else 1

            rewards_up_to_next_final = [
                self.discount_gamma_game_reward**i *
                self._storage[head + m + i].reward for i in range(tmp)
            ]
            reward = torch.sum(torch.stack(rewards_up_to_next_final))

            graph_rewards_up_to_next_final = [
                self.discount_gamma_graph_reward**i *
                self._storage[head + m + i].graph_reward for i in range(tmp)
            ]
            graph_reward = torch.sum(
                torch.stack(graph_rewards_up_to_next_final))

            count_rewards_up_to_next_final = [
                self.discount_gamma_count_reward**i *
                self._storage[head + m + i].count_reward for i in range(tmp)
            ]
            count_reward = torch.sum(
                torch.stack(count_rewards_up_to_next_final))

            res.append([
                obs, candidate, chosen_indices, graph_triplets,
                reward + graph_reward + count_reward, next_obs, next_candidate,
                next_graph_triplets
            ])
        return res

    def _encode_sample_sequence(self, idxes, sample_history_length):
        assert sample_history_length > 0
        res = []
        for _ in range(sample_history_length):
            tmp = []
            for i in range(8):
                tmp.append([])
            res.append(tmp)

        actual_indices = []
        # obs, candidate, chosen_indices, graph_triplets, reward, next_obs, next_candidate, next_graph_triplets
        for i in idxes:
            t = self._get_single_sequence_transition(i, sample_history_length)
            if t is None:
                continue
            actual_indices.append(i)
            for step in range(sample_history_length):
                t_s = t[step]
                res[step][0].append(t_s[0])
                res[step][1].append(t_s[1])
                res[step][2].append(t_s[2])
                res[step][3].append(t_s[3])
                res[step][4].append(t_s[4])
                res[step][5].append(t_s[5])
                res[step][6].append(t_s[6])
                res[step][7].append(t_s[7])

        if len(actual_indices) == 0:
            return None
        for i in range(sample_history_length):
            res[i][2] = np.array(res[i][2])  # batch
            res[i][4] = torch.stack(res[i][4], 0)  # batch

        return res + [actual_indices]

    def sample_sequence(self, batch_size, beta=0, sample_history_length=1):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)
        res_weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        encoded_sample = self._encode_sample_sequence(idxes,
                                                      sample_history_length)
        if encoded_sample is None:
            return None
        actual_indices = encoded_sample[-1]
        for _h in range(sample_history_length):
            tmp_weights = []
            for idx in actual_indices:
                p_sample = self._it_sum[idx + _h] / self._it_sum.sum()
                weight = (p_sample * len(self._storage))**(-beta)
                tmp_weights.append(weight / max_weight)
            tmp_weights = np.array(tmp_weights)
            res_weights.append(tmp_weights)

        return encoded_sample + [res_weights]

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0,
                                                      len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def update_priorities(self, idxes, priorities):
        """
        Update priorities of sampled transitions.
        sets priority of transition at index idxes[i] in buffer
        to priorities[i].
        :param idxes: ([int]) List of idxes of sampled transitions
        :param priorities: ([float]) List of updated priorities corresponding to transitions at the sampled idxes
            denoted by variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            if priority > 0:
                assert 0 <= idx < len(self._storage)
                self._it_sum[idx] = priority**self._alpha
                self._it_min[idx] = priority**self._alpha
                self._max_priority = max(self._max_priority, priority)
            else:
                print("something wrong with priority: ", str(priority))
                return False
        return True

    def avg_rewards(self):
        if len(self._storage) == 0:
            return 0.0
        rewards = [self._storage[i].reward for i in range(len(self._storage))]
        return to_np(torch.mean(torch.stack(rewards)))
Example #24
0
class PrioritizedReplayBuffer(ReplayBuffer):
    """Prioritized Replay buffer.
    
    Attributes:
        max_priority (float): max priority
        tree_ptr (int): next index of tree
        alpha (float): alpha parameter for prioritized replay buffer
        sum_tree (SumSegmentTree): sum tree of priorities for proportional sampling
        min_tree (MinSegmentTree): min tree tracking the minimum priority for the max weight
        
    """
    def __init__(
        self,
        obs_dim: int,
        size: int,
        batch_size: int = 32,
        alpha: float = 0.6,
        n_step: int = 1,
        gamma: float = 0.99,
    ):
        """Initialization."""
        assert alpha >= 0

        super(PrioritizedReplayBuffer,
              self).__init__(obs_dim, size, batch_size, n_step, gamma)
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        tree_capacity = 1
        while tree_capacity < self.max_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)

    def store(
        self,
        obs: np.ndarray,
        act: int,
        rew: float,
        next_obs: np.ndarray,
        done: bool,
    ) -> Tuple[np.ndarray, np.ndarray, float, np.ndarray, bool]:
        """Store experience and priority."""
        transition = super().store(obs, act, rew, next_obs, done)

        if transition:
            self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha
            self.min_tree[self.tree_ptr] = self.max_priority**self.alpha
            self.tree_ptr = (self.tree_ptr + 1) % self.max_size

        return transition

    def sample_batch(self, beta: float = 0.4) -> Dict[str, np.ndarray]:
        """Sample a batch of experiences."""
        assert len(self) >= self.batch_size
        assert beta > 0

        indices = self._sample_proportional()

        obs = self.obs_buf[indices]
        next_obs = self.next_obs_buf[indices]
        acts = self.acts_buf[indices]
        rews = self.rews_buf[indices]
        done = self.done_buf[indices]
        weights = np.array([self._calculate_weight(i, beta) for i in indices])

        return dict(
            obs=obs,
            next_obs=next_obs,
            acts=acts,
            rews=rews,
            done=done,
            weights=weights,
            indices=indices,
        )

    def update_priorities(self, indices: List[int], priorities: np.ndarray):
        """Update priorities of sampled transitions."""
        assert len(indices) == len(priorities)

        for idx, priority in zip(indices, priorities):
            assert priority > 0
            assert 0 <= idx < len(self)

            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha

            self.max_priority = max(self.max_priority, priority)

    def _sample_proportional(self) -> List[int]:
        """Sample indices based on proportions."""
        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            upperbound = random.uniform(a, b)
            idx = self.sum_tree.retrieve(upperbound)
            indices.append(idx)

        return indices

    def _calculate_weight(self, idx: int, beta: float):
        """Calculate the weight of the experience at idx."""
        # get max weight
        p_min = self.min_tree.min() / self.sum_tree.sum()
        max_weight = (p_min * len(self))**(-beta)

        # calculate weights
        p_sample = self.sum_tree[idx] / self.sum_tree.sum()
        weight = (p_sample * len(self))**(-beta)
        weight = weight / max_weight

        return weight
Example #25
0
class PrioritizedReplayBuffer(ReplayBuffer):
    """
    Adapt from https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/buffers.py
    """
    def __init__(self, obs_space, action_space, capacity, exponent, device, optimize_memory_usage=False):
        super().__init__(obs_space, action_space, capacity, device,
                         optimize_memory_usage=optimize_memory_usage)
        assert exponent >= 0
        self.exponent = exponent

        it_capacity = 1
        while it_capacity < self.capacity:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def _sample_proportional(self, batch_size):
        total = self._it_sum.sum(0, len(self) - 1)
        mass = np.random.random(size=batch_size) * total
        idx = self._it_sum.find_prefixsum_idx(mass)

        # replace idx == self.idx
        if self.full and self.optimize_memory_usage:
            while np.any(idx == self.idx):
                mask = idx == self.idx
                # redraw only the offending entries so the assignment shapes match
                replace_mass = np.random.random(mask.sum()) * total
                idx[mask] = self._it_sum.find_prefixsum_idx(replace_mass)

        return idx

    def add(self, obs, action, reward, next_obs, done):
        idx = self.idx
        super().add(obs, action, reward, next_obs, done)
        self._it_sum[idx] = self._max_priority ** self.exponent
        self._it_min[idx] = self._max_priority ** self.exponent

    def sample(self, batch_size, beta=0):
        assert beta >= 0

        idxes = self._sample_proportional(batch_size)
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self)) ** (-beta)
        p_sample = self._it_sum[idxes] / self._it_sum.sum()
        weights = (p_sample * len(self)) ** (-beta) / max_weight
        obses, actions, rewards, next_obses, not_dones = self._sample(idxes)

        priority_kwargs = {
            'weights': weights,
            'idxes': idxes
        }

        return obses, actions, rewards, next_obses, not_dones, priority_kwargs

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        assert np.min(priorities) > 0
        assert np.min(idxes) >= 0
        assert np.max(idxes) < len(self)
        self._it_sum[idxes] = priorities ** self.exponent
        self._it_min[idxes] = priorities ** self.exponent

        self._max_priority = max(self._max_priority, np.max(priorities))
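
Unlike the earlier examples, the buffer above indexes its segment trees with arrays, so update_priorities expects NumPy arrays rather than Python lists. A minimal sketch under that assumption; the observation/action spaces, device string, and td_errors are placeholders:

import numpy as np

# Sketch only: obs_space, action_space, and td_errors are placeholders.
buffer = PrioritizedReplayBuffer(obs_space, action_space, capacity=100000,
                                 exponent=0.6, device='cpu')
obses, actions, rewards, next_obses, not_dones, per_info = buffer.sample(
    batch_size=256, beta=0.4)
# vectorized priority refresh: both arguments are 1-D arrays of equal length
buffer.update_priorities(per_info['idxes'], np.abs(td_errors) + 1e-6)
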