Code Example #1
 def __init__(self, config, return_function):
     """ Parameters:
     Name:               Type:           Default:            Description: (Omitted when self-explanatory)
     buff_sz             int             10                  buffer size
     batch_sz            int             1
     frame_stack         int             4                   number of frames to stack, see Mnih et al. (2015)
     env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
     num_actions         int             2                   number of actions available to the agent
     obs_dtype           np.dtype        np.uint8            the data type of the observations
     reward_clipping     bool            False               clipping the reward, see Mnih et al. (2015)
     sigma               float           0.5                 Sigma parameter, see De Asis et al. (2018)
     sigma_decay         float           1.0                 decay rate of sigma
     """
     self.config = config
     self.buff_sz = check_attribute_else_default(config, 'buff_sz', 10)
     self.batch_sz = check_attribute_else_default(config, 'batch_sz', 1)
     self.frame_stack = check_attribute_else_default(
         config, 'frame_stack', 4)
     self.env_state_dims = list(
         check_attribute_else_default(config, 'env_state_dims', [2, 2]))
     self.num_actions = check_attribute_else_default(
         config, 'num_actions', 2)
     self.obs_dtype = check_attribute_else_default(config, 'obs_dtype',
                                                   np.uint8)
     self.reward_clipping = check_attribute_else_default(
         config, 'reward_clipping', False)
     self.sigma = check_attribute_else_default(config, 'sigma', 0.5)
     self.sigma_decay = check_attribute_else_default(
         config, 'sigma_decay', 1.0)
     """ Parameters for Return Function """
     assert isinstance(return_function, OnPolicyQSigmaReturnFunction)
     self.return_function = return_function
     self.n = return_function.n
     """ Parameters to keep track of the current state of the buffer """
     self.current_index = 0
     self.full_buffer = False
     """ Circular Buffers """
     self.state = CircularBuffer(self.buff_sz,
                                 shape=tuple(self.env_state_dims),
                                 dtype=self.obs_dtype)
     self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
     self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
     self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     # Note: this circular buffer shadows the scalar self.sigma read from config above.
     self.sigma = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)
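Code Example #1 is a fragment of the on-policy buffer shown in full in Code Example #6. The CircularBuffer class is not included in any of these snippets; the methods shown later index its .data array with NumPy's take(..., mode='wrap'), so here is a minimal standalone sketch of the wrap-around semantics they assume (the values are illustrative, not the project's implementation):

import numpy as np

# Hypothetical stand-in for a circular buffer's backing array and start offset.
data = np.arange(5)     # pretend buff_sz = 5
start = 3               # physical index of the logical start of the buffer
logical = np.arange(3)  # logical positions 0, 1, 2 relative to `start`
# mode='wrap' reduces out-of-range indices modulo the array length.
print(data.take(start + logical, mode='wrap'))  # -> [3 4 0]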
Code Example #2
 def __init__(self, config, return_function):
     """ Parameters:
     Name:               Type:           Default:            Description: (Omitted when self-explanatory)
     buff_sz             int             10                  buffer size
     batch_sz            int             1
     frame_stack         int             4                   number of frames to stack, see Mnih et al. (2015)
     env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
     num_actions         int             2                   number of actions available to the agent
     obs_dtype           np.dtype        np.uint8            the data type of the observations
     reward_clipping     bool            False               clipping the reward, see Mnih et al. (2015)
     sigma               float           0.5                 Sigma parameter, see De Asis et al. (2018)
     sigma_decay         float           1.0                 decay rate of sigma
     store_bprobs        bool            False               whether to store and use the behaviour policy probabilities
                                                             for the return function
     store_sigma         bool            False               whether to store sigma at every time step and use
                                                             the stored sigmas to compute the return. True = use the
                                                             sigma from the buffer, False = use the current sigma
     initial_rand_steps  int             0                   number of random steps before decaying sigma
     rand_steps_count    int             0                   number of random steps taken so far
     store_return        bool            True                save the computed return so that it can be reused
     """
     assert isinstance(config, Config)
     self.config = config
     self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
     self.batch_sz = check_attribute_else_default(self.config, 'batch_sz',
                                                  1)
     self.frame_stack = check_attribute_else_default(
         self.config, 'frame_stack', 4)
     self.env_state_dims = list(
         check_attribute_else_default(self.config, 'env_state_dims',
                                      [2, 2]))
     self.num_actions = check_attribute_else_default(
         self.config, 'num_actions', 2)
     self.obs_dtype = check_attribute_else_default(self.config, 'obs_dtype',
                                                   np.uint8)
     self.reward_clipping = check_attribute_else_default(
         self.config, 'reward_clipping', False)
     self.sigma = check_attribute_else_default(self.config, 'sigma', 0.5)
     self.sigma_decay = check_attribute_else_default(
         self.config, 'sigma_decay', 1.0)
     self.store_bprobs = check_attribute_else_default(
         self.config, 'store_bprobs', False)
     self.store_sigma = check_attribute_else_default(
         self.config, 'store_sigma', False)
     self.initial_rand_steps = check_attribute_else_default(
         self.config, 'initial_rand_steps', 0)
     check_attribute_else_default(self.config, 'rand_steps_count', 0)
     self.store_return = check_attribute_else_default(
         self.config, 'store_return', True)
     """ Parameters for Return Function """
     assert isinstance(return_function, QSigmaReturnFunction)
     self.return_function = return_function
     self.n = return_function.n
     """ Parameters to keep track of the current state of the buffer """
     self.current_index = 0
     self.full_buffer = False
     """ Circular Buffers """
     self.state = CircularBuffer(self.buff_sz,
                                 shape=tuple(self.env_state_dims),
                                 dtype=self.obs_dtype)
     self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
     self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
     self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     if self.store_bprobs:
         self.bprobabilities = CircularBuffer(self.buff_sz,
                                              shape=(self.num_actions, ),
                                              dtype=np.float64)
     if self.store_sigma:
         self.sigma_buffer = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.estimated_return = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
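check_attribute_else_default is not defined in any of these snippets. From the call sites above, a plausible sketch of what it does, offered as an assumption rather than the project's actual helper:

def check_attribute_else_default(config, name, default):
    # Hypothetical sketch: return the attribute if the config defines it;
    # otherwise store the default on the config and return that.
    if not hasattr(config, name):
        setattr(config, name, default)
    return getattr(config, name)

This reading would also explain the bare call for 'rand_steps_count': its return value is discarded, and the call only ensures the attribute exists on the config before store_observation reads it.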
Code Example #3
class QSigmaExperienceReplayBuffer:
    def __init__(self, config, return_function):
        """ Parameters:
        Name:               Type:           Default:            Description: (Omitted when self-explanatory)
        buff_sz             int             10                  buffer size
        batch_sz            int             1
        frame_stack         int             4                   number of frames to stack, see Mnih et al. (2015)
        env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
        num_actions         int             2                   number of actions available to the agent
        obs_dtype           np.dtype        np.uint8            the data type of the observations
        reward_clipping     bool            False               clipping the reward, see Mnih et al. (2015)
        sigma               float           0.5                 Sigma parameter, see De Asis et al. (2018)
        sigma_decay         float           1.0                 decay rate of sigma
        store_bprobs        bool            False               whether to store and use the behaviour policy probabilities
                                                                for the return function
        store_sigma         bool            False               whether to store sigma at every time step and use
                                                                the stored sigmas to compute the return. True = use the
                                                                sigma from the buffer, False = use the current sigma
        initial_rand_steps  int             0                   number of random steps before decaying sigma
        rand_steps_count    int             0                   number of random steps taken so far
        store_return        bool            True                save the computed return so that it can be reused
        """
        assert isinstance(config, Config)
        self.config = config
        self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
        self.batch_sz = check_attribute_else_default(self.config, 'batch_sz',
                                                     1)
        self.frame_stack = check_attribute_else_default(
            self.config, 'frame_stack', 4)
        self.env_state_dims = list(
            check_attribute_else_default(self.config, 'env_state_dims',
                                         [2, 2]))
        self.num_actions = check_attribute_else_default(
            self.config, 'num_actions', 2)
        self.obs_dtype = check_attribute_else_default(self.config, 'obs_dtype',
                                                      np.uint8)
        self.reward_clipping = check_attribute_else_default(
            self.config, 'reward_clipping', False)
        self.sigma = check_attribute_else_default(self.config, 'sigma', 0.5)
        self.sigma_decay = check_attribute_else_default(
            self.config, 'sigma_decay', 1.0)
        self.store_bprobs = check_attribute_else_default(
            self.config, 'store_bprobs', False)
        self.store_sigma = check_attribute_else_default(
            self.config, 'store_sigma', False)
        self.initial_rand_steps = check_attribute_else_default(
            self.config, 'initial_rand_steps', 0)
        check_attribute_else_default(self.config, 'rand_steps_count', 0)
        self.store_return = check_attribute_else_default(
            self.config, 'store_return', True)
        """ Parameters for Return Function """
        assert isinstance(return_function, QSigmaReturnFunction)
        self.return_function = return_function
        self.n = return_function.n
        """ Parameters to keep track of the current state of the buffer """
        self.current_index = 0
        self.full_buffer = False
        """ Circular Buffers """
        self.state = CircularBuffer(self.buff_sz,
                                    shape=tuple(self.env_state_dims),
                                    dtype=self.obs_dtype)
        self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
        self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
        self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
        if self.store_bprobs:
            self.bprobabilities = CircularBuffer(self.buff_sz,
                                                 shape=(self.num_actions, ),
                                                 dtype=np.float64)
        if self.store_sigma:
            self.sigma_buffer = CircularBuffer(self.buff_sz,
                                               shape=(),
                                               dtype=np.float64)
        self.estimated_return = CircularBuffer(self.buff_sz,
                                               shape=(),
                                               dtype=np.float64)
        self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)

    def store_observation(self, observation):
        """ The only two keys that are required are 'state' """
        assert isinstance(observation, dict)
        assert all(akey in observation.keys()
                   for akey in ["reward", "action", "state", "terminate"])

        temp_terminate = observation['terminate']
        reward = observation["reward"]
        if self.reward_clipping:
            if reward > 0: reward = 1
            elif reward < 0: reward = -1

        self.state.append(observation["state"])
        self.action.append(observation["action"])
        self.reward.append(reward)
        self.terminate.append(temp_terminate)
        if self.store_bprobs:
            assert hasattr(self, 'bprobabilities')
            assert 'bprobabilities' in observation.keys()
            self.bprobabilities.append(observation["bprobabilities"])
        if self.store_sigma:
            assert hasattr(self, 'sigma')
            self.sigma_buffer.append(self.sigma)
        self.estimated_return.append(0.0)
        self.up_to_date.append(False)

        self.current_index += 1
        if self.current_index >= self.buff_sz:
            self.current_index = 0
            self.full_buffer = True

        if temp_terminate and self.config.rand_steps_count >= self.initial_rand_steps:
            self.sigma *= self.sigma_decay
            if self.sigma < 1e-10:  # to prevent underflow
                self.sigma = 0.0
            self.config.sigma = self.sigma

    def sample_indices(self):
        bf_start = self.terminate.start
        inds_start = self.frame_stack - 1
        if not self.full_buffer:
            inds_end = self.current_index - (self.n + 1)
        else:
            inds_end = self.buff_sz - 1 - (self.n + 1)
        sample_inds = np.random.randint(inds_start,
                                        inds_end,
                                        size=self.batch_sz)
        terminations = self.terminate.data.take(bf_start + sample_inds,
                                                axis=0,
                                                mode='wrap')
        terminations_sum = np.sum(terminations)
        while terminations_sum != 0:
            bad_inds = np.squeeze(np.argwhere(terminations))
            new_inds = np.random.randint(inds_start,
                                         inds_end,
                                         size=terminations_sum)
            sample_inds[bad_inds] = new_inds
            terminations = self.terminate.data.take(bf_start + sample_inds,
                                                    axis=0,
                                                    mode='wrap')
            terminations_sum = np.sum(terminations)
        return sample_inds

    def get_data(self, update_function):
        indices = self.sample_indices()
        bf_start = self.action.start

        estimated_returns = np.zeros(self.batch_sz, dtype=np.float64)

        sample_states = np.zeros(
            (self.batch_sz, self.frame_stack) + tuple(self.env_state_dims),
            dtype=self.obs_dtype)
        sample_actions = self.action.data.take(bf_start + indices,
                                               mode='wrap',
                                               axis=0)
        # Abbreviations: tj = trajectory, tjs = trajectories
        tjs_states = np.zeros(
            shape=(self.batch_sz * self.n, self.frame_stack) +
            tuple(self.env_state_dims),
            dtype=self.obs_dtype)
        tjs_actions = np.zeros(self.batch_sz * self.n, np.uint8)
        tjs_rewards = np.zeros(self.batch_sz * self.n, np.int32)
        tjs_terminations = np.ones(self.batch_sz * self.n, np.bool_)
        tjs_bprobabilities = np.ones(
            [self.batch_sz * self.n, self.num_actions], np.float64)
        tjs_sigmas = np.ones(self.batch_sz * self.n,
                             dtype=np.float64) * self.sigma

        batch_idx = 0
        tj_start_idx = 0
        retrieved_count = 0
        computed_return_buffer_inds = np.zeros(self.batch_sz, dtype=np.int64)
        computed_return_batch_inds = np.zeros(self.batch_sz, dtype=np.int64)
        for idx in indices:
            assert not self.terminate[idx]
            start_idx = idx - (self.frame_stack - 1)
            # First terminal state from the left. Reversed because we want to find the first terminal state before
            # the current state
            left_terminal_rev = self.terminate.data.take(
                bf_start + start_idx + np.arange(self.frame_stack),
                mode='wrap',
                axis=0)[::-1]
            left_terminal_rev_idx = np.argmax(left_terminal_rev)
            left_terminal_idx = 0 if left_terminal_rev_idx == 0 else (
                self.frame_stack - 1) - left_terminal_rev_idx

            if self.up_to_date.data.take(bf_start + idx, axis=0,
                                         mode='wrap') and self.store_return:
                estimated_returns[batch_idx] = self.estimated_return[idx]
                sample_state = self.state.data.take(
                    bf_start + start_idx + np.arange(self.frame_stack),
                    mode='wrap',
                    axis=0)
                sample_state[:left_terminal_idx] *= 0
                sample_states[batch_idx] += sample_state
                retrieved_count += 1
                batch_idx += 1
            else:
                # First terminal state from center to right
                right_terminal = self.terminate.data.take(
                    bf_start + idx + np.arange(self.n + 1),
                    mode='wrap',
                    axis=0)
                right_terminal_true_idx = np.argmax(right_terminal)
                right_terminal_stop = self.n if right_terminal_true_idx == 0 else right_terminal_true_idx

                # trajectory indices
                tj_end_idx = tj_start_idx + right_terminal_stop - 1
                tj_slice = slice(tj_start_idx, tj_end_idx + 1)
                tj_indices = idx + 1 + np.arange(right_terminal_stop)

                # Collecting: trajectory actions, rewards, terminations, bprobabilities, and sigmas
                tjs_actions[tj_slice] = self.action.data.take(bf_start +
                                                              tj_indices,
                                                              axis=0,
                                                              mode='wrap')
                tjs_rewards[tj_slice] = self.reward.data.take(bf_start +
                                                              tj_indices,
                                                              axis=0,
                                                              mode='wrap')
                tjs_terminations[tj_slice] = self.terminate.data.take(
                    bf_start + tj_indices, axis=0, mode='wrap')
                if self.store_bprobs:
                    tjs_bprobabilities[
                        tj_slice] = self.bprobabilities.data.take(bf_start +
                                                                  tj_indices,
                                                                  axis=0,
                                                                  mode='wrap')
                if self.store_sigma:
                    tjs_sigmas[tj_slice] = self.sigma_buffer.data.take(
                        bf_start + tj_indices, axis=0, mode='wrap')

                # Stacks of states
                trj_state_stack_sz = self.frame_stack + right_terminal_stop
                trj_state_stack = self.state.data.take(
                    bf_start + start_idx + np.arange(trj_state_stack_sz),
                    mode='wrap',
                    axis=0)
                trj_state_stack[:left_terminal_idx] *= 0

                state_stack_slices = np.arange(trj_state_stack_sz - self.frame_stack + 1)[:, None] \
                                     + np.arange(self.frame_stack)
                state_stacks = trj_state_stack.take(state_stack_slices, axis=0)

                sample_states[batch_idx] = state_stacks[0]
                tjs_states[tj_slice] = state_stacks[1:]

                computed_return_buffer_inds[batch_idx - retrieved_count] += idx
                computed_return_batch_inds[batch_idx -
                                           retrieved_count] += batch_idx
                tj_start_idx += self.n
                batch_idx += 1

        # We wait until the end to retrieve the q_values because it's more efficient to make only one call to
        # update_function when using a gpu.
        adjusted_batch_sz = self.batch_sz - retrieved_count
        tjs_states = np.squeeze(
            tjs_states[:adjusted_batch_sz *
                       self.n]).reshape((adjusted_batch_sz * self.n, ) +
                                        tuple(self.env_state_dims))
        tjs_qvalues = update_function(tjs_states, reshape=False).reshape(
            [adjusted_batch_sz, self.n, self.num_actions])
        tjs_actions = tjs_actions[:adjusted_batch_sz * self.n].reshape(
            [adjusted_batch_sz, self.n])
        tjs_rewards = tjs_rewards[:adjusted_batch_sz * self.n].reshape(
            [adjusted_batch_sz, self.n])
        tjs_terminations = tjs_terminations[:adjusted_batch_sz *
                                            self.n].reshape(
                                                [adjusted_batch_sz, self.n])
        tjs_bprobabilities = tjs_bprobabilities[:adjusted_batch_sz *
                                                self.n].reshape([
                                                    adjusted_batch_sz, self.n,
                                                    self.num_actions
                                                ])
        tjs_sigmas = tjs_sigmas[:adjusted_batch_sz * self.n].reshape(
            [adjusted_batch_sz, self.n])
        if self.store_sigma:
            # Note: this also scales the stored sigmas by the current sigma.
            tjs_sigmas *= self.sigma

        computed_return_batch_inds = computed_return_batch_inds[:adjusted_batch_sz]
        estimated_returns[computed_return_batch_inds] = \
            self.return_function.batch_iterative_return_function(tjs_rewards, tjs_actions, tjs_qvalues,
                                                                 tjs_terminations, tjs_bprobabilities, tjs_sigmas,
                                                                 adjusted_batch_sz)

        computed_return_buffer_inds = computed_return_buffer_inds[:adjusted_batch_sz]
        self.estimated_return.data.put(
            indices=bf_start + computed_return_buffer_inds,
            values=estimated_returns[computed_return_batch_inds],
            mode='wrap')
        self.up_to_date.data.put(indices=bf_start +
                                 computed_return_buffer_inds,
                                 values=True,
                                 mode='wrap')
        return sample_states, sample_actions, estimated_returns

    def ready_to_sample(self):
        return self.batch_sz < (self.current_index -
                                (self.n + self.frame_stack))

    def out_of_date(self):
        self.up_to_date.data[:] = False
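The densest step in get_data is the frame-stacking index matrix (state_stack_slices). A standalone sketch of that broadcasting trick, using illustrative sizes:

import numpy as np

frame_stack = 4
trj_state_stack_sz = 7  # frame_stack + right_terminal_stop, e.g. 4 + 3
# Each row is a sliding window of frame_stack consecutive indices.
state_stack_slices = (np.arange(trj_state_stack_sz - frame_stack + 1)[:, None]
                      + np.arange(frame_stack))
print(state_stack_slices)
# [[0 1 2 3]
#  [1 2 3 4]
#  [2 3 4 5]
#  [3 4 5 6]]
# Row 0 indexes the sampled state's stack; the remaining rows index the
# stacked states for each step of the n-step trajectory.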
Code Example #4
 def __init__(self, config, return_function):
     """ Parameters:
     Name:               Type:           Default:            Description: (Omitted when self-explanatory)
     buff_sz             int             10                  buffer size
     batch_sz            int             1
     env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
     num_actions         int             2                   number of actions available to the agent
     obs_dtype           np.dtype        np.uint8            the data type of the observations
     initial_rand_steps  int             0                   number of random steps before decaying sigma
     rand_steps_count    int             0                   number of random steps taken so far
     store_return        bool            True                save the computed return so that it can be reused
     """
     assert isinstance(config, Config)
     self.config = config
     self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
     self.batch_sz = check_attribute_else_default(self.config, 'batch_sz',
                                                  1)
     self.env_state_dims = list(
         check_attribute_else_default(self.config, 'env_state_dims',
                                      [2, 2]))
     self.num_actions = check_attribute_else_default(
         self.config, 'num_actions', 2)
     self.obs_dtype = check_attribute_else_default(self.config, 'obs_dtype',
                                                   np.uint8)
     self.initial_rand_steps = check_attribute_else_default(
         self.config, 'initial_rand_steps', 0)
     check_attribute_else_default(self.config, 'rand_steps_count', 0)
     self.store_return = check_attribute_else_default(
         self.config, 'store_return', True)
     """ Parameters for Return Function """
     assert isinstance(return_function, nStep_Retrace_ReturnFunction)
     self.return_function = return_function
     self.n = return_function.n
     """ Termination or Timeout Count for Applying the Decay on Sigma """
     self.episodes_since_last_decay = 0
     """ Parameters to keep track of the current state of the buffer """
     self.current_index = 0
     self.full_buffer = False
     """ Circular Buffers """
     self.state = CircularBuffer(self.buff_sz,
                                 shape=tuple(self.env_state_dims),
                                 dtype=self.obs_dtype)
     self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
     self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
     self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     self.timeout = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     self.bprobabilities = CircularBuffer(self.buff_sz,
                                          shape=(self.num_actions, ),
                                          dtype=np.float64)
     self.estimated_return = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
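The estimated_return and up_to_date buffers implement a cache of computed returns, written back in get_data (see Code Example #5) with ndarray.put(..., mode='wrap'). A small self-contained sketch of that write-back; the numbers are illustrative:

import numpy as np

returns = np.zeros(5)                 # pretend buff_sz = 5
up_to_date = np.zeros(5, dtype=bool)
bf_start = 3                          # physical index of the logical start
inds = np.array([1, 2])               # logical indices with freshly computed returns
# Physical indices 4 and 5 wrap to 4 and 0.
returns.put(bf_start + inds, [0.7, -0.2], mode='wrap')
up_to_date.put(bf_start + inds, True, mode='wrap')
print(returns)     # -> [-0.2  0.   0.   0.   0.7]
print(up_to_date)  # -> [ True False False False  True]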
Code Example #5
class nStep_Retrace_ExperienceReplay:
    def __init__(self, config, return_function):
        """ Parameters:
        Name:               Type:           Default:            Description: (Omitted when self-explanatory)
        buff_sz             int             10                  buffer size
        batch_sz            int             1
        env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
        num_actions         int             2                   number of actions available to the agent
        obs_dtype           np.dtype        np.uint8            the data type of the observations
        initial_rand_steps  int             0                   number of random steps before decaying sigma
        rand_steps_count    int             0                   number of random steps taken so far
        store_return        bool            True                save the computed return so that it can be reused
        """
        assert isinstance(config, Config)
        self.config = config
        self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
        self.batch_sz = check_attribute_else_default(self.config, 'batch_sz',
                                                     1)
        self.env_state_dims = list(
            check_attribute_else_default(self.config, 'env_state_dims',
                                         [2, 2]))
        self.num_actions = check_attribute_else_default(
            self.config, 'num_actions', 2)
        self.obs_dtype = check_attribute_else_default(self.config, 'obs_dtype',
                                                      np.uint8)
        self.initial_rand_steps = check_attribute_else_default(
            self.config, 'initial_rand_steps', 0)
        check_attribute_else_default(self.config, 'rand_steps_count', 0)
        self.store_return = check_attribute_else_default(
            self.config, 'store_return', True)
        """ Parameters for Return Function """
        assert isinstance(return_function, nStep_Retrace_ReturnFunction)
        self.return_function = return_function
        self.n = return_function.n
        """ Termination or Timeout Count for Applying the Decay on Sigma """
        self.episodes_since_last_decay = 0
        """ Parameters to keep track of the current state of the buffer """
        self.current_index = 0
        self.full_buffer = False
        """ Circular Buffers """
        self.state = CircularBuffer(self.buff_sz,
                                    shape=tuple(self.env_state_dims),
                                    dtype=self.obs_dtype)
        self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
        self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
        self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
        self.timeout = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
        self.bprobabilities = CircularBuffer(self.buff_sz,
                                             shape=(self.num_actions, ),
                                             dtype=np.float64)
        self.estimated_return = CircularBuffer(self.buff_sz,
                                               shape=(),
                                               dtype=np.float64)
        self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)

    def store_observation(self, observation):
        """ The only two keys that are required are 'state' """
        assert isinstance(observation, dict)
        assert all(akey in observation.keys()
                   for akey in ["reward", "action", "state", "terminate"])

        temp_terminate = observation['terminate']
        temp_timeout = observation['timeout']
        reward = observation["reward"]

        self.state.append(observation["state"])
        self.action.append(observation["action"])
        self.reward.append(reward)
        self.terminate.append(temp_terminate)
        self.timeout.append(temp_timeout)
        assert hasattr(self, 'bprobabilities')
        assert 'bprobabilities' in observation.keys()
        self.bprobabilities.append(observation["bprobabilities"])
        self.estimated_return.append(0.0)
        self.up_to_date.append(False)

        self.current_index += 1
        if self.current_index >= self.buff_sz:
            self.current_index = 0
            self.full_buffer = True

    def sample_indices(self):
        bf_start = self.terminate.start
        inds_start = 0
        if not self.full_buffer:
            inds_end = self.current_index - (self.n + 1)
        else:
            inds_end = self.buff_sz - 1 - (self.n + 1)
        sample_inds = np.random.randint(inds_start,
                                        inds_end,
                                        size=self.batch_sz)
        terminations_timeout = np.logical_or(
            self.terminate.data.take(bf_start + sample_inds,
                                     axis=0,
                                     mode='wrap'),
            self.timeout.data.take(bf_start + sample_inds, axis=0,
                                   mode='wrap'))
        terminations_timeout_sum = np.sum(terminations_timeout)
        while terminations_timeout_sum != 0:
            bad_inds = np.squeeze(np.argwhere(terminations_timeout))
            new_inds = np.random.randint(inds_start,
                                         inds_end,
                                         size=terminations_timeout_sum)
            sample_inds[bad_inds] = new_inds
            terminations_timeout = np.logical_or(
                self.terminate.data.take(bf_start + sample_inds,
                                         axis=0,
                                         mode='wrap'),
                self.timeout.data.take(bf_start + sample_inds,
                                       axis=0,
                                       mode='wrap'))
            terminations_timeout_sum = np.sum(terminations_timeout)
        return sample_inds

    def get_data(self, update_function):
        indices = self.sample_indices()
        bf_start = self.action.start

        estimated_returns = np.zeros(self.batch_sz, dtype=np.float64)

        sample_states = np.zeros(
            (self.batch_sz, 1) + tuple(self.env_state_dims),
            dtype=self.obs_dtype)
        sample_actions = self.action.data.take(bf_start + indices,
                                               mode='wrap',
                                               axis=0)
        # Abbreviations: tj = trajectory, tjs = trajectories
        tjs_states = np.zeros(shape=(self.batch_sz * self.n, 1) +
                              tuple(self.env_state_dims),
                              dtype=self.obs_dtype)
        tjs_actions = np.zeros(self.batch_sz * self.n, np.uint8)
        tjs_rewards = np.zeros(self.batch_sz * self.n, np.int32)
        tjs_terminations = np.ones(self.batch_sz * self.n, np.bool_)
        tjs_timeout = np.zeros(self.batch_sz * self.n, np.bool_)
        tjs_bprobabilities = np.ones(
            [self.batch_sz * self.n, self.num_actions], np.float64)

        batch_idx = 0
        tj_start_idx = 0
        retrieved_count = 0
        computed_return_buffer_inds = np.zeros(self.batch_sz, dtype=np.int64)
        computed_return_batch_inds = np.zeros(self.batch_sz, dtype=np.int64)
        for idx in indices:
            assert not self.terminate[idx] and not self.timeout[idx]
            start_idx = idx
            # No frame stacking here (single-frame states), so there is no
            # left-terminal search: nothing to the left needs to be zeroed.
            left_terminal_idx = 0

            if self.up_to_date.data.take(bf_start + idx, axis=0,
                                         mode='wrap') and self.store_return:
                estimated_returns[batch_idx] = self.estimated_return[idx]
                sample_state = self.state.data.take(bf_start + start_idx +
                                                    np.arange(1),
                                                    mode='wrap',
                                                    axis=0)
                sample_states[batch_idx] += sample_state
                retrieved_count += 1
                batch_idx += 1
            else:
                # First terminal or timeout state from center to right
                right_terminal = np.logical_or(
                    self.terminate.data.take(bf_start + idx +
                                             np.arange(self.n + 1),
                                             mode='wrap',
                                             axis=0),
                    self.timeout.data.take(bf_start + idx +
                                           np.arange(self.n + 1),
                                           mode='wrap',
                                           axis=0))
                right_terminal_true_idx = np.argmax(right_terminal)
                right_terminal_stop = self.n if right_terminal_true_idx == 0 else right_terminal_true_idx

                # trajectory indices
                tj_end_idx = tj_start_idx + right_terminal_stop - 1
                tj_slice = slice(tj_start_idx, tj_end_idx + 1)
                tj_indices = idx + 1 + np.arange(right_terminal_stop)

                # Collecting: trajectory actions, rewards, terminations, bprobabilities, and sigmas
                tjs_actions[tj_slice] = self.action.data.take(bf_start +
                                                              tj_indices,
                                                              axis=0,
                                                              mode='wrap')
                tjs_rewards[tj_slice] = self.reward.data.take(bf_start +
                                                              tj_indices,
                                                              axis=0,
                                                              mode='wrap')
                tjs_terminations[tj_slice] = self.terminate.data.take(
                    bf_start + tj_indices, axis=0, mode='wrap')
                tjs_timeout[tj_slice] = self.timeout.data.take(bf_start +
                                                               tj_indices,
                                                               axis=0,
                                                               mode='wrap')
                tjs_bprobabilities[tj_slice] = self.bprobabilities.data.take(
                    bf_start + tj_indices, axis=0, mode='wrap')
                # Stacks of states
                trj_state_stack_sz = 1 + right_terminal_stop
                trj_state_stack = self.state.data.take(
                    bf_start + start_idx + np.arange(trj_state_stack_sz),
                    mode='wrap',
                    axis=0)
                trj_state_stack[:left_terminal_idx] *= 0

                # Window length is 1 (no frame stacking), hence + np.arange(1).
                state_stack_slices = (np.arange(trj_state_stack_sz)[:, None]
                                      + np.arange(1))
                state_stacks = trj_state_stack.take(state_stack_slices, axis=0)

                sample_states[batch_idx] = state_stacks[0]
                tjs_states[tj_slice] = state_stacks[1:]

                computed_return_buffer_inds[batch_idx - retrieved_count] += idx
                computed_return_batch_inds[batch_idx -
                                           retrieved_count] += batch_idx
                tj_start_idx += self.n
                batch_idx += 1

        # We wait until the end to retrieve the q_values because it's more efficient to make only one call to
        # update_function when using a gpu.
        adjusted_batch_sz = self.batch_sz - retrieved_count
        tjs_states = np.squeeze(
            tjs_states[:adjusted_batch_sz *
                       self.n]).reshape((adjusted_batch_sz * self.n, ) +
                                        tuple(self.env_state_dims))
        tjs_qvalues = update_function(tjs_states, reshape=False).reshape(
            [adjusted_batch_sz, self.n, self.num_actions])
        tjs_actions = tjs_actions[:adjusted_batch_sz * self.n].reshape(
            [adjusted_batch_sz, self.n])
        tjs_rewards = tjs_rewards[:adjusted_batch_sz * self.n].reshape(
            [adjusted_batch_sz, self.n])
        tjs_terminations = tjs_terminations[:adjusted_batch_sz *
                                            self.n].reshape(
                                                [adjusted_batch_sz, self.n])
        tjs_timeout = tjs_timeout[:adjusted_batch_sz * self.n].reshape(
            [adjusted_batch_sz, self.n])
        tjs_bprobabilities = tjs_bprobabilities[:adjusted_batch_sz *
                                                self.n].reshape([
                                                    adjusted_batch_sz, self.n,
                                                    self.num_actions
                                                ])

        computed_return_batch_inds = computed_return_batch_inds[:adjusted_batch_sz]
        estimated_returns[computed_return_batch_inds] = \
            self.return_function.batch_iterative_return_function(tjs_rewards, tjs_actions, tjs_qvalues,
                                                                 tjs_terminations, tjs_timeout,
                                                                 tjs_bprobabilities,
                                                                 adjusted_batch_sz)

        computed_return_buffer_inds = computed_return_buffer_inds[:adjusted_batch_sz]
        self.estimated_return.data.put(
            indices=bf_start + computed_return_buffer_inds,
            values=estimated_returns[computed_return_batch_inds],
            mode='wrap')
        self.up_to_date.data.put(indices=bf_start +
                                 computed_return_buffer_inds,
                                 values=True,
                                 mode='wrap')
        return sample_states, sample_actions, estimated_returns

    def ready_to_sample(self):
        return self.batch_sz < (self.current_index - (self.n + 1))

    def out_of_date(self):
        self.up_to_date.data[:] = False
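Both sample_indices and get_data (here and in Code Example #3) lean on a NumPy detail: np.argmax over a boolean array returns the position of the first True, and returns 0 when no element is True. That is why right_terminal_stop falls back to self.n when the argmax is 0; position 0 corresponds to the sampled index itself, which is asserted non-terminal, so a 0 can only mean "no terminal in the window". A quick standalone check:

import numpy as np

print(np.argmax(np.array([False, False, True, True])))   # -> 2 (first True)
print(np.argmax(np.array([False, False, False, False]))) # -> 0 (no True at all)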
Code Example #6
class OnPolicyQSigmaExperienceReplayBuffer:
    def __init__(self, config, return_function):
        """ Parameters:
        Name:               Type:           Default:            Description: (Omitted when self-explanatory)
        buff_sz             int             10                  buffer size
        batch_sz            int             1
        frame_stack         int             4                   number of frames to stack, see Mnih et al. (2015)
        env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
        num_actions         int             2                   number of actions available to the agent
        obs_dtype           np.dtype        np.uint8            the data type of the observations
        reward_clipping     bool            False               clipping the reward, see Mnih et al. (2015)
        sigma               float           0.5                 Sigma parameter, see De Asis et al. (2018)
        sigma_decay         float           1.0                 decay rate of sigma
        """
        self.config = config
        self.buff_sz = check_attribute_else_default(config, 'buff_sz', 10)
        self.batch_sz = check_attribute_else_default(config, 'batch_sz', 1)
        self.frame_stack = check_attribute_else_default(
            config, 'frame_stack', 4)
        self.env_state_dims = list(
            check_attribute_else_default(config, 'env_state_dims', [2, 2]))
        self.num_actions = check_attribute_else_default(
            config, 'num_actions', 2)
        self.obs_dtype = check_attribute_else_default(config, 'obs_dtype',
                                                      np.uint8)
        self.reward_clipping = check_attribute_else_default(
            config, 'reward_clipping', False)
        self.sigma = check_attribute_else_default(config, 'sigma', 0.5)
        self.sigma_decay = check_attribute_else_default(
            config, 'sigma_decay', 1.0)
        """ Parameters for Return Function """
        assert isinstance(return_function, OnPolicyQSigmaReturnFunction)
        self.return_function = return_function
        self.n = return_function.n
        """ Parameters to keep track of the current state of the buffer """
        self.current_index = 0
        self.full_buffer = False
        """ Circular Buffers """
        self.state = CircularBuffer(self.buff_sz,
                                    shape=tuple(self.env_state_dims),
                                    dtype=self.obs_dtype)
        self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
        self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
        self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
        # Note: this circular buffer shadows the scalar self.sigma read from config above.
        self.sigma = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)

    def store_observation(self, observation):
        assert isinstance(observation, dict)
        assert all(akey in observation.keys()
                   for akey in ["reward", "action", "state", "terminate"])
        reward = observation["reward"]
        if self.reward_clipping:
            if reward > 0: reward = 1
            elif reward < 0: reward = -1

        self.state.append(observation["state"])
        self.action.append(observation["action"])
        self.reward.append(reward)
        self.terminate.append(observation["terminate"])
        self.sigma.append(self.return_function.sigma)

        self.current_index += 1
        if self.current_index >= self.buff_sz:
            self.current_index = 0
            self.full_buffer = True

        assert hasattr(self.config, 'initial_rand_steps')
        assert hasattr(self.config, 'rand_steps_count')
        if (observation['terminate'] and
                self.config.rand_steps_count >= self.config.initial_rand_steps):
            self.return_function.adjust_sigma()

    def sample_indices(self):
        # Rejection sampling: redraw until batch_sz non-terminal indices are found.
        sample_inds = np.zeros(self.batch_sz, dtype=np.int32)
        count = 0
        while count != self.batch_sz:
            if not self.full_buffer:
                idx = np.random.randint(self.frame_stack - 1,
                                        self.current_index - (self.n + 1))
            else:
                idx = np.random.randint(self.frame_stack - 1,
                                        self.buff_sz - 1 - (self.n + 1))
            if not self.terminate[idx]:
                sample_inds[count] = idx
                count += 1
        return sample_inds

    def get_data(self, update_function):
        indices = self.sample_indices()

        sample_states = np.zeros(
            (self.batch_sz, self.frame_stack) + tuple(self.env_state_dims),
            dtype=self.obs_dtype)
        sample_actions = self.action.take(indices)

        # Abbreviations: tj = trajectory, tjs = trajectories
        tjs_states = np.zeros(
            shape=(self.batch_sz * self.n, self.frame_stack) +
            tuple(self.env_state_dims),
            dtype=self.obs_dtype)
        tjs_actions = np.zeros(self.batch_sz * self.n, np.uint8)
        tjs_rewards = np.zeros(self.batch_sz * self.n, np.int32)
        tjs_terminations = np.ones(self.batch_sz * self.n, np.bool_)
        tjs_sigmas = np.ones(self.batch_sz * self.n, np.float64)

        batch_idx = 0
        tj_start_idx = 0
        tjs_slices = [None] * self.batch_sz
        for idx in indices:
            assert not self.terminate[idx]
            start_idx = idx - (self.frame_stack - 1)
            # First terminal state from the left. Reversed because we want to find the first terminal state before the
            # current state
            left_terminal_rev = self.terminate.take(
                start_idx + np.arange(self.frame_stack))[::-1]
            left_terminal_rev_idx = np.argmax(left_terminal_rev)
            left_terminal_idx = 0 if left_terminal_rev_idx == 0 else (
                self.frame_stack - 1) - left_terminal_rev_idx

            # First terminal state from center to right
            right_terminal = self.terminate.take(idx + np.arange(self.n + 1))
            right_terminal_true_idx = np.argmax(right_terminal)
            right_terminal_stop = self.n if right_terminal_true_idx == 0 else right_terminal_true_idx

            # trajectory indices
            tj_end_idx = tj_start_idx + right_terminal_stop - 1
            tj_slice = slice(tj_start_idx, tj_end_idx + 1)
            tjs_slices[batch_idx] = tj_slice
            tj_indices = idx + 1 + np.arange(right_terminal_stop)

            # Collecting: trajectory actions, rewards, terminations, bprobabilities, and sigmas
            tjs_actions[tj_slice] = self.action.take(tj_indices)
            tjs_rewards[tj_slice] = self.reward.take(tj_indices)
            tjs_terminations[tj_slice] = self.terminate.take(tj_indices)
            tjs_sigmas[tj_slice] = self.sigma.take(tj_indices)

            # Stacks of states
            trj_state_stack_sz = self.frame_stack + right_terminal_stop
            trj_state_stack = self.state.take(start_idx +
                                              np.arange(trj_state_stack_sz))
            trj_state_stack[:left_terminal_idx] *= 0

            state_stack_slices = np.arange(trj_state_stack_sz - self.frame_stack + 1)[:, None] \
                                 + np.arange(self.frame_stack)
            state_stacks = trj_state_stack.take(state_stack_slices, axis=0)

            sample_states[batch_idx] = state_stacks[0]
            tjs_states[tj_slice] = state_stacks[1:]

            tj_start_idx += self.n
            batch_idx += 1

        # We wait until the end to retrieve the q_values because it's more efficient to make only one call to
        # update_function when using a gpu.
        tjs_qvalues = update_function(
            np.squeeze(tjs_states),
            reshape=False).reshape([self.batch_sz, self.n, self.num_actions])
        tjs_actions = tjs_actions.reshape([self.batch_sz, self.n])
        tjs_rewards = tjs_rewards.reshape([self.batch_sz, self.n])
        tjs_terminations = tjs_terminations.reshape([self.batch_sz, self.n])
        tjs_sigmas = tjs_sigmas.reshape([self.batch_sz, self.n])

        estimated_returns = self.return_function.batch_iterative_return_function(
            tjs_rewards, tjs_actions, tjs_qvalues, tjs_terminations,
            tjs_sigmas, self.batch_sz)
        return sample_states, sample_actions, estimated_returns

    def ready_to_sample(self):
        return self.batch_sz < (self.current_index -
                                (self.n + self.frame_stack))
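The left-terminal handling in get_data zeroes any frames in the stack that belong to a previous episode, so an episode boundary inside the frame stack does not leak stale frames into the stacked observation. A toy sketch of the index arithmetic with hypothetical values:

import numpy as np

frame_stack = 4
# Terminal flag at stack slot 1, i.e. an episode ended two frames back.
terminate = np.array([False, True, False, False])
left_terminal_rev_idx = np.argmax(terminate[::-1])
left_terminal_idx = (0 if left_terminal_rev_idx == 0
                     else (frame_stack - 1) - left_terminal_rev_idx)
print(left_terminal_idx)  # -> 1: frames strictly before slot 1 are zeroed
frames = np.ones((frame_stack, 2, 2), dtype=np.uint8)
frames[:left_terminal_idx] *= 0  # mirrors trj_state_stack[:left_terminal_idx] *= 0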
Code Example #7
 def __init__(self, config, return_function):
     """ Parameters:
     Name:               Type:           Default:            Description: (Omitted when self-explanatory)
     buff_sz             int             10                  buffer size
     batch_sz            int             1
     env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
     num_actions         int             2                   number of actions available to the agent
     obs_dtype           np.dtype        np.uint8            the data type of the observations
     sigma               float           0.5                 Sigma parameter, see De Asis et al. (2018)
     sigma_decay         float           1.0                 decay rate of sigma
     decay_type          string          exp                 decay type of sigma. Options: exp and lin
     decay_freq          int             1                   how often to decay sigma, e.g. a decay frequency of
                                                             10 would apply the decay once every 10 episodes
     sigma_min           float           0                   the lowest value sigma can attain when decaying
     store_bprobs        bool            False               whether to store and use the behaviour policy probabilities
                                                             for the return function
     store_sigma         bool            False               whether to store sigma at every time step and use
                                                             the stored sigmas to compute the return. True = use the
                                                             sigma from the buffer, False = use the current sigma
     initial_rand_steps  int             0                   number of random steps before decaying sigma
     rand_steps_count    int             0                   number of random steps taken so far
     store_return        bool            True                save the computed return so that it can be reused
     """
     assert isinstance(config, Config)
     self.config = config
     self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
     self.batch_sz = check_attribute_else_default(self.config, 'batch_sz',
                                                  1)
     self.env_state_dims = list(
         check_attribute_else_default(self.config, 'env_state_dims',
                                      [2, 2]))
     self.num_actions = check_attribute_else_default(
         self.config, 'num_actions', 2)
     self.obs_dtype = check_attribute_else_default(self.config, 'obs_dtype',
                                                   np.uint8)
     self.sigma = check_attribute_else_default(self.config, 'sigma', 0.5)
     self.sigma_decay = check_attribute_else_default(
         self.config, 'sigma_decay', 1.0)
     self.decay_type = check_attribute_else_default(self.config,
                                                    'decay_type', 'exp')
     self.decay_freq = check_attribute_else_default(self.config,
                                                    'decay_freq', 1)
     self.sigma_min = check_attribute_else_default(self.config, 'sigma_min',
                                                   0.0)
     self.store_bprobs = check_attribute_else_default(
         self.config, 'store_bprobs', False)
     self.store_sigma = check_attribute_else_default(
         self.config, 'store_sigma', False)
     self.initial_rand_steps = check_attribute_else_default(
         self.config, 'initial_rand_steps', 0)
     check_attribute_else_default(self.config, 'rand_steps_count', 0)
     self.store_return = check_attribute_else_default(
         self.config, 'store_return', True)
     """ Parameters for Return Function """
     assert isinstance(return_function, QSigmaReturnFunction)
     self.return_function = return_function
     self.n = return_function.n
     """ Termination or Timeout Count for Applying the Decay on Sigma """
     self.episodes_since_last_decay = 0
     """ Parameters to keep track of the current state of the buffer """
     self.current_index = 0
     self.full_buffer = False
     """ Circular Buffers """
     self.state = CircularBuffer(self.buff_sz,
                                 shape=tuple(self.env_state_dims),
                                 dtype=self.obs_dtype)
     self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
     self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
     self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     self.timeout = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     if self.store_bprobs:
         self.bprobabilities = CircularBuffer(self.buff_sz,
                                              shape=(self.num_actions, ),
                                              dtype=np.float64)
     if self.store_sigma:
         self.sigma_buffer = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.estimated_return = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
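adjust_sigma itself is not part of these snippets. From the parameters documented above (decay_type 'exp' or 'lin', decay_freq, sigma_min), a plausible sketch of a decay step applied once every decay_freq episodes, offered purely as an assumption about what such a function might do:

def decay_sigma(sigma, sigma_decay, decay_type='exp', sigma_min=0.0):
    # Hypothetical sketch, not the project's implementation.
    if decay_type == 'exp':
        sigma *= sigma_decay      # geometric decay
    elif decay_type == 'lin':
        sigma -= sigma_decay      # arithmetic decay
    return max(sigma, sigma_min)  # never drop below sigma_min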