Example 1
 def compute_returns(self, T):
     """e.g. if 2-step return, t-1 is first return written here, using reward
     at t-1 and new reward at t (up through t-1+T from t+T)."""
     if self.n_step_return == 1:
         return  # return = reward, done_n = done
     t, s = self.t, self.samples
     nm1 = self.n_step_return - 1
     if t - nm1 >= 0 and t + T <= self.T:  # No wrap (operate in-place).
         reward = s.reward[t - nm1:t + T]
         done = s.done[t - nm1:t + T]
         return_dest = self.samples_return_[t - nm1:t - nm1 + T]
         done_n_dest = self.samples_done_n[t - nm1:t - nm1 + T]
         discount_return_n_step(reward,
                                done,
                                n_step=self.n_step_return,
                                discount=self.discount,
                                return_dest=return_dest,
                                done_n_dest=done_n_dest)
     else:  # Wrap (copies); Let it (wrongly) wrap at first call.
         idxs = np.arange(t - nm1, t + T) % self.T  # Modulo buffer length self.T, not new-timestep count T.
         reward = s.reward[idxs]
         done = s.done[idxs]
         dest_idxs = idxs[:-nm1]
         return_, done_n = discount_return_n_step(reward,
                                                  done,
                                                  n_step=self.n_step_return,
                                                  discount=self.discount)
         self.samples_return_[dest_idxs] = return_
         self.samples_done_n[dest_idxs] = done_n
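
Both of the first two examples delegate the actual return computation to discount_return_n_step, which is not shown on this page.  Below is a minimal NumPy sketch of what that helper computes; the signature is inferred from how it is called above, and the real rlpyt implementation (which also supports a do_truncated option) may differ in detail.

    import numpy as np

    def discount_return_n_step_sketch(reward, done, n_step, discount,
                                      return_dest=None, done_n_dest=None):
        """Sketch only: n-step discounted returns, truncated at episode ends.
        Output length is len(reward) - (n_step - 1), i.e. only timesteps whose
        full n-step return is available, matching the slicing used above."""
        T = len(reward) - (n_step - 1)
        return_ = np.zeros(T, dtype=reward.dtype) if return_dest is None else return_dest
        done_n = np.zeros(T, dtype=bool) if done_n_dest is None else done_n_dest
        return_[:] = reward[:T]
        done_n[:] = done[:T]
        for k in range(1, n_step):
            # Add the reward k steps ahead, but only where no episode boundary
            # has been crossed yet; then fold that step's done flag into done_n.
            return_ += (discount ** k) * reward[k:k + T] * (1 - done_n)
            done_n[:] = np.logical_or(done_n, done[k:k + T])
        return return_, done_n

With n_step == 1 the loop never runs and the outputs are just reward and done, which is why the methods above return early in that case.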
Example 2
 def compute_returns(self, T):
     """Compute the n-step returns using the new rewards just written into
     the buffer, but before the buffer cursor is advanced.  Input ``T`` is
     the number of new timesteps which were just written.
     Does nothing if `n-step==1`. e.g. if 2-step return, t-1
     is first return written here, using reward at t-1 and new reward at t
     (up through t-1+T from t+T)."""
     if self.n_step_return == 1:
         return  # return = reward, done_n = done
     t, s = self.t, self.samples
     nm1 = self.n_step_return - 1
     if t - nm1 >= 0 and t + T <= self.T:  # No wrap (operate in-place).
         reward = s.reward[t - nm1:t + T]
         done = s.done[t - nm1:t + T]
         return_dest = self.samples_return_[t - nm1:t - nm1 + T]
         done_n_dest = self.samples_done_n[t - nm1:t - nm1 + T]
         discount_return_n_step(
             reward,
             done,
             n_step=self.n_step_return,
             discount=self.discount,
             return_dest=return_dest,
             done_n_dest=done_n_dest,
         )
     else:  # Wrap (copies); Let it (wrongly) wrap at first call.
         idxs = np.arange(t - nm1, t + T) % self.T
         reward = s.reward[idxs]
         done = s.done[idxs]
         dest_idxs = idxs[:-nm1]
         return_, done_n = discount_return_n_step(reward,
                                                  done,
                                                  n_step=self.n_step_return,
                                                  discount=self.discount)
         self.samples_return_[dest_idxs] = return_
         self.samples_done_n[dest_idxs] = done_n
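
For context, compute_returns is meant to be called from the buffer's append path, after the new rewards are written but before the cursor moves.  The sketch below is illustrative only; the append_samples name and the exact fields written mirror the snippet above rather than the actual rlpyt buffer code.

     def append_samples(self, samples):
         """Hypothetical append path showing where compute_returns fits in."""
         T = samples.reward.shape[0]  # Number of new timesteps.
         idxs = np.arange(self.t, self.t + T) % self.T  # Circular write slots.
         self.samples.reward[idxs] = samples.reward
         self.samples.done[idxs] = samples.done
         self.compute_returns(T)  # Cursor self.t still points at the old position.
         self.t = (self.t + T) % self.T  # Advance the cursor only afterwards.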
Example 3
    def compute_ul_returns(self, T):
        """Compute the n-step returns using the new rewards just written into
        the buffer, but before the buffer cursor is advanced.  Input ``T`` is
        the number of new timesteps which were just written.
        Does nothing if `n-step==1`. e.g. if 2-step return, t-1
        is first return written here, using reward at t-1 and new reward at t
        (up through t-1+T from t+T).

        Use ABSOLUTE VALUE of rewards...it's all good signal for prioritization.
        """
        t, nm1 = self.replay_buffer.t, self.n_step_return - 1
        if self.n_step_return == 1:
            idxs = np.arange(t - nm1, t + T) % self.replay_buffer.T
            return_ = np.abs(self.samples_reward[idxs])
            return return_  # return = reward, done_n = done
        if (
            t - nm1 >= 0 and t + T <= self.replay_buffer.T
        ):  # No wrap (operate in-place).
            reward = np.abs(self.samples_reward[t - nm1 : t + T])
            done = self.samples_done[t - nm1 : t + T]
            return_dest = self.samples_return_[t - nm1 : t - nm1 + T]
            done_n_dest = self.samples_done_n[t - nm1 : t - nm1 + T]
            discount_return_n_step(
                reward,
                done,
                n_step=self.n_step_return,
                discount=self.replay_buffer.discount,
                return_dest=return_dest,
                done_n_dest=done_n_dest,
            )
            return return_dest.copy()
        else:  # Wrap (copies); Let it (wrongly) wrap at first call.
            idxs = np.arange(t - nm1, t + T) % self.replay_buffer.T
            reward = np.abs(self.samples_reward[idxs])
            done = self.samples_done[idxs]
            dest_idxs = idxs[:-nm1]
            return_, done_n = discount_return_n_step(
                reward,
                done,
                n_step=self.n_step_return,
                discount=self.replay_buffer.discount,
            )
            self.samples_return_[dest_idxs] = return_
            self.samples_done_n[dest_idxs] = done_n
            return return_
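
A toy check of why absolute rewards are "all good signal" for prioritization, using the discount_return_n_step_sketch given after Example 1 (the numbers below are illustrative, not taken from rlpyt):

    import numpy as np

    reward = np.array([0., -1., 0., 2., 0.], dtype=np.float32)
    done = np.zeros(5, dtype=bool)
    ret, _ = discount_return_n_step_sketch(np.abs(reward), done,
                                           n_step=2, discount=0.99)
    print(ret)  # [0.99, 1.0, 1.98, 2.0]: steps within n_step of any
                # large-magnitude reward get a large priority, regardless of sign.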
Example 4
File: r2d1.py Project: zikkat/rlpyt
 def compute_input_priorities(self, samples):
     """Just for first input into replay buffer.
     Simple 1-step return TD-errors using recorded Q-values from online
     network and value scaling, with the T dimension reduced away (same
     priority applied to all samples in this batch; wherever the rnn state
     is kept--hopefully the first step--this priority will apply there).
     The samples duration T might be less than the training segment, so
     this is an approximation of an approximation, but hopefully will
     capture the right behavior.
     UPDATE 20190826: Trying using n-step returns.  For now using samples
     with full n-step return available...later could also use partial
     returns for samples at end of batch.  35/40 ain't bad tho.
     Might not carry/use internal state here, because might get executed
     by alternating memory copiers in async mode; do all with only the
     samples available from input."""
     samples = torchify_buffer(samples)
     q = samples.agent.agent_info.q
     action = samples.agent.action
     q_max = torch.max(q, dim=-1).values
     q_at_a = select_at_indexes(action, q)
     return_n, done_n = discount_return_n_step(
         reward=samples.env.reward,
         done=samples.env.done,
         n_step=self.n_step_return,
         discount=self.discount,
         do_truncated=False,  # Only samples with full n-step return.
     )
     # y = self.value_scale(
     #     samples.env.reward[:-1] +
     #     (self.discount * (1 - samples.env.done[:-1].float()) *  # probably done.float()
     #         self.inv_value_scale(q_max[1:]))
     # )
     nm1 = max(1,
               self.n_step_return - 1)  # At least 1 bc don't have next Q.
     y = self.value_scale(return_n + (1 - done_n.float()) *
                          self.inv_value_scale(q_max[nm1:]))
     delta = abs(q_at_a[:-nm1] - y)
     # NOTE: by default, with R2D1, use squared-error loss, delta_clip=None.
     if self.delta_clip is not None:  # Huber loss.
         delta = torch.clamp(delta, 0, self.delta_clip)
     valid = valid_from_done(samples.env.done[:-nm1])
     max_d = torch.max(delta * valid, dim=0).values
     mean_d = valid_mean(delta, valid, dim=0)  # Still high if less valid.
     priorities = self.pri_eta * max_d + (1 - self.pri_eta) * mean_d  # [B]
     return priorities.numpy()
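
The value_scale / inv_value_scale calls are defined elsewhere in r2d1.py; they follow the invertible value rescaling from the R2D2 paper, h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x.  A standalone sketch of that transform pair is below (the epsilon value of 1e-3 is the commonly cited default and is an assumption, not read from this file):

    import torch

    EPS = 1e-3  # Assumed value-rescaling epsilon.

    def value_scale(x):
        """h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x."""
        return torch.sign(x) * (torch.sqrt(torch.abs(x) + 1.) - 1.) + EPS * x

    def inv_value_scale(z):
        """Inverse of h, so inv_value_scale(value_scale(x)) recovers x."""
        return torch.sign(z) * (((torch.sqrt(
            1. + 4. * EPS * (torch.abs(z) + 1. + EPS)) - 1.) / (2. * EPS)) ** 2 - 1.)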
Example 5
    def compute_input_priorities(self, samples):
        """Used when putting new samples into the replay buffer.  Computes
        n-step TD-errors using recorded Q-values from online network and
        value scaling.  Weights the max and the mean TD-error over each sequence
        to make a single priority value for that sequence.  

        Note:
            Although the original R2D2 implementation used the entire
            80-step sequence to compute the input priorities, we ran R2D1 with 40
            time-step sample batches, and so computed the priority for each
            80-step training sequence based on one of the two 40-step halves.
            Algorithm argument ``input_priority_shift`` determines which 40-step
            half is used as the priority for the 80-step sequence.  (Since this 
            method might get executed by alternating memory copiers in async mode,
            don't carry internal state here, do all computation with only the samples
            available in input.  Could probably reduce to one memory copier and keep
            state there, if needed.)
        """

        # """Just for first input into replay buffer.
        # Simple 1-step return TD-errors using recorded Q-values from online
        # network and value scaling, with the T dimension reduced away (same
        # priority applied to all samples in this batch; whereever the rnn state
        # is kept--hopefully the first step--this priority will apply there).
        # The samples duration T might be less than the training segment, so
        # this is an approximation of an approximation, but hopefully will
        # capture the right behavior.
        # UPDATE 20190826: Trying using n-step returns.  For now using samples
        # with full n-step return available...later could also use partial
        # returns for samples at end of batch.  35/40 ain't bad tho.
        # Might not carry/use internal state here, because might get executed
        # by alternating memory copiers in async mode; do all with only the
        # samples avialable from input."""
        samples = torchify_buffer(samples)
        q = samples.agent.agent_info.q
        action = samples.agent.action
        q_max = torch.max(q, dim=-1).values
        q_at_a = select_at_indexes(action, q)
        return_n, done_n = discount_return_n_step(
            reward=samples.env.reward,
            done=samples.env.done,
            n_step=self.n_step_return,
            discount=self.discount,
            do_truncated=False,  # Only samples with full n-step return.
        )
        # y = self.value_scale(
        #     samples.env.reward[:-1] +
        #     (self.discount * (1 - samples.env.done[:-1].float()) *  # probably done.float()
        #         self.inv_value_scale(q_max[1:]))
        # )
        nm1 = max(1, self.n_step_return - 1)  # At least 1 bc don't have next Q.
        y = self.value_scale(return_n +
            (1 - done_n.float()) * self.inv_value_scale(q_max[nm1:]))
        delta = abs(q_at_a[:-nm1] - y)
        # NOTE: by default, with R2D1, use squared-error loss, delta_clip=None.
        if self.delta_clip is not None:  # Huber loss.
            delta = torch.clamp(delta, 0, self.delta_clip)
        valid = valid_from_done(samples.env.done[:-nm1])
        max_d = torch.max(delta * valid, dim=0).values
        mean_d = valid_mean(delta, valid, dim=0)  # Still high if less valid.
        priorities = self.pri_eta * max_d + (1 - self.pri_eta) * mean_d  # [B]
        return priorities.numpy()
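
The final mixing step is the R2D2 priority heuristic p = eta * max_t(delta_t) + (1 - eta) * mean_t(delta_t).  A tiny standalone check of that blend (eta = 0.9 is R2D2's published value and is assumed here; the validity masking used in the method above is omitted for brevity):

    import torch

    pri_eta = 0.9
    delta = torch.tensor([[0.1, 2.0],   # TD-error magnitudes, shape [T, B].
                          [0.3, 0.0],
                          [0.2, 0.5]])
    max_d = torch.max(delta, dim=0).values   # -> [0.3, 2.0]
    mean_d = delta.mean(dim=0)               # -> [0.2, 0.8333]
    priorities = pri_eta * max_d + (1 - pri_eta) * mean_d
    print(priorities)  # ~[0.29, 1.8833]: one large error dominates the priority.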