Code example #1
    def finish_path(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off
        by an epoch ending. This looks back in the buffer to where the
        trajectory started, and uses rewards and value estimates from
        the whole trajectory to compute advantage estimates with GAE-Lambda,
        as well as compute the rewards-to-go for each state, to use as
        the targets for the value function.

        The "last_val" argument should be 0 if the trajectory ended
        because the agent reached a terminal state (died), and otherwise
        should be V(s_T), the value function estimated for the last state.
        This allows us to bootstrap the reward-to-go calculation to account
        for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
        """

        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # the next two lines implement GAE-Lambda advantage calculation
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = core.discount_cumsum(
            deltas, self.gamma * self.lam)

        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]

        self.path_start_idx = self.ptr
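
All of the examples on this page lean on core.discount_cumsum, which is not shown. Below is a minimal reference sketch of what that helper has to compute, inferred from how it is used above; a plain NumPy loop makes the semantics explicit, while the real helper may use a faster scipy.signal.lfilter formulation that produces the same values.

    import numpy as np

    def discount_cumsum(x, discount):
        """
        Discounted cumulative sum, computed back to front:
        out[i] = x[i] + discount * x[i+1] + discount**2 * x[i+2] + ...
        """
        out = np.zeros(len(x))
        running = 0.0
        for t in reversed(range(len(x))):
            running = x[t] + discount * running
            out[t] = running
        return out

    # e.g. discount_cumsum([1.0, 1.0, 0.0], 0.99) -> [1.99, 1.0, 0.0]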
Code example #2
    def culculate_adv_buf(self):
        for path_slice in self.slicelist:
            # prepend a zero to the stored rewards for this path
            rews = np.insert(self.rew_buf[path_slice], 0, 0)
            # append the last stored reward as the bootstrap value for the final state
            vals = np.append(self.val_buf[path_slice], rews[-1])

            # the next two lines implement GAE-Lambda advantage calculation
            deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
            self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)

            # the next line computes rewards-to-go, to be targets for the value function
            self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]
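
Note that this variant indexes rewards differently from code example #1: it prepends a zero reward and bootstraps with the last stored reward rather than an explicit last_val argument. A small runnable check of what those two lines produce for a toy path (all numbers are illustrative, not from the original):

    import numpy as np

    gamma, lam = 0.99, 0.95
    rew_path = np.array([1.0, 1.0])           # rewards stored for one path
    val_path = np.array([0.5, 0.8])           # value estimates for the same path

    rews = np.insert(rew_path, 0, 0)          # [0.0, 1.0, 1.0]
    vals = np.append(val_path, rews[-1])      # [0.5, 0.8, 1.0]

    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    # deltas[0] = 0.0 + 0.99*0.8 - 0.5 = 0.292
    # deltas[1] = 1.0 + 0.99*1.0 - 0.8 = 1.19

    # GAE-Lambda advantage via a plain backward pass
    # (equivalent to core.discount_cumsum(deltas, gamma * lam))
    adv, running = np.zeros(len(deltas)), 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running

    print(deltas, adv)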
Code example #3
  def finish_path(self, last_val=0):
    """
    "Call this at the end of a trajectory, or when one gets cut off
    by an epoch ending.  This looks back in the buffer to where the
    trajectory started, and uses rewards and value estimates from
    the whole trajectory to compute advantage estimates with GAE-Lambda,
    as well as compute the rewards-to-go for each state, to use as
    the targets for the value function.

    The 'last_val' argument should be 0 if the trajectory ended
    because the agent reached a terminal state (died), and otherwise
    should be V(s_T), the value function estimated for the last state.
    This allows us to bootstrap the reward-to-go calculation to account
    for timesteps beyond the arbitrary episode horizon (or epoch cutoff)."
    -OpenAI
    """
    # a slice object which can be used to index into arrays
    # to get the relevant data for the current path
    path_slice = slice(self.path_start_idx, self.ptr)

    # rewards & vals for this path, including the value estimate
    # of the final state in the path [0 if it is a terminal state]
    rews = np.append(self.rew_buf[path_slice], last_val)
    vals = np.append(self.val_buf[path_slice], last_val)

    # "the next two lines implement GAE-Lambda" advantage calculation" -OpenAI

    # okay, I'm gonna put in some comments to explain what is going on here.

    # first we calculate 'deltas'.  This is an array such that
    # deltas[i] = (rews[i] + gamma * vals[i + 1]) - vals[i]
    # in other words, deltas[i] is the difference between our
    # new one-step estimate of the value of state[i], and
    # the previous value function's estimate of the value of state[i]
    #
    # our new estimate of the value of state[i]
    # is based on the reward achieved at time step i, plus gamma times our old
    # value function's estimate of the value of state[i + 1]

    # so deltas[i] is an estimate of how much better the trajectory
    # was than the expected values for the current policy.
    # Note that this estimate is based ONLY ON ONE REWARD,
    # namely that received at time step i.
    deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
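    # worked example with illustrative numbers (not from the original code):
    #   rews = [1.0, 1.0, last_val=0.0], vals = [0.5, 0.8, last_val=0.0], gamma = 0.99
    #   deltas[0] = 1.0 + 0.99*0.8 - 0.5 = 1.292
    #   deltas[1] = 1.0 + 0.99*0.0 - 0.8 = 0.2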

    # then we determine the "advantage" for each of these times
    # functionally, adv_buf[i] is similar to deltas[i] in that
    # it provides a measure of how much better this trajectory was
    # than that expected under the given policy.
    # however, this measure of advantage integrates information
    # from the entire trajectory, rather than just from a single reward
    
    # adv_buf[i] = delta_i + gamma * lambda * delta_(i + 1) + (gamma * lambda)^2 * delta_(i + 2) + ...
    # in other words, it is a sum of the delta values over the remainder of this trajectory,
    # where the term for delta[i + k] is discounted by (gamma * lambda)^k = lambda^k * gamma^k
    # Discounting by gamma^k is expected, since delta[i + k] occurs k time steps in the future.
    # we discount by lambda^k based on the approximation that the further in the future
    # we go, the less influence the current state has on the reward received.
    # in other words, we have the lambda term so that we consider advantage observed very far
    # in the future to have less to do with present actions than advantage observed in the short term
    self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)
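    # continuing the worked example above, with lambda = 0.95:
    #   adv[1] = deltas[1] = 0.2
    #   adv[0] = deltas[0] + (0.99*0.95)*deltas[1] = 1.292 + 0.9405*0.2 = 1.4801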

    # TODO: consider integrating eligibility traces into this implementation
    # this may already be done in some form which I just haven't understood yet,
    # and it may be that the format of the algorithm doesn't allow for it to be
    # done in any useful way, but this is worth coming back to. -george 2019-02-16

    # fill in the return buffer with the discounted sum of the rewards
    # received during this path
    self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]
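    # continuing the worked example (rews = [1.0, 1.0, 0.0], gamma = 0.99):
    #   ret[1] = 1.0 + 0.99*0.0    = 1.0
    #   ret[0] = 1.0 + 0.99*ret[1] = 1.99
    #   the trailing entry for last_val is dropped by the [:-1]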

    # advance the path_start_idx so it points to the beginning
    # of the next path which will be put into the buffer
    self.path_start_idx = self.ptr
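
To make the docstring's rule for last_val concrete: the same stored rewards produce different value-function targets depending on whether the path ended at a terminal state (last_val=0) or was cut off and bootstrapped with V(s_T). A small self-contained check (the rewards and the 5.0 value estimate are illustrative, not from the original):

    import numpy as np

    def discount_cumsum(x, discount):
        # same semantics as the core.discount_cumsum sketch above
        out, running = np.zeros(len(x)), 0.0
        for t in reversed(range(len(x))):
            running = x[t] + discount * running
            out[t] = running
        return out

    gamma = 0.99
    path_rews = np.array([1.0, 1.0, 1.0])

    # path ended at a terminal state: nothing beyond it to bootstrap
    rets_terminal = discount_cumsum(np.append(path_rews, 0.0), gamma)[:-1]
    # path was cut off by the epoch: bootstrap with V(s_T), here taken to be 5.0
    rets_cutoff = discount_cumsum(np.append(path_rews, 5.0), gamma)[:-1]

    print(rets_terminal)  # [2.9701, 1.99, 1.0]
    print(rets_cutoff)    # roughly [7.8216, 6.8905, 5.95]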