    def process_rollout(self, rollout, gamma, lambda_=1.0):
        """
        Given a rollout, compute its returns and the advantage.
        """
        batch_si = np.asarray(rollout.states)
        batch_a = np.asarray(rollout.actions)
        rewards = np.asarray(rollout.rewards)
        time = np.asarray(rollout.time)
        meta = np.asarray(rollout.meta)
        vpred_t = np.asarray(rollout.values + [rollout.r])

        rewards_plus_v = np.asarray(rollout.rewards + [rollout.r])
        batch_r = util.discount(rewards_plus_v, gamma)[:-1]
        # temporal-difference residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]
        # this formula for the advantage comes from "Generalized Advantage Estimation":
        # https://arxiv.org/abs/1506.02438
        batch_adv = util.discount(delta_t, gamma * lambda_)

        features = rollout.features[0]
        return util.Batch(si=batch_si,
                          a=batch_a,
                          adv=batch_adv,
                          r=batch_r,
                          terminal=rollout.terminal,
                          features=features,
                          reward=rewards,
                          step=time,
                          meta=meta)
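
These snippets lean on a `util` module that is not shown. As a rough sketch, `util.Batch` is presumably a simple named container and `util.discount` the usual backward discounted cumulative sum (written here with scipy.signal.lfilter); the actual module may differ.

import collections
import numpy as np
import scipy.signal

Batch = collections.namedtuple(
    "Batch", ["si", "a", "adv", "r", "terminal", "features", "reward", "step", "meta"])

def discount(x, gamma):
    # backward discounted cumulative sum: y[t] = sum_{k >= t} gamma**(k - t) * x[k]
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
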
Example 2
    def sample(self, length):
        size = len(self.s)
        is_initial_state = False
        if self.sampling == 'rand':
            # pick a random start index; resample if it lands on a terminal step
            idx = np.random.randint(0, size-1)
            if self.term[idx]:
                return self.sample(length)
            # extend the segment until a terminal step, the end of the buffer,
            # or the requested length is reached
            for end_idx in range(idx, idx + length):
                if self.term[end_idx] or end_idx == size-1:
                    break
            is_initial_state = (idx > 0 and self.term[idx-1]) or idx == 0
        else:
            # sequential sampling: continue from where the previous call stopped
            idx = self.sample_idx
            if self.term[idx]:
                idx = idx + 1
            for end_idx in range(idx, idx + length):
                if self.term[end_idx] or end_idx == size-1:
                    break
            self.sample_idx = end_idx + 1 if end_idx < size-1 else 0
            is_initial_state = (idx > 0 and self.term[idx-1]) or idx == 0

        assert end_idx == idx + length - 1 or self.term[end_idx] or end_idx == size-1
        return util.Batch(si=np.asarray(self.s[idx:end_idx+1]),
                          a=np.asarray(self.a[idx:end_idx+1]),
                          adv=None,
                          r=None,
                          terminal=self.term,  # full terminal-flag list (not sliced to the segment)
                          features=[],
                          reward=np.asarray(self.r[idx:end_idx+1]),
                          step=np.asarray(self.t[idx:end_idx+1]),
                          meta=np.asarray(self.r_t[idx:end_idx+1])), is_initial_state
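
The segment-selection loop above (walk forward from idx, stopping at a terminal step or the end of the buffer) can be isolated as a small helper. The sketch below mirrors that logic; the function name is illustrative only.

def segment_end(term, idx, length):
    # last index of a segment starting at idx: stop early at a terminal step
    # or at the end of the buffer, otherwise after `length` steps
    size = len(term)
    for end_idx in range(idx, idx + length):
        if term[end_idx] or end_idx == size - 1:
            break
    return end_idx

# e.g. segment_end([False, False, True, False], idx=0, length=10) returns 2
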
Example 3
    def process_rollout(self, rollout, gamma, lambda_=1.0):
        """
    given a rollout, compute its returns
    """
        # debug: shape of the rollout states
        # print(len(rollout.states))
        # print(rollout.states[0].shape)
        batch_si = np.asarray(rollout.states)
        batch_a = np.asarray(rollout.actions)
        rewards = np.asarray(rollout.rewards)
        time = np.asarray(rollout.time)
        meta = np.asarray(rollout.meta)
        rewards_plus_v = np.asarray(rollout.rewards + [rollout.r])
        batch_r = util.discount(rewards_plus_v, gamma, time)
        features = rollout.features[0]

        return util.Batch(si=batch_si,
                          a=batch_a,
                          adv=None,
                          r=batch_r,
                          terminal=rollout.terminal,
                          features=features,
                          reward=rewards,
                          step=time,
                          meta=meta)
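
Unlike Example 1, this variant passes the per-step time array into util.discount, which hints at a time-aware discount (gamma raised to elapsed time rather than to a fixed step count). Below is a minimal illustrative sketch under that assumption; the array-length convention and the real util.discount(x, gamma, time) may well differ.

import numpy as np

def discount_with_time(x, gamma, dt):
    # backward discounted sum where the contribution of later steps to y[i]
    # is discounted by gamma**dt[i] instead of a flat gamma; dt is assumed
    # to hold per-step elapsed times and to have the same length as x
    y = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + (gamma ** dt[i]) * running
        y[i] = running
    return y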