Ejemplo n.º 1
0
 def cal_gae_adv(self, lambda_, gamma, normalize=False):
     '''
     Compute the GAE (Generalized Advantage Estimation) advantages and
     store them in self.buffer['gae_adv'].

     adv = td(s) + gamma * lambda * (1 - done) * td(s')

     param lambda_: GAE lambda; multiplied by gamma to form the discount
                    factor handed to discounted_sum
     param gamma: discount factor
     param normalize: if truthy, the advantages are passed through
                      standardization() before being stored; otherwise
                      they are stored as computed
     Requires self.buffer to already contain 'td_error' and 'done'.
     '''
     assert 'td_error' in self.buffer.keys()
     # Discount the TD errors with factor (lambda * gamma), starting from 0
     # and passing the 'done' flags along to discounted_sum.
     adv = np.asarray(
         discounted_sum(self.buffer['td_error'], lambda_ * gamma, 0,
                        self.buffer['done']))
     if normalize:
         adv = standardization(adv)
     # Store the result as-is: standardization happens only when requested
     # above (previously it was applied unconditionally here, which ignored
     # normalize=False and standardized twice when normalize=True).
     self.buffer['gae_adv'] = list(adv)
Ejemplo n.º 2
0
 def cal_gae_adv(self, lambda_, gamma, normalize=False):
     '''
     Compute the GAE (Generalized Advantage Estimation) advantages and
     store them in self.data_buffer['gae_adv'].

     adv = td(s) + gamma * lambda * (1 - done) * td(s')

     param lambda_: GAE lambda; multiplied by gamma to form the discount
                    factor handed to discounted_sum
     param gamma: discount factor
     param normalize: if truthy, the advantages are passed through
                      standardization() before being stored; otherwise
                      they are stored as computed
     Requires self.data_buffer to already contain 'td_error' and 'done'.
     '''
     assert 'td_error' in self.data_buffer.keys(
     ), "assert 'td_error' in self.data_buffer.keys()"
     # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
     # Eq (10): delta_t = Rt + gamma*V_{t+1} - V_t
     # Eq (16): batch_adv_t = delta_t + (gamma*lambda)*delta_{t+1}
     #                        + (gamma*lambda)^2*delta_{t+2} + ...
     adv = np.asarray(
         discounted_sum(self.data_buffer['td_error'], lambda_ * gamma, 0,
                        self.data_buffer['done']))
     if normalize:
         adv = standardization(adv)
     # Store the result as-is: standardization happens only when requested
     # above (previously it was applied unconditionally here, which ignored
     # normalize=False and standardized twice when normalize=True).
     self.data_buffer['gae_adv'] = list(adv)
Ejemplo n.º 3
0
 def cal_dc_r(self, gamma, init_value, normalize=False):
     '''
     Compute the discounted rewards and store them in
     self.buffer['discounted_reward'].

     param gamma: discount factor, gamma in [0, 1)
     param init_value: value of the last state of the sequence
     param normalize: if truthy, the discounted rewards are converted to
                      a numpy array and passed through standardization()
                      before being stored
     '''
     rewards = self.buffer['r']
     dones = self.buffer['done']
     returns = discounted_sum(rewards, gamma, init_value, dones)
     if normalize:
         as_array = np.asarray(returns)
         returns = standardization(as_array)
     self.buffer['discounted_reward'] = list(returns)