Code Example #1
 def calc_nstep_advs_v_targets(self, batch):
     '''
     Calculate the N-step return advantage: adv = nstep_returns - v_pred
     See n-step advantage under http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_5_actor_critic_pdf.pdf
     Used for training with N-step (not GAE)
     Returns 2-tuple for API-consistency with GAE
     '''
     next_v_preds = self.calc_v(batch['next_states'])
     v_preds = self.calc_v(batch['states'])
     # v targets = r_t + gamma * V(s_(t+1))
     v_targets = math_util.calc_nstep_returns(batch, self.gamma, 1,
                                              next_v_preds)
     nstep_returns = math_util.calc_nstep_returns(batch, self.gamma,
                                                  self.num_step_returns,
                                                  next_v_preds)
     nstep_advs = nstep_returns - v_preds
     adv_targets = nstep_advs
     logger.debug(f'adv_targets: {adv_targets}\nv_targets: {v_targets}')
     return adv_targets, v_targets
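The helper math_util.calc_nstep_returns is external to these examples. For orientation, here is a minimal sketch of what an n-step return computation generally looks like, assuming the standard formula; nstep_returns_sketch and its argument layout are hypothetical and not necessarily SLM-Lab's actual implementation:

import torch

def nstep_returns_sketch(rewards, dones, next_v_preds, gamma, n):
    '''
    Sketch of the n-step return
        R_t = r_t + gamma*r_(t+1) + ... + gamma^(k-1)*r_(t+k-1) + gamma^k * V(s_(t+k))
    with k <= n, truncating early at episode ends (dones) or the end of the batch.
    rewards and dones are 1-D tensors of length T; next_v_preds[t] estimates V(s_(t+1)).
    '''
    T = len(rewards)
    rets = torch.zeros_like(rewards)
    for t in range(T):
        ret, discount, done_hit = 0.0, 1.0, False
        last = min(t + n, T)  # one past the last reward index in the window
        for k in range(t, last):
            ret = ret + discount * rewards[k]
            discount = discount * gamma
            if dones[k]:  # episode ended inside the window: no bootstrap
                done_hit = True
                break
        if not done_hit:
            ret = ret + discount * next_v_preds[last - 1]  # bootstrap with V(s_(t+k))
        rets[t] = ret
    return rets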
Code Example #2
 def calc_td_advs_v_targets(self, batch):
     '''
     Estimate Q(s_t, a_t) with r_t + gamma * V(s_(t+1)) for the simplest AC algorithm
     '''
     next_v_preds = self.calc_v(batch['next_states'])
     # Equivalent to 1-step return
     # v targets = r_t + gamma * V(s_(t+1))
     v_targets = math_util.calc_nstep_returns(batch, self.gamma, 1,
                                              next_v_preds)
     adv_targets = v_targets  # Plain Q estimate, called adv for API consistency
     logger.debug(f'adv_targets: {adv_targets}\nv_targets: {v_targets}')
     return adv_targets, v_targets
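With n = 1 the target collapses to the one-step TD target. A hedged, vectorized sketch, assuming rewards, dones, and next_v_preds are aligned 1-D tensors (the real helper may handle episode termination differently):

# Hypothetical 1-step TD target: v_targets[t] = r_t + gamma * (1 - done_t) * V(s_(t+1))
def td_targets_sketch(rewards, dones, next_v_preds, gamma):
    return rewards + gamma * (1 - dones) * next_v_preds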
Code Example #3
 def calc_gae_advs_v_targets(self, batch):
     '''
     Calculate the GAE advantages and value targets for training actor and critic respectively
     adv_targets = GAE (see the math_util method)
     v_targets = r_t + gamma * V(s_(t+1)), i.e. the 1-step return
     before output, adv_targets is standardized (so v_targets uses the unstandardized version)
     Used for training with GAE
     '''
     states = torch.cat((batch['states'], batch['next_states'][-1:]),
                        dim=0)  # prevent double-pass
     v_preds = self.calc_v(states)
     next_v_preds = v_preds[1:]  # shift for only the next states
     # v_target = r_t + gamma * V(s_(t+1)), i.e. 1-step return
     v_targets = math_util.calc_nstep_returns(batch['rewards'],
                                              batch['dones'], self.gamma, 1,
                                              next_v_preds)
     adv_targets = math_util.calc_gaes(batch['rewards'], batch['dones'],
                                       v_preds, self.gamma, self.lam)
     adv_targets = math_util.standardize(adv_targets)
     logger.debug(f'adv_targets: {adv_targets}\nv_targets: {v_targets}')
     return adv_targets, v_targets
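math_util.calc_gaes is likewise not shown. A minimal sketch of the standard GAE recursion (Schulman et al., 2015), assuming v_preds carries T + 1 values as in the concatenated states tensor above; gae_sketch is a hypothetical name, not necessarily SLM-Lab's implementation:

import torch

def gae_sketch(rewards, dones, v_preds, gamma, lam):
    '''
    Standard GAE recursion, sketched:
        delta_t = r_t + gamma * (1 - done_t) * V(s_(t+1)) - V(s_t)
        A_t     = delta_t + gamma * lam * (1 - done_t) * A_(t+1)
    Assumes v_preds has T + 1 entries (values for all states plus the final next_state).
    '''
    T = len(rewards)
    advs = torch.zeros_like(rewards)
    future_adv = 0.0
    not_dones = 1 - dones
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * not_dones[t] * v_preds[t + 1] - v_preds[t]
        future_adv = delta + gamma * lam * not_dones[t] * future_adv
        advs[t] = future_adv
    return advs

The single backward pass keeps the computation O(T), and zeroing the carried term at done steps prevents advantages from leaking across episode boundaries.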
Code Example #4
File: actor_critic.py  Project: colllin/SLM-Lab
 def calc_nstep_advs_v_targets(self, batch, v_preds):
     '''
     Calculate N-step returns, with advs = nstep_rets - v_preds and v_targets = nstep_rets
     See n-step advantage under http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_5_actor_critic_pdf.pdf
     '''
     next_states = batch['next_states'][-1]
     if not self.body.env.is_venv:
         next_states = next_states.unsqueeze(dim=0)
     with torch.no_grad():
         next_v_pred = self.calc_v(next_states, use_cache=False)
     v_preds = v_preds.detach()  # adv does not accumulate grad
     if self.body.env.is_venv:
         v_preds = math_util.venv_pack(v_preds, self.body.env.num_envs)
     nstep_rets = math_util.calc_nstep_returns(batch['rewards'], batch['dones'], next_v_pred, self.gamma, self.num_step_returns)
     advs = nstep_rets - v_preds
     v_targets = nstep_rets
     if self.body.env.is_venv:
         advs = math_util.venv_unpack(advs)
         v_targets = math_util.venv_unpack(v_targets)
     logger.debug(f'advs: {advs}\nv_targets: {v_targets}')
     return advs, v_targets
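venv_pack and venv_unpack appear to convert between the flat (T * num_envs, ...) layout used for the loss and a (T, num_envs, ...) layout so the return calculation runs per environment. A hypothetical sketch of that reshape (names and exact behavior are assumptions, not SLM-Lab's API):

# Hypothetical stand-ins for math_util.venv_pack / venv_unpack
def venv_pack_sketch(flat_tensor, num_envs):
    # flat (T * num_envs, ...) -> stacked (T, num_envs, ...)
    return flat_tensor.reshape(-1, num_envs, *flat_tensor.shape[1:])

def venv_unpack_sketch(packed_tensor):
    # stacked (T, num_envs, ...) -> flat (T * num_envs, ...)
    return packed_tensor.reshape(-1, *packed_tensor.shape[2:])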
Code Example #5
 def calc_gae_advs_v_targets(self, batch):
     '''
     Calculate the GAE advantages and value targets for training actor and critic respectively
     adv_targets = GAE (see the math_util method)
     v_targets = r_t + gamma * V(s_(t+1)), i.e. the 1-step return
     before output, adv_targets is standardized (so v_targets uses the unstandardized version)
     Used for training with GAE
     '''
     v_preds = self.calc_v(batch['states'])
     # calc next_state boundary value and concat with above for efficiency
     next_v_pred_tail = self.calc_v(batch['next_states'][-1:])
     next_v_preds = torch.cat([v_preds[1:], next_v_pred_tail], dim=0)
     # v targets = r_t + gamma * V(s_(t+1))
     v_targets = math_util.calc_nstep_returns(batch, self.gamma, 1,
                                              next_v_preds)
     # ensure val for next_state is 0 at done
     next_v_preds = next_v_preds * (1 - batch['dones'])
     adv_targets = math_util.calc_gaes(batch['rewards'], v_preds,
                                       next_v_preds, self.gamma, self.lam)
     adv_targets = math_util.standardize(adv_targets)
     logger.debug(f'adv_targets: {adv_targets}\nv_targets: {v_targets}')
     return adv_targets, v_targets
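math_util.standardize is assumed here to normalize the advantages to zero mean and unit variance, which keeps the scale of the policy-gradient loss comparable across batches; a hypothetical sketch:

def standardize_sketch(adv, eps=1e-8):
    # Assumed behavior: zero-mean, unit-variance advantages
    return (adv - adv.mean()) / (adv.std() + eps)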