def collect(self):
    with torch.no_grad():
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        # Roll out the current policy for nsteps in the (vectorized) training env.
        for _ in range(self.nsteps):
            obs = np2tentor(self.obs)
            actions, values, self.states, neglogpacs = self.ac.step(obs)
            mb_obs.append(obs)
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(np2tentor(self.dones))
            actions = tensor2np(actions)
            self.obs, rewards, self.dones, infos = self.train_env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
            mb_rewards.append(np2tentor(rewards))

        # Stack the per-step lists into (nenv, nsteps, ...) tensors.
        mb_obs = torch.stack(mb_obs, dim=1)
        mb_rewards = torch.stack(mb_rewards, dim=1)
        mb_actions = torch.stack(mb_actions, dim=1)
        mb_values = torch.stack(mb_values, dim=1)
        mb_neglogpacs = torch.stack(mb_neglogpacs, dim=1)
        mb_dones = torch.stack(mb_dones, dim=1)
        last_values = self.ac.value(np2tentor(self.obs))

        # discount/bootstrap off value fn
        mb_advs = mb_rewards.new_zeros(mb_rewards.size())
        lastgaelam = mb_rewards.new_zeros(mb_rewards.size(0))
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextterminal = np2tentor(self.dones)
                nextvalues = last_values
            else:
                nextterminal = mb_dones[:, t + 1]
                nextvalues = mb_values[:, t + 1]
            # Zero out the bootstrap terms where the episode has terminated.
            delta = mb_rewards[:, t] + (self.gamma * nextvalues).masked_fill_(
                nextterminal, 0) - mb_values[:, t]
            mb_advs[:, t] = lastgaelam = delta + (
                self.gamma * self.lam * lastgaelam).masked_fill_(nextterminal, 0)
        mb_returns = mb_advs + mb_values
    return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values,
                        mb_neglogpacs)), mb_states, epinfos)
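# For reference, the backward loop above is generalized advantage estimation (GAE):
#   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lam * (1 - done_{t+1}) * A_{t+1}
# with A_{nsteps} = 0 and V(s_{nsteps}) bootstrapped from self.ac.value on the
# final observation. The masked_fill_(nextterminal, 0) calls implement the
# (1 - done) factors, and returns are recovered as advantages plus value estimates.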
def test_agent(self):
    for j in range(self.num_test_episodes):
        o = self.test_env.reset()
        i = 0
        while True:
            # Take deterministic actions at test time
            a = self.ac.act(np2tentor(o), deterministic=True)
            o, r, d, infos = self.test_env.step(tensor2np(a))
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    logger.logkv_mean('eprewmean', maybeepinfo['r'])
                    logger.logkv_mean('eplenmean', maybeepinfo['l'])
                    i += 1
                    # Stop once 10 completed test episodes have been logged.
                    if i == 10:
                        return
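# The 'episode' entries read above are assumed to come from a Monitor-style
# wrapper (as in OpenAI baselines), which attaches {'r': episode_return,
# 'l': episode_length, 't': wall_time} to info when an episode finishes;
# logkv_mean then averages these values over the completed test episodes.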
def action4env(action):
    if isinstance(action, torch.Tensor):
        return tensor2np(action)
    return action
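# The conversion helpers used throughout this section (np2tentor, tensor2np, sf01)
# are not defined here. Below is a minimal sketch of what they are assumed to do;
# names carry a _sketch suffix to mark them as hypothetical, and the real helpers
# may differ in dtype/device handling.
import numpy as np
import torch

def np2tentor_sketch(x, device='cpu'):
    # numpy array (or list) -> torch.Tensor on the training device
    return torch.as_tensor(np.asarray(x), device=device)

def tensor2np_sketch(t):
    # torch.Tensor -> numpy array, detached and moved to host memory
    return t.detach().cpu().numpy()

def sf01_sketch(t):
    # Swap the first two axes and flatten them: the (nenv, nsteps, ...) rollout
    # tensors produced by collect() become (nenv * nsteps, ...) minibatch arrays.
    s = t.size()
    return t.transpose(0, 1).reshape(s[0] * s[1], *s[2:])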