Example #1
    def collect(self):
        with torch.no_grad():
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
            mb_states = self.states
            epinfos = []
            for _ in range(self.nsteps):
                obs = np2tentor(self.obs)
                actions, values, self.states, neglogpacs = self.ac.step(obs)
                mb_obs.append(obs)
                mb_actions.append(actions)
                mb_values.append(values)
                mb_neglogpacs.append(neglogpacs)
                mb_dones.append(np2tentor(self.dones))
                actions = tensor2np(actions)
                # step the vectorized training env with numpy actions
                self.obs, rewards, self.dones, infos = self.train_env.step(actions)
                for info in infos:
                    maybeepinfo = info.get('episode')
                    if maybeepinfo: epinfos.append(maybeepinfo)
                mb_rewards.append(np2tentor(rewards))
            # stack the per-step lists along a new time dimension: (nenv, nsteps, ...)
            mb_obs = torch.stack(mb_obs, dim=1)
            mb_rewards = torch.stack(mb_rewards, dim=1)
            mb_actions = torch.stack(mb_actions, dim=1)
            mb_values = torch.stack(mb_values, dim=1)
            mb_neglogpacs = torch.stack(mb_neglogpacs, dim=1)
            mb_dones = torch.stack(mb_dones, dim=1)
            last_values = self.ac.value(np2tentor(self.obs))

            # discount/bootstrap off value fn
            mb_advs = mb_rewards.new_zeros(mb_rewards.size())
            lastgaelam = mb_rewards.new_zeros(mb_rewards.size(0))
            for t in reversed(range(self.nsteps)):
                if t == self.nsteps - 1:
                    nextterminal = np2tentor(self.dones)
                    nextvalues = last_values
                else:
                    nextterminal = mb_dones[:, t + 1]
                    nextvalues = mb_values[:, t + 1]
                # zero the bootstrap terms wherever the next state is terminal
                delta = (mb_rewards[:, t]
                         + (self.gamma * nextvalues).masked_fill_(nextterminal, 0)
                         - mb_values[:, t])
                mb_advs[:, t] = lastgaelam = (
                    delta + (self.gamma * self.lam *
                             lastgaelam).masked_fill_(nextterminal, 0))
            mb_returns = mb_advs + mb_values

            return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions,
                                mb_values, mb_neglogpacs)), mb_states, epinfos)
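The delta and advantage updates in collect() above implement Generalized Advantage Estimation (GAE), with terminal states masking out the bootstrapped value. Below is a minimal sketch of the same recurrence for a single, non-vectorized environment, written in plain numpy with illustrative names; it is not taken from the repository.

import numpy as np

def gae(rewards, values, dones, last_value, last_done, gamma=0.99, lam=0.95):
    # rewards, values, dones: length-T arrays for one rollout;
    # last_value / last_done: bootstrap value and done flag after the final step.
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)
    T = len(rewards)
    advs = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        # mask the bootstrap term when the next state starts a new episode
        nextnonterminal = 1.0 - (float(last_done) if t == T - 1 else dones[t + 1])
        nextvalue = last_value if t == T - 1 else values[t + 1]
        delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    returns = advs + values
    return advs, returns

With lam=1 this reduces to a discounted Monte-Carlo advantage; smaller lam trades variance for bias, which is the same role the masked_fill_ terms play in the batched version above.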
Example #2
    def test_agent(self):
        # roll the test env until num_test_episodes episode summaries are logged
        o = self.test_env.reset()
        i = 0
        while True:
            # Take deterministic actions at test time
            a = self.ac.act(np2tentor(o), deterministic=True)
            o, r, d, infos = self.test_env.step(tensor2np(a))
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    logger.logkv_mean('eprewmean', maybeepinfo['r'])
                    logger.logkv_mean('eplenmean', maybeepinfo['l'])
                    i += 1
                    if i == self.num_test_episodes:
                        return
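Both collect() and test_agent() read per-episode return and length from info['episode'], which a Monitor-style wrapper attaches to each underlying env before vectorization (baselines' bench.Monitor plays this role). The sketch below is an assumption about what such a wrapper looks like, using a hypothetical EpisodeStats name and the older four-tuple gym step API that these snippets rely on.

import gym

class EpisodeStats(gym.Wrapper):
    """Attach per-episode return/length under info['episode']."""

    def __init__(self, env):
        super().__init__(env)
        self.eprew = 0.0
        self.eplen = 0

    def reset(self, **kwargs):
        self.eprew, self.eplen = 0.0, 0
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, rew, done, info = self.env.step(action)
        self.eprew += rew
        self.eplen += 1
        if done:
            # summary dict consumed via info.get('episode') in the examples above
            info['episode'] = {'r': self.eprew, 'l': self.eplen}
        return obs, rew, done, info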
Example #3
def action4env(action):
    # Envs expect numpy actions: convert torch tensors, return anything else unchanged.
    if isinstance(action, torch.Tensor):
        return tensor2np(action)
    return action
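The np2tentor and tensor2np helpers used throughout these examples are not shown on this page. A minimal sketch of what such converters could look like, assuming a module-level device variable; this is an illustration, not the repository's actual implementation.

import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def np2tentor(x):
    # wrap a numpy array as a torch tensor on the target device, keeping the
    # dtype (e.g. bool done flags stay bool, as masked_fill_ expects a bool mask)
    return torch.as_tensor(np.asarray(x), device=device)

def tensor2np(x):
    # detach, move to CPU, and return a numpy array for the env
    return x.detach().cpu().numpy()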