Code example #1
    def test_agent(self):
        # Roll out the vectorized test env with deterministic actions until
        # `num_test_episodes` episodes have finished, logging their stats.
        o = self.test_env.reset()
        finished = 0
        while True:
            # Take deterministic actions at test time
            a = self.ac.act(np2tentor(o), deterministic=True)
            o, r, d, infos = self.test_env.step(tensor2np(a))
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    logger.logkv_mean('eprewmean', maybeepinfo['r'])
                    logger.logkv_mean('eplenmean', maybeepinfo['l'])
                    finished += 1
                    if finished == self.num_test_episodes:
                        return
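The evaluation loop above (and the training code below) relies on two small conversion helpers, np2tentor and tensor2np, whose definitions are not shown; they are also used later on integer index arrays. A minimal sketch of what they might look like, assuming a single global device and float32 networks (this is an assumption, not the repository's actual implementation):

import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def np2tentor(x):
    # NumPy array -> torch tensor on the training device (floats cast to float32,
    # integer index arrays kept as integers).
    x = np.asarray(x)
    if np.issubdtype(x.dtype, np.floating):
        x = x.astype(np.float32)
    return torch.as_tensor(x, device=device)

def tensor2np(x):
    # torch tensor -> NumPy array for the vectorized env's step().
    return x.detach().cpu().numpy()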
Code example #2
    def update(self):
        for _ in range(self.update_every):
            self.optimizer.zero_grad()

            data = self.buffer.sample_batch(self.batch_size)
            o, a, r, o2, d = (data['obs'], data['act'], data['rew'],
                              data['obs2'], data['done'])

            # compute loss q
            q1 = self.ac.q1(o, a)
            q2 = self.ac.q2(o, a)

            with torch.no_grad():
                # Soft Bellman backup: r + gamma * (min(Q1', Q2') - alpha * log pi(a'|o')),
                # with the bootstrap term zeroed on terminal transitions.
                a2, logpa2 = self.ac.step(o2)
                q1_targ = self.ac_targ.q1(o2, a2)
                q2_targ = self.ac_targ.q2(o2, a2)
                q_targ = torch.min(q1_targ, q2_targ)
                backup = r + (self.gamma *
                              (q_targ - self.alpha * logpa2)).masked_fill(d, 0.)
            loss_q1 = torch.mean(torch.square(q1 - backup))
            loss_q2 = torch.mean(torch.square(q2 - backup))
            loss_q = loss_q1 + loss_q2
            loss_q.backward()
            # compute loss pi
            # Freeze the Q-networks so the policy loss only updates the actor.
            for p in self.ac.q1.parameters():
                p.requires_grad = False
            for p in self.ac.q2.parameters():
                p.requires_grad = False
            pi, logp_pi = self.ac.step(o)
            q1_pi = self.ac.q1(o, pi)
            q2_pi = self.ac.q2(o, pi)
            q_pi = torch.min(q1_pi, q2_pi)
            loss_pi = torch.mean((self.alpha * logp_pi - q_pi))
            loss_pi.backward()
            self.optimizer.step()
            for p in self.ac.q1.parameters():
                p.requires_grad = True
            for p in self.ac.q2.parameters():
                p.requires_grad = True
            logger.logkv_mean('loss_q', loss_q.item())
            logger.logkv_mean('loss_pi', loss_pi.item())
            # Polyak-average the target networks towards the online networks.
            with torch.no_grad():
                for p, p_targ in zip(self.ac.parameters(),
                                     self.ac_targ.parameters()):
                    p_targ.data.mul_(self.polyak)
                    p_targ.data.add_((1 - self.polyak) * p.data)
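For non-terminal transitions the target computed under torch.no_grad() is the SAC soft Bellman backup r + gamma * (min(Q1', Q2') - alpha * log pi(a2|o2)); masked_fill(d, 0.) drops the bootstrap term on terminal ones. The final loop keeps the target networks as an exponential moving average of the online networks. A self-contained sketch of that soft update, using a toy pair of identically-shaped modules rather than the actual actor-critic classes:

import torch
import torch.nn as nn

def soft_update(online: nn.Module, target: nn.Module, polyak: float = 0.995):
    # target <- polyak * target + (1 - polyak) * online, parameter by parameter.
    with torch.no_grad():
        for p, p_targ in zip(online.parameters(), target.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)

# Example: two identically-shaped networks that start from the same weights.
online = nn.Linear(4, 2)
target = nn.Linear(4, 2)
target.load_state_dict(online.state_dict())
soft_update(online, target, polyak=0.995)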
Code example #3
    def train(self):

        o = self.train_env.reset()
        first_tstart = time.perf_counter()
        for _epoch in range(self._epoch, self.total_epoch):
            tstart = time.perf_counter()
            for _t in range(self.nsteps):

                if self._t > self.start_steps:
                    # After the initial exploration phase, act with the current policy.
                    a = self.ac.act(np2tentor(o))
                    a = action4env(a)
                else:
                    # Warm-up: sample uniform random actions from each env's action space.
                    a = np.concatenate([
                        self.train_env.action_space.sample().reshape(1, -1)
                        for _ in range(self.nenv)
                    ], axis=0)
                o2, r, d, infos = self.train_env.step(a)
                self.buffer.store(o, a, r, o2, d)
                o = o2

                for info in infos:
                    maybeepinfo = info.get('episode')
                    if maybeepinfo:
                        logger.logkv_mean('eprewtrain', maybeepinfo['r'])
                        logger.logkv_mean('eplentrain', maybeepinfo['l'])

                self._t += 1
                # Start learning after `update_after` steps, then run an update
                # phase every `update_every` steps.
                if self._t >= self.update_after and self._t % self.update_every == 0:
                    self.update()
                if self._t > self.n_timesteps:
                    break

            fps = int((_t + 1) / (time.perf_counter() - tstart))

            if (_epoch % self.log_freq == 0 or _epoch == self.total_epoch - 1):
                self.test_agent()
                logger.logkv('epoch', _epoch)
                logger.logkv('lr', self.optimizer.param_groups[0]['lr'])
                logger.logkv('timesteps', self._t)
                logger.logkv('fps', fps)
                logger.logkv('time_elapsed',
                             time.perf_counter() - first_tstart)
                logger.dump_tabular()
                self._epoch = _epoch
                # self.save_model()
            self.lr_scheduler.step()
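Two scheduling details in this loop are easy to miss: the first start_steps counted steps use uniformly random actions to seed the replay buffer with diverse data, and update() only starts after update_after steps, is then called every update_every steps, and itself performs update_every gradient steps, so on average one gradient step is taken per counted environment step. The helper action4env is also not shown; a hypothetical sketch, assuming the actor outputs a batched tensor that only needs to be converted to NumPy and rescaled for the env (act_scale is an assumed parameter here):

def action4env(a, act_scale=1.0):
    # Hypothetical helper: move the policy output back to NumPy (reusing
    # tensor2np from the sketch above) and rescale it to the env's action range.
    return tensor2np(a) * act_scale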
Code example #4
    def update(self):
        for _ in range(self.update_every):
            self.total_it += 1
            self.optimizer.zero_grad()

            data = self.buffer.sample_batch(self.batch_size)
            o, a, r, o2, d = (data['obs'], data['act'], data['rew'],
                              data['obs2'], data['done'])

            # update q
            q1 = self.ac.q1(o, a)
            q2 = self.ac.q2(o, a)
            with torch.no_grad():
                a_next, _ = self.ac_targ.step(o2)
                # Target policy smoothing: perturb the target action with
                # clipped Gaussian noise before evaluating the target critics.
                noise = torch.clamp(
                    torch.normal(0., 0.2,
                                 (self.batch_size,
                                  self.train_env.action_space.shape[-1])).cuda(),
                    -0.5, 0.5)
                a_next = torch.clamp(a_next + noise, self.act_range[0],
                                     self.act_range[1]) * self.act_scale
                # a_next = torch.clamp(a_next + torch.clamp(self.act_update_noise, -0.5, 0.5),
                #                      self.act_range[0], self.act_range[1]) * self.act_scale
                q1_targ = self.ac_targ.q1(o2, a_next)
                q2_targ = self.ac_targ.q2(o2, a_next)
                q_targ = torch.min(q1_targ, q2_targ)
                backup = r + (self.gamma * q_targ).masked_fill(d, 0.)
            loss_q1 = torch.mean(torch.square(q1 - backup))
            loss_q2 = torch.mean(torch.square(q2 - backup))
            loss_q = loss_q1 + loss_q2
            loss_q.backward()

            # update pi (delayed policy updates: the actor, the optimizer step and
            # the target networks are only touched every `policy_delay` iterations)
            if self.total_it % self.policy_delay == 0:
                for p in self.ac.q1.parameters():
                    p.requires_grad = False
                for p in self.ac.q2.parameters():
                    p.requires_grad = False
                pi, logp_pi = self.ac.step(o)
                q1_pi = self.ac.q1(o, pi)
                loss_pi = torch.mean(-q1_pi)
                loss_pi.backward()
                self.optimizer.step()

                for p in self.ac.q1.parameters():
                    p.requires_grad = True
                for p in self.ac.q2.parameters():
                    p.requires_grad = True
                logger.logkv_mean('loss_q', loss_q.item())
                logger.logkv_mean('loss_pi', loss_pi.item())
                with torch.no_grad():
                    for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()):
                        p_targ.data.mul_(self.polyak)
                        p_targ.data.add_((1 - self.polyak) * p.data)
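The two TD3-specific ingredients above are target policy smoothing and delayed policy updates: clipped Gaussian noise (std 0.2, clipped to ±0.5) is added to the target action before the target critics are evaluated, and the actor, the optimizer step and the target-network update run only every policy_delay iterations. A minimal, self-contained sketch of the smoothing step alone; act_low/act_high stand in for self.act_range and the fake action batch is just for illustration:

import torch

def smoothed_target_action(a_next, act_low, act_high,
                           noise_std=0.2, noise_clip=0.5):
    # Add clipped Gaussian noise to the target action, then clip the result
    # back into the valid action range (TD3 target policy smoothing).
    noise = torch.clamp(noise_std * torch.randn_like(a_next),
                        -noise_clip, noise_clip)
    return torch.clamp(a_next + noise, act_low, act_high)

a_next = torch.tanh(torch.randn(32, 6))        # fake batch of target actions
a_smoothed = smoothed_target_action(a_next, -1.0, 1.0)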
Code example #5
    def update(self, obs, returns, masks, actions, values, neglogpacs,
               clip_ratio, states):
        # Recurrent policies (states is not None) are not handled by this update.
        if states is None:
            for _ in range(self.noptepochs):
                permutation = np.random.permutation(self.nbatch)
                for start in range(0, self.nbatch, self.nbatch_train):
                    end = start + self.nbatch_train
                    mbinds = permutation[start:end]
                    mbinds = np2tentor(mbinds)
                    obs_mnb, returns_mnb, masks_mnb, actions_mnb, values_mnb, neglogpacs_mnb = (
                        arr.index_select(0, mbinds)
                        for arr in (obs, returns, masks, actions, values, neglogpacs))
                    # Advantages, normalized within the minibatch.
                    advs_mnb = returns_mnb - values_mnb
                    advs_mnb = (advs_mnb - advs_mnb.mean()) / (advs_mnb.std() + 1e-8)
                    dist, v_now = self.ac(obs_mnb)
                    neglogpacs_now = self.ac.neglogprob(dist, actions_mnb)
                    entropy = dist.entropy().mean()

                    # PPO2-style clipped value loss: penalize value predictions that
                    # move too far from the values recorded at rollout time.
                    clip_v = values_mnb + torch.clamp(v_now - values_mnb,
                                                      -clip_ratio, clip_ratio)
                    vf_loss1 = torch.square(v_now - returns_mnb)
                    vf_loss2 = torch.square(clip_v - returns_mnb)
                    vf_loss = 0.5 * torch.mean(torch.max(vf_loss1, vf_loss2))

                    # Probability ratio pi_new / pi_old, recovered from the stored
                    # negative log-probabilities.
                    ratio = torch.exp(neglogpacs_mnb - neglogpacs_now)
                    # Clipped surrogate objective (the terms are negated advantages,
                    # hence the element-wise max).
                    pg_losses1 = -advs_mnb * ratio
                    pg_losses2 = -advs_mnb * torch.clamp(
                        ratio, 1 - clip_ratio, 1 + clip_ratio)
                    pg_losses = torch.mean(torch.max(pg_losses1, pg_losses2))
                    with torch.no_grad():
                        approxkl = torch.mean(neglogpacs_now - neglogpacs_mnb)
                        clipfrac = torch.mean(
                            (torch.abs(ratio - 1.) > clip_ratio).float())

                    loss = pg_losses - entropy * self.ent_coef + vf_loss * self.vf_coef
                    self.optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.ac.parameters(),
                                                   self.max_grad_norm)
                    self.optimizer.step()

                    logger.logkv_mean('total_loss', loss.item())
                    logger.logkv_mean('pg_loss', pg_losses.item())
                    logger.logkv_mean('entropy', entropy.item())
                    logger.logkv_mean('vf_loss', vf_loss.item())
                    logger.logkv_mean('approxkl', approxkl.item())
                    logger.logkv_mean('clipfrac', clipfrac.item())
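Because the rollout stores negative log-probabilities, the importance ratio pi_new / pi_old is exp(neglogp_old - neglogp_new), and the clipped surrogate takes the element-wise maximum of the unclipped and clipped terms (maximum rather than minimum because the terms are negated advantages). A toy example with hand-picked numbers, independent of the class above:

import torch

clip_ratio = 0.2
advs = torch.tensor([1.0, -1.0])
neglogp_old = torch.tensor([1.0, 1.0])
neglogp_new = torch.tensor([0.7, 1.4])

ratio = torch.exp(neglogp_old - neglogp_new)   # ~[1.35, 0.67]
unclipped = -advs * ratio
clipped = -advs * torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio)
# The first sample's gain is capped at ratio 1 + clip_ratio; for the second,
# the max picks the clipped (more pessimistic) term.
pg_loss = torch.mean(torch.max(unclipped, clipped))
print(pg_loss)  # ~-0.2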