def test_agent(self):
    for j in range(self.num_test_episodes):
        o = self.test_env.reset()
        i = 0
        while True:
            # Take deterministic actions at test time.
            a = self.ac.act(np2tentor(o), deterministic=True)
            o, r, d, infos = self.test_env.step(tensor2np(a))
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    logger.logkv_mean('eprewmean', maybeepinfo['r'])
                    logger.logkv_mean('eplenmean', maybeepinfo['l'])
                    i += 1
            # Return once 10 episodes have finished across the vectorized
            # test env (regardless of num_test_episodes). Using >= instead
            # of == avoids an infinite loop when several sub-envs finish
            # on the same step and the counter skips past 10.
            if i >= 10:
                return
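# The evaluation loop above assumes `np2tentor` / `tensor2np` conversion
# helpers (the names are taken from the code; the bodies below are a
# minimal sketch assuming a module-level `device`, not the repo's actual
# implementation). Integer dtypes are preserved because np2tentor is also
# used to build index tensors for index_select in the PPO update below.
import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def np2tentor(x):
    # Convert a NumPy batch to a tensor on the training device,
    # keeping integer arrays as int64 and defaulting floats to float32.
    x = np.asarray(x)
    dtype = torch.int64 if np.issubdtype(x.dtype, np.integer) else torch.float32
    return torch.as_tensor(x, dtype=dtype, device=device)

def tensor2np(x):
    # Move a tensor back to a NumPy array for the (vectorized) env API.
    return x.detach().cpu().numpy()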
def update(self):
    for _ in range(self.update_every):
        self.optimizer.zero_grad()
        data = self.buffer.sample_batch(self.batch_size)
        o, a, r, o2, d = (data['obs'], data['act'], data['rew'],
                          data['obs2'], data['done'])

        # Compute the Q losses against the entropy-regularized SAC target:
        # backup = r + gamma * (min_i Q_targ_i(o2, a2) - alpha * log pi(a2|o2)),
        # with the bootstrap term zeroed on terminal transitions.
        q1 = self.ac.q1(o, a)
        q2 = self.ac.q2(o, a)
        with torch.no_grad():
            a2, logpa2 = self.ac.step(o2)
            q1_targ = self.ac_targ.q1(o2, a2)
            q2_targ = self.ac_targ.q2(o2, a2)
            q_targ = torch.min(q1_targ, q2_targ)
            backup = r + (self.gamma *
                          (q_targ - self.alpha * logpa2)).masked_fill(d, 0.)
        loss_q1 = torch.mean(torch.square(q1 - backup))
        loss_q2 = torch.mean(torch.square(q2 - backup))
        loss_q = loss_q1 + loss_q2
        loss_q.backward()

        # Compute the policy loss. Freeze the Q-networks first so the
        # shared optimizer step does not push policy-loss gradients
        # into them.
        for p in self.ac.q1.parameters():
            p.requires_grad = False
        for p in self.ac.q2.parameters():
            p.requires_grad = False
        pi, logp_pi = self.ac.step(o)
        q1_pi = self.ac.q1(o, pi)
        q2_pi = self.ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)
        loss_pi = torch.mean(self.alpha * logp_pi - q_pi)
        loss_pi.backward()
        self.optimizer.step()
        for p in self.ac.q1.parameters():
            p.requires_grad = True
        for p in self.ac.q2.parameters():
            p.requires_grad = True

        logger.logkv_mean('loss_q', loss_q.item())
        logger.logkv_mean('loss_pi', loss_pi.item())

        # Polyak-average the target networks toward the online networks.
        with torch.no_grad():
            for p, p_targ in zip(self.ac.parameters(),
                                 self.ac_targ.parameters()):
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)
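# The SAC update above drives both losses through a single shared
# optimizer, which is why the Q-networks must be frozen while loss_pi is
# backpropagated. A minimal sketch of how such an optimizer could be
# built; the constructor name and the `ac.pi` attribute are assumptions,
# not the repo's actual code.
import itertools
import torch

def make_shared_optimizer(ac, lr=3e-4):
    # One Adam instance over the policy and both Q-networks. Freezing
    # q1/q2 during the pi backward pass keeps their grads untouched, so
    # a single step() applies only the intended gradients to each part.
    params = itertools.chain(ac.pi.parameters(),
                             ac.q1.parameters(),
                             ac.q2.parameters())
    return torch.optim.Adam(params, lr=lr)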
def train(self):
    o = self.train_env.reset()
    first_tstart = time.perf_counter()
    for _epoch in range(self._epoch, self.total_epoch):
        tstart = time.perf_counter()
        for _t in range(self.nsteps):
            # After the initial uniform-exploration phase, act with the
            # current policy; before that, sample random actions.
            if self._t > self.start_steps:
                a = self.ac.act(np2tentor(o))
                a = action4env(a)
            else:
                a = np.concatenate([
                    self.train_env.action_space.sample().reshape(1, -1)
                    for _ in range(self.nenv)
                ], axis=0)
            o2, r, d, infos = self.train_env.step(a)
            self.buffer.store(o, a, r, o2, d)
            o = o2
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    logger.logkv_mean('eprewtrain', maybeepinfo['r'])
                    logger.logkv_mean('eplentrain', maybeepinfo['l'])
            self._t += 1
            if self._t >= self.update_after and self._t % self.update_every == 0:
                self.update()
            # Note: this only ends the current epoch's rollout early;
            # the outer epoch loop still runs to completion.
            if self._t > self.n_timesteps:
                break
        fps = int((_t + 1) / (time.perf_counter() - tstart))
        if _epoch % self.log_freq == 0 or _epoch == self.total_epoch - 1:
            self.test_agent()
            logger.logkv('epoch', _epoch)
            logger.logkv('lr', self.optimizer.param_groups[0]['lr'])
            logger.logkv('timesteps', self._t)
            logger.logkv('fps', fps)
            logger.logkv('time_elapsed', time.perf_counter() - first_tstart)
            logger.dump_tabular()
        self._epoch = _epoch
        # self.save_model()
        self.lr_scheduler.step()
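# The rollout loop above relies on a replay buffer exposing `store` and
# `sample_batch`. A minimal sketch under the assumptions of flat float32
# observations and a vectorized env of `nenv` copies; the repo's actual
# buffer may differ in layout and dtypes.
import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class ReplayBuffer:
    def __init__(self, obs_dim, act_dim, size):
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.act = np.zeros((size, act_dim), dtype=np.float32)
        self.rew = np.zeros((size, 1), dtype=np.float32)
        self.obs2 = np.zeros((size, obs_dim), dtype=np.float32)
        self.done = np.zeros((size, 1), dtype=bool)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, o, a, r, o2, d):
        # Store one vectorized-env step (nenv transitions) at once,
        # wrapping around the ring buffer.
        n = len(o)
        idx = (self.ptr + np.arange(n)) % self.max_size
        self.obs[idx], self.act[idx] = o, a
        self.rew[idx, 0], self.obs2[idx], self.done[idx, 0] = r, o2, d
        self.ptr = (self.ptr + n) % self.max_size
        self.size = min(self.size + n, self.max_size)

    def sample_batch(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs[idx], act=self.act[idx], rew=self.rew[idx],
                     obs2=self.obs2[idx], done=self.done[idx])
        # `done` stays boolean so masked_fill in update() works directly.
        return {k: torch.as_tensor(v, device=device) for k, v in batch.items()}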
def update(self):
    for _ in range(self.update_every):
        self.total_it += 1
        self.optimizer.zero_grad()
        data = self.buffer.sample_batch(self.batch_size)
        o, a, r, o2, d = (data['obs'], data['act'], data['rew'],
                          data['obs2'], data['done'])

        # Update Q: clipped double-Q target with target policy smoothing.
        q1 = self.ac.q1(o, a)
        q2 = self.ac.q2(o, a)
        with torch.no_grad():
            a_next, logprob = self.ac_targ.step(o2)  # logprob unused in TD3
            # Target policy smoothing: perturb the target action with
            # clipped Gaussian noise before clamping to the action range.
            noise = torch.clamp(
                torch.normal(0., 0.2, (self.batch_size,
                                       self.train_env.action_space.shape[-1])).cuda(),
                -0.5, 0.5)
            a_next = torch.clamp(a_next + noise, self.act_range[0],
                                 self.act_range[1]) * self.act_scale
            # a_next = torch.clamp(a_next + torch.clamp(self.act_update_noise, -0.5, 0.5),
            #                      self.act_range[0], self.act_range[1]) * self.act_scale
            q1_targ = self.ac_targ.q1(o2, a_next)
            q2_targ = self.ac_targ.q2(o2, a_next)
            q_targ = torch.min(q1_targ, q2_targ)
            # Zero the bootstrap term on terminal transitions.
            backup = r + (self.gamma * q_targ).masked_fill(d, 0.)
        loss_q1 = torch.mean(torch.square(q1 - backup))
        loss_q2 = torch.mean(torch.square(q2 - backup))
        loss_q = loss_q1 + loss_q2
        loss_q.backward()

        # Delayed policy update: the actor and the target networks are
        # refreshed once every `policy_delay` critic updates, while the
        # critic gradients are applied on every iteration.
        if self.total_it % self.policy_delay == 0:
            # Freeze the Q-networks so the policy loss does not push
            # gradients into them through the shared optimizer.
            for p in self.ac.q1.parameters():
                p.requires_grad = False
            for p in self.ac.q2.parameters():
                p.requires_grad = False
            pi, logp_pi = self.ac.step(o)
            q1_pi = self.ac.q1(o, pi)
            loss_pi = torch.mean(-q1_pi)
            loss_pi.backward()
            self.optimizer.step()  # applies both critic and actor gradients
            for p in self.ac.q1.parameters():
                p.requires_grad = True
            for p in self.ac.q2.parameters():
                p.requires_grad = True
            logger.logkv_mean('loss_pi', loss_pi.item())
            # Polyak-average the targets toward the online networks.
            with torch.no_grad():
                for p, p_targ in zip(self.ac.parameters(),
                                     self.ac_targ.parameters()):
                    p_targ.data.mul_(self.polyak)
                    p_targ.data.add_((1 - self.polyak) * p.data)
        else:
            self.optimizer.step()  # critic-only update
        logger.logkv_mean('loss_q', loss_q.item())
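# The TD3 update above assumes a target actor-critic `self.ac_targ` whose
# parameters start as a copy of `self.ac` and are never trained directly,
# only polyak-averaged. A minimal sketch of the usual construction; the
# repo's own initialization may differ.
import copy

def make_target(ac):
    ac_targ = copy.deepcopy(ac)
    # Targets are updated only via the in-place polyak average above,
    # so autograd never needs to track their parameters.
    for p in ac_targ.parameters():
        p.requires_grad = False
    return ac_targ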
def update(self, obs, returns, masks, actions, values, neglogpacs,
           clip_ratio, states):
    # Non-recurrent path only; minibatches are drawn by random permutation.
    if states is None:
        for _ in range(self.noptepochs):
            permutation = np.random.permutation(self.nbatch)
            for start in range(0, self.nbatch, self.nbatch_train):
                end = start + self.nbatch_train
                mbinds = np2tentor(permutation[start:end])
                obs_mnb, returns_mnb, masks_mnb, actions_mnb, values_mnb, neglogpacs_mnb = (
                    arr.index_select(0, mbinds)
                    for arr in (obs, returns, masks, actions, values, neglogpacs))
                # Normalize advantages within the minibatch.
                advs_mnb = returns_mnb - values_mnb
                advs_mnb = (advs_mnb - advs_mnb.mean()) / (advs_mnb.std() + 1e-8)

                dist, v_now = self.ac(obs_mnb)
                neglogpacs_now = self.ac.neglogprob(dist, actions_mnb)
                entropy = dist.entropy().mean()

                # Clipped value loss.
                clip_v = values_mnb + torch.clamp(v_now - values_mnb,
                                                  -clip_ratio, clip_ratio)
                vf_loss1 = torch.square(v_now - returns_mnb)
                vf_loss2 = torch.square(clip_v - returns_mnb)
                vf_loss = 0.5 * torch.mean(torch.max(vf_loss1, vf_loss2))

                # Clipped surrogate policy loss; the ratio is
                # exp(logp_new - logp_old) expressed with neglogps.
                ratio = torch.exp(neglogpacs_mnb - neglogpacs_now)
                pg_losses1 = -advs_mnb * ratio
                pg_losses2 = -advs_mnb * torch.clamp(ratio, 1 - clip_ratio,
                                                     1 + clip_ratio)
                pg_losses = torch.mean(torch.max(pg_losses1, pg_losses2))

                with torch.no_grad():
                    approxkl = torch.mean(neglogpacs_now - neglogpacs_mnb)
                    clipfrac = torch.mean(
                        (torch.abs(ratio - 1.) > clip_ratio).float())

                loss = pg_losses - entropy * self.ent_coef + vf_loss * self.vf_coef
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.ac.parameters(),
                                               self.max_grad_norm)
                self.optimizer.step()

                logger.logkv_mean('total_loss', loss.item())
                logger.logkv_mean('pg_loss', pg_losses.item())
                logger.logkv_mean('entropy', entropy.item())
                logger.logkv_mean('vf_loss', vf_loss.item())
                logger.logkv_mean('approxkl', approxkl.item())
                logger.logkv_mean('clipfrac', clipfrac.item())
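# The PPO update above recovers advantages as `returns - values`, which is
# consistent with `returns` having been computed during rollout collection
# as GAE advantages plus values. A minimal NumPy sketch of that computation;
# the function and variable names are assumptions, not the runner's code.
import numpy as np

def compute_gae_returns(rewards, values, dones, last_value,
                        gamma=0.99, lam=0.95):
    # rewards, values, dones: arrays of shape (nsteps, nenv); dones[t]
    # marks transitions whose next state is terminal. last_value is the
    # value estimate for the state after the final step.
    nsteps = len(rewards)
    advs = np.zeros_like(rewards)
    lastgaelam = 0.
    for t in reversed(range(nsteps)):
        next_value = last_value if t == nsteps - 1 else values[t + 1]
        nonterminal = 1.0 - dones[t]
        # TD residual, then the exponentially weighted GAE recursion.
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        advs[t] = lastgaelam
    # update() re-derives and normalizes (returns - values) per minibatch.
    return advs + values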