def learn(self, D, **kwargs):
    logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_infos('V')) for traj in D]
    last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
    Qs = [bootstrapped_returns(self.config['agent.gamma'], traj.rewards, last_V, traj.reach_terminal)
          for traj, last_V in zip(D, last_Vs)]
    As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj.rewards, V, last_V, traj.reach_terminal)
          for traj, V, last_V in zip(D, Vs, last_Vs)]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                  [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean())/(As.std() + 1e-4)
    assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs*As.detach()
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([traj.T for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -out['entropy_loss']
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
    return out
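
# A minimal sketch (an assumption, not the library's actual helper) of what `gae`
# above is taken to compute: GAE(gamma, lambda) advantages for one trajectory,
# with the tail bootstrapped by `last_V` unless the trajectory reached a terminal
# state. The name `gae_sketch` is hypothetical.
import numpy as np

def gae_sketch(gamma, lam, rewards, Vs, last_V, reach_terminal):
    rewards, Vs = np.asarray(rewards, dtype=float), np.asarray(Vs, dtype=float)
    next_Vs = np.append(Vs[1:], 0.0 if reach_terminal else float(last_V))  # V(s_{t+1})
    deltas = rewards + gamma*next_Vs - Vs                 # TD residuals delta_t
    As, running = np.zeros_like(deltas), 0.0
    for t in reversed(range(len(deltas))):                # A_t = delta_t + gamma*lam*A_{t+1}
        running = deltas[t] + gamma*lam*running
        As[t] = running
    return As
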
def learn_one_update(self, data):
    data = [d.detach().to(self.device) for d in data]
    observations, old_actions, old_logprobs, old_entropies, old_Vs, old_Qs, old_As = data

    action_dist = self.policy(observations)
    logprobs = action_dist.log_prob(old_actions).squeeze()
    entropies = action_dist.entropy().squeeze()
    Vs = self.value(observations).squeeze()
    assert all([x.ndim == 1 for x in [logprobs, entropies, Vs]])

    # Policy update: clipped surrogate objective
    ratio = torch.exp(logprobs - old_logprobs)
    eps = self.config['agent.clip_range']
    policy_loss = -torch.min(ratio*old_As, torch.clamp(ratio, 1.0 - eps, 1.0 + eps)*old_As)
    policy_loss = policy_loss.mean(0)
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_grad_norm = nn.utils.clip_grad_norm_(self.policy.parameters(), self.config['agent.max_grad_norm'])
    self.policy_optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.policy_lr_scheduler.step(self.total_timestep)

    # Value update: clipped value loss
    clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
    value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
                           F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
    value_loss = value_loss.mean(0)
    self.value_optimizer.zero_grad()
    value_loss.backward()
    value_grad_norm = nn.utils.clip_grad_norm_(self.value.parameters(), self.config['agent.max_grad_norm'])
    self.value_optimizer.step()

    out = {}
    out['policy_grad_norm'] = policy_grad_norm
    out['value_grad_norm'] = value_grad_norm
    out['policy_loss'] = policy_loss.item()
    out['policy_entropy'] = entropies.mean().item()
    out['value_loss'] = value_loss.item()
    out['explained_variance'] = ev(y_true=numpify(old_Qs, 'float'), y_pred=numpify(Vs, 'float'))
    out['approx_kl'] = (old_logprobs - logprobs).mean(0).item()
    out['clip_frac'] = ((ratio < 1.0 - eps) | (ratio > 1.0 + eps)).float().mean(0).item()
    return out
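
# A hedged usage sketch of how `learn_one_update` above might be driven; the
# surrounding `learn` method is not shown in this listing, so the epoch loop and
# the names `ppo_epochs_sketch`, `num_epochs`, `batch_size` are assumptions, not
# the source's API.
from torch.utils.data import DataLoader, TensorDataset

def ppo_epochs_sketch(agent, observations, actions, logprobs, entropies, Vs, Qs, As,
                      num_epochs=10, batch_size=64):
    dataset = TensorDataset(observations, actions, logprobs, entropies, Vs, Qs, As)
    logs = []
    for _ in range(num_epochs):
        # fresh random minibatch order each epoch
        for batch in DataLoader(dataset, batch_size=batch_size, shuffle=True):
            logs.append(agent.learn_one_update(batch))
    return logs
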
def learn_one_update(self, data):
    data = [d.to(self.device) for d in data]
    observations, old_actions, old_logprobs, old_entropies, old_Vs, old_Qs, old_As = data

    out_agent = self.choose_action(observations)
    logprobs = out_agent['action_dist'].log_prob(old_actions).squeeze()
    entropies = out_agent['entropy'].squeeze()
    Vs = out_agent['V'].squeeze()

    # Loss: clipped surrogate + clipped value loss + entropy bonus
    ratio = torch.exp(logprobs - old_logprobs)
    eps = self.config['agent.clip_range']
    policy_loss = -torch.min(ratio*old_As, torch.clamp(ratio, 1.0 - eps, 1.0 + eps)*old_As)
    entropy_loss = -entropies
    clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
    value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
                           F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)

    out = {}
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['explained_variance'] = ev(y_true=old_Qs.detach().cpu().numpy(), y_pred=Vs.detach().cpu().numpy())
    out['approx_kl'] = torch.mean(old_logprobs - logprobs).item()
    out['clip_frac'] = ((ratio < 1.0 - eps) | (ratio > 1.0 + eps)).float().mean().item()
    return out
def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    Ts = [len(traj) for traj in D]
    behavior_logprobs = [torch.cat(traj.get_all_info('action_logprob')) for traj in D]
    out_agent = self.choose_action(np.concatenate([traj.numpy_observations[:-1] for traj in D], 0))
    logprobs = out_agent['action_logprob'].squeeze()
    entropies = out_agent['entropy'].squeeze()
    Vs = out_agent['V'].squeeze()
    with torch.no_grad():
        last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
        last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
    vs, As = [], []
    for traj, behavior_logprob, logprob, V, last_V in zip(D, behavior_logprobs,
                                                          logprobs.detach().cpu().split(Ts),
                                                          Vs.detach().cpu().split(Ts), last_Vs):
        v, A = vtrace(behavior_logprob, logprob, self.gamma, traj.rewards, V, last_V,
                      traj.reach_terminal, self.clip_rho, self.clip_pg_rho)
        vs.append(v)
        As.append(A)

    # Metrics -> Tensor, device
    vs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [vs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean())/(As.std() + 1e-8)
    assert all([x.ndimension() == 1 for x in [logprobs, entropies, Vs, vs, As]])

    # Loss
    policy_loss = -logprobs*As
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, vs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([len(traj) for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(vs, 'float'), y_pred=numpify(Vs, 'float'))
    return out
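
# A sketch of V-trace targets and advantages following the IMPALA paper; this is
# an assumption about what the `vtrace` helper above computes (its exact signature
# and clipping conventions may differ), with the `c` coefficients truncated at 1.
# The name `vtrace_sketch` is hypothetical.
import numpy as np

def vtrace_sketch(behavior_logprobs, target_logprobs, gamma, rewards, Vs, last_V,
                  reach_terminal, clip_rho=1.0, clip_pg_rho=1.0):
    rewards, Vs = np.asarray(rewards, dtype=float), np.asarray(Vs, dtype=float)
    rhos = np.exp(np.asarray(target_logprobs, dtype=float) - np.asarray(behavior_logprobs, dtype=float))
    bootstrap = 0.0 if reach_terminal else float(last_V)
    next_Vs = np.append(Vs[1:], bootstrap)                           # V(s_{t+1})
    deltas = np.minimum(clip_rho, rhos)*(rewards + gamma*next_Vs - Vs)
    vs, acc = np.zeros_like(Vs), 0.0
    for t in reversed(range(len(rewards))):                          # backward recursion
        acc = deltas[t] + gamma*np.minimum(1.0, rhos[t])*acc         # vs_t - V(s_t)
        vs[t] = Vs[t] + acc
    next_vs = np.append(vs[1:], bootstrap)                           # vs_{t+1}
    As = np.minimum(clip_pg_rho, rhos)*(rewards + gamma*next_vs - Vs)
    return vs, As
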
def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    logprobs = [torch.cat(traj.get_all_info('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_all_info('V')) for traj in D]
    with torch.no_grad():
        last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
        last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
    Qs = [bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
          for traj, last_V in zip(D, last_Vs)]
    As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj, V, last_V)
          for traj, V, last_V in zip(D, Vs, last_Vs)]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                  [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean())/(As.std() + 1e-8)
    assert all([x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs*As
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([len(traj) for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
    return out
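
# A minimal sketch of the discounted return assumed to be produced by
# `bootstrapped_returns` above (the actual helper takes a Trajectory object; this
# hypothetical version takes the reward list directly): Q_t = r_t + gamma*Q_{t+1},
# with the tail seeded by `last_V` unless the trajectory reached a terminal state.
import numpy as np

def bootstrapped_returns_sketch(gamma, rewards, last_V, reach_terminal):
    R = 0.0 if reach_terminal else float(last_V)  # bootstrap value for the cut-off tail
    Qs = np.zeros(len(rewards))
    for t in reversed(range(len(rewards))):
        R = rewards[t] + gamma*R
        Qs[t] = R
    return Qs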