def test_bootstrapped_returns(gamma, last_V):
    # Non-terminal trajectory: the bootstrap value last_V is included.
    y = [
        0.1 + gamma * (0.2 + gamma * (0.3 + gamma * (0.4 + gamma * last_V))),
        0.2 + gamma * (0.3 + gamma * (0.4 + gamma * last_V)),
        0.3 + gamma * (0.4 + gamma * last_V),
        0.4 + gamma * last_V
    ]
    reach_terminal = False
    rewards = [0.1, 0.2, 0.3, 0.4]
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, last_V, reach_terminal), y)
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, torch.tensor(last_V),
                             reach_terminal), y)

    # Terminal trajectory: the bootstrap value is masked out.
    y = [
        0.1 + gamma * (0.2 + gamma * (0.3 + gamma * (0.4 + gamma * last_V * 0.0))),
        0.2 + gamma * (0.3 + gamma * (0.4 + gamma * last_V * 0.0)),
        0.3 + gamma * (0.4 + gamma * last_V * 0.0),
        0.4 + gamma * last_V * 0.0
    ]
    reach_terminal = True
    rewards = [0.1, 0.2, 0.3, 0.4]
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, last_V, reach_terminal), y)
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, torch.tensor(last_V),
                             reach_terminal), y)

    # Same checks with a length-5 reward sequence.
    y = [
        0.1 + gamma * (0.2 + gamma * (0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V)))),
        0.2 + gamma * (0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V))),
        0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V)),
        0.4 + gamma * (0.5 + gamma * last_V),
        0.5 + gamma * last_V
    ]
    reach_terminal = False
    rewards = [0.1, 0.2, 0.3, 0.4, 0.5]
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, last_V, reach_terminal), y)
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, torch.tensor(last_V),
                             reach_terminal), y)

    y = [
        0.1 + gamma * (0.2 + gamma * (0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V * 0.0)))),
        0.2 + gamma * (0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V * 0.0))),
        0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V * 0.0)),
        0.4 + gamma * (0.5 + gamma * last_V * 0.0),
        0.5 + gamma * last_V * 0.0
    ]
    reach_terminal = True
    rewards = [0.1, 0.2, 0.3, 0.4, 0.5]
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, last_V, reach_terminal), y)
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, torch.tensor(last_V),
                             reach_terminal), y)
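# The test above pins down the expected behavior: returns are discounted
# backwards over the rewards, and the bootstrap value `last_V` is only added
# when the trajectory did not reach a terminal state. A minimal sketch of a
# function satisfying these assertions is shown below; it is an assumption,
# not the library's actual implementation.
def _bootstrapped_returns_sketch(gamma, rewards, last_V, reach_terminal):
    # Zero out the bootstrap value for terminal trajectories.
    R = float(last_V) * (0.0 if reach_terminal else 1.0)
    out = []
    for r in reversed(rewards):
        R = r + gamma * R
        out.append(R)
    return out[::-1]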
def learn(self, D, **kwargs):
    logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_infos('V')) for traj in D]
    last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
    Qs = [
        bootstrapped_returns(self.config['agent.gamma'], traj.rewards,
                             last_V, traj.reach_terminal)
        for traj, last_V in zip(D, last_Vs)
    ]
    As = [
        gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
            traj.rewards, V, last_V, traj.reach_terminal)
        for traj, V, last_V in zip(D, Vs, last_Vs)
    ]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                  [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device),
                 [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean()) / (As.std() + 1e-4)
    assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs * As.detach()
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = (policy_loss + self.config['agent.value_coef'] * value_loss
            + self.config['agent.entropy_coef'] * entropy_loss)
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(),
                                         self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([traj.T for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -out['entropy_loss']
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(),
                        axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                   y_pred=numpify(Vs, 'float'))
    return out
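# `gae` above is called as gae(gamma, lam, rewards, V, last_V, reach_terminal).
# Below is a minimal sketch of generalized advantage estimation consistent with
# that call signature; it is an assumption for illustration, not the library's
# actual implementation (the variants further down pass the trajectory itself
# instead of rewards/reach_terminal).
import numpy as np

def _gae_sketch(gamma, lam, rewards, V, last_V, reach_terminal):
    # Flatten per-step value estimates and append the bootstrap value,
    # which is zeroed when the trajectory ended in a terminal state.
    values = np.array([float(v) for v in V] + [float(last_V)])
    if reach_terminal:
        values[-1] = 0.0
    rewards = np.asarray(rewards, dtype=np.float64)
    # TD residuals: delta_t = r_t + gamma * V_{t+1} - V_t
    deltas = rewards + gamma * values[1:] - values[:-1]
    # Advantages: discounted (gamma * lam) sum of residuals, accumulated backwards.
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages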
def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    logprobs = [
        torch.cat(traj.get_all_info('action_logprob')) for traj in D
    ]
    entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_all_info('V')) for traj in D]

    last_observations = torch.from_numpy(
        np.concatenate([traj.last_observation for traj in D], 0)).float()
    with torch.no_grad():
        last_Vs = self.V_head(
            self.feature_network(last_observations.to(
                self.device))).squeeze(-1)
    Qs = [
        bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
        for traj, last_V in zip(D, last_Vs)
    ]
    As = [
        gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
            traj, V, last_V) for traj, V, last_V in zip(D, Vs, last_Vs)
    ]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                  [logprobs, entropies, Vs])
    Qs, As = map(
        lambda x: torch.from_numpy(np.concatenate(x).copy()).to(self.device),
        [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean()) / (As.std() + 1e-8)
    assert all(
        [x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    dataset = Dataset(D, logprobs, entropies, Vs, Qs, As)
    dataloader = DataLoader(dataset, self.config['train.batch_size'],
                            shuffle=True)
    for epoch in range(self.config['train.num_epochs']):
        logs = [self.learn_one_update(data) for data in dataloader]

    self.total_timestep += sum([len(traj) for traj in D])
    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = np.mean([item['loss'] for item in logs])
    out['grad_norm'] = np.mean([item['grad_norm'] for item in logs])
    out['policy_loss'] = np.mean([item['policy_loss'] for item in logs])
    out['entropy_loss'] = np.mean([item['entropy_loss'] for item in logs])
    out['policy_entropy'] = np.mean(
        [item['policy_entropy'] for item in logs])
    out['value_loss'] = np.mean([item['value_loss'] for item in logs])
    out['explained_variance'] = np.mean(
        [item['explained_variance'] for item in logs])
    out['approx_kl'] = np.mean([item['approx_kl'] for item in logs])
    out['clip_frac'] = np.mean([item['clip_frac'] for item in logs])
    return out
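# The `Dataset`/`DataLoader` pair above serves shuffled per-timestep minibatches
# to `learn_one_update` over several epochs. A minimal sketch of such a dataset
# is given below; the attribute names `traj.observations` and `traj.actions`
# are assumptions about the Trajectory container, not the library's actual API.
import numpy as np
from torch.utils import data

class _TrajectoryDatasetSketch(data.Dataset):
    def __init__(self, D, logprobs, entropies, Vs, Qs, As):
        # Flatten per-step data across trajectories; detach tensors so the
        # DataLoader hands out plain numpy records.
        self.observations = np.concatenate(
            [np.asarray(traj.observations) for traj in D], axis=0)
        self.actions = np.concatenate(
            [np.asarray(traj.actions) for traj in D], axis=0)
        self.logprobs = logprobs.detach().cpu().numpy()
        self.entropies = entropies.detach().cpu().numpy()
        self.Vs = Vs.detach().cpu().numpy()
        self.Qs = Qs.detach().cpu().numpy()
        self.As = As.detach().cpu().numpy()

    def __len__(self):
        return len(self.logprobs)

    def __getitem__(self, i):
        return (self.observations[i], self.actions[i], self.logprobs[i],
                self.entropies[i], self.Vs[i], self.Qs[i], self.As[i])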
def learn(self, D, **kwargs):
    logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_infos('V')) for traj in D]
    with torch.no_grad():
        last_observations = tensorify([traj[-1].observation for traj in D],
                                      self.device)
        last_Vs = self.value(last_observations).squeeze(-1)
    Qs = [
        bootstrapped_returns(self.config['agent.gamma'], traj.rewards,
                             last_V, traj.reach_terminal)
        for traj, last_V in zip(D, last_Vs)
    ]
    As = [
        gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
            traj.rewards, V, last_V, traj.reach_terminal)
        for traj, V, last_V in zip(D, Vs, last_Vs)
    ]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                  [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device),
                 [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean()) / (As.std() + 1e-4)
    assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    dataset = Dataset(D, logprobs, entropies, Vs, Qs, As)
    dataloader = DataLoader(dataset, self.config['train.batch_size'],
                            shuffle=True)
    for epoch in range(self.config['train.num_epochs']):
        logs = [self.learn_one_update(data) for data in dataloader]

    self.total_timestep += sum([traj.T for traj in D])
    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.policy_lr_scheduler.get_lr()
    out['policy_grad_norm'] = np.mean(
        [item['policy_grad_norm'] for item in logs])
    out['value_grad_norm'] = np.mean(
        [item['value_grad_norm'] for item in logs])
    out['policy_loss'] = np.mean([item['policy_loss'] for item in logs])
    out['policy_entropy'] = np.mean(
        [item['policy_entropy'] for item in logs])
    out['value_loss'] = np.mean([item['value_loss'] for item in logs])
    out['explained_variance'] = np.mean(
        [item['explained_variance'] for item in logs])
    out['approx_kl'] = np.mean([item['approx_kl'] for item in logs])
    out['clip_frac'] = np.mean([item['clip_frac'] for item in logs])
    return out
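# `learn_one_update` is not shown here, but the diagnostics it logs
# (`approx_kl`, `clip_frac`) are typically derived from the ratio between new
# and old action log-probabilities in a PPO-style clipped objective. A hedged
# sketch of that piece follows; `_clip_diagnostics_sketch` and the `eps` clip
# range are hypothetical names, not the library's API.
import torch

def _clip_diagnostics_sketch(new_logprobs, old_logprobs, As, eps=0.2):
    ratio = torch.exp(new_logprobs - old_logprobs)
    # Clipped surrogate objective (to be maximized, hence the minus for a loss).
    surr1 = ratio * As
    surr2 = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * As
    policy_loss = -torch.min(surr1, surr2).mean()
    with torch.no_grad():
        approx_kl = (old_logprobs - new_logprobs).mean()        # crude KL estimate
        clip_frac = ((ratio - 1.0).abs() > eps).float().mean()  # fraction of clipped ratios
    return policy_loss, approx_kl.item(), clip_frac.item()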
def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    logprobs = [
        torch.cat(traj.get_all_info('action_logprob')) for traj in D
    ]
    entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_all_info('V')) for traj in D]

    with torch.no_grad():
        last_observations = tensorify(
            np.concatenate([traj.last_observation for traj in D], 0),
            self.device)
        last_Vs = self.V_head(
            self.feature_network(last_observations)).squeeze(-1)
    Qs = [
        bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
        for traj, last_V in zip(D, last_Vs)
    ]
    As = [
        gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
            traj, V, last_V) for traj, V, last_V in zip(D, Vs, last_Vs)
    ]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                  [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device),
                 [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean()) / (As.std() + 1e-8)
    assert all(
        [x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs * As
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = (policy_loss + self.config['agent.value_coef'] * value_loss
            + self.config['agent.entropy_coef'] * entropy_loss)
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(),
                                         self.config['agent.max_grad_norm'])
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.optimizer.step()
    self.total_timestep += sum([len(traj) for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(),
                        axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                   y_pred=numpify(Vs, 'float'))
    return out
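# The `ev` call above reports the explained variance of the value predictions
# against the bootstrapped returns. A minimal sketch using the standard
# definition is shown below; the library presumably ships its own helper, so
# this is only illustrative.
import numpy as np

def _explained_variance_sketch(y_true, y_pred):
    # 1 - Var(residual) / Var(target): 1.0 is a perfect fit, values <= 0 mean
    # the predictions are no better than predicting the mean return.
    var_y = np.var(y_true)
    return float(1.0 - np.var(y_true - y_pred) / (var_y + 1e-8))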