def learn(self, D, **kwargs):
    logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_infos('V')) for traj in D]
    last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
    Qs = [bootstrapped_returns(self.config['agent.gamma'], traj.rewards, last_V, traj.reach_terminal)
          for traj, last_V in zip(D, last_Vs)]
    As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj.rewards, V, last_V, traj.reach_terminal)
          for traj, V, last_V in zip(D, Vs, last_Vs)]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(), [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean())/(As.std() + 1e-4)
    assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs*As.detach()
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([traj.T for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -out['entropy_loss']
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
    return out

def td0_target(gamma, rewards, Vs, last_V, reach_terminal):
    r"""Calculate TD(0) targets for a batch of episodic transitions.

    Let :math:`r_1, r_2, \dots, r_T` be a list of rewards and let
    :math:`V(s_0), V(s_1), \dots, V(s_{T-1}), V(s_T)` be a list of state values
    including a last state value. Let :math:`\gamma` be a discount factor.
    The TD(0) targets are calculated as follows

    .. math::
        r_t + \gamma V(s_t), \quad \forall t = 1, 2, \dots, T

    .. note::
        The state values for terminal states are masked out as zero!

    """
    rewards = numpify(rewards, np.float32)
    Vs = numpify(Vs, np.float32)
    last_V = numpify(last_V, np.float32)

    if reach_terminal:
        Vs = np.append(Vs, 0.0)
    else:
        Vs = np.append(Vs, last_V)
    out = rewards + gamma*Vs[1:]
    return out.astype(np.float32)

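# Worked example (illustrative, not from the original source): a hand-computed check of
# td0_target using the formula in its docstring, with made-up numbers. The helper name
# below is hypothetical; it only assumes td0_target and numpy as imported above.
def test_td0_target_example():
    targets = td0_target(gamma=0.5, rewards=[1.0, 2.0, 3.0],
                         Vs=[10.0, 20.0, 30.0], last_V=40.0, reach_terminal=False)
    # r_t + gamma*V(s_t): [1 + 0.5*20, 2 + 0.5*30, 3 + 0.5*40]
    assert np.allclose(targets, [11.0, 17.0, 23.0])

    # With reach_terminal=True the bootstrap value is masked to zero,
    # so the last target becomes 3 + 0.5*0 = 3.
    targets = td0_target(gamma=0.5, rewards=[1.0, 2.0, 3.0],
                         Vs=[10.0, 20.0, 30.0], last_V=40.0, reach_terminal=True)
    assert np.allclose(targets, [11.0, 17.0, 3.0])
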
def test_vtrace(gamma, last_V, reach_terminal, clip_rho, clip_pg_rho):
    behavior_logprobs = [1, 2, 3]
    target_logprobs = [4, 5, 6]
    Rs = [7, 8, 9]
    Vs = [10, 11, 12]
    vs_test, As_test = vtrace(behavior_logprobs, target_logprobs, gamma, Rs, Vs,
                              last_V, reach_terminal, clip_rho, clip_pg_rho)

    # ground truth calculation
    behavior_logprobs = numpify(behavior_logprobs, np.float32)
    target_logprobs = numpify(target_logprobs, np.float32)
    Rs = numpify(Rs, np.float32)
    Vs = numpify(Vs, np.float32)
    last_V = numpify(last_V, np.float32)

    rhos = np.exp(target_logprobs - behavior_logprobs)
    clipped_rhos = np.minimum(clip_rho, rhos)
    cs = np.minimum(1.0, rhos)
    deltas = clipped_rhos*td0_error(gamma, Rs, Vs, last_V, reach_terminal)
    vs = np.array([
        Vs[0] + gamma**0*1*deltas[0] + gamma*cs[0]*deltas[1] + gamma**2*cs[0]*cs[1]*deltas[2],
        Vs[1] + gamma**0*1*deltas[1] + gamma*cs[1]*deltas[2],
        Vs[2] + gamma**0*1*deltas[2]
    ])
    vs_next = np.append(vs[1:], (1. - reach_terminal)*last_V)
    clipped_pg_rhos = np.minimum(clip_pg_rho, rhos)
    As = clipped_pg_rhos*(Rs + gamma*vs_next - Vs)

    assert np.allclose(vs, vs_test)
    assert np.allclose(As, As_test)

def choose_action(self, x, **kwargs):
    obs = tensorify(x.observation, self.device).unsqueeze(0)
    with torch.no_grad():
        if kwargs['mode'] == 'train':
            action = numpify(self.actor(obs).sample(), 'float')
        elif kwargs['mode'] == 'eval':
            action = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
    out = {}
    out['raw_action'] = action.squeeze(0)
    return out

def learn_one_update(self, data):
    data = [d.detach().to(self.device) for d in data]
    observations, old_actions, old_logprobs, old_entropies, old_Vs, old_Qs, old_As = data

    action_dist = self.policy(observations)
    logprobs = action_dist.log_prob(old_actions).squeeze()
    entropies = action_dist.entropy().squeeze()
    Vs = self.value(observations).squeeze()
    assert all([x.ndim == 1 for x in [logprobs, entropies, Vs]])

    ratio = torch.exp(logprobs - old_logprobs)
    eps = self.config['agent.clip_range']
    policy_loss = -torch.min(ratio*old_As,
                             torch.clamp(ratio, 1.0 - eps, 1.0 + eps)*old_As)
    policy_loss = policy_loss.mean(0)
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_grad_norm = nn.utils.clip_grad_norm_(self.policy.parameters(), self.config['agent.max_grad_norm'])
    self.policy_optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.policy_lr_scheduler.step(self.total_timestep)

    clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
    value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
                           F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
    value_loss = value_loss.mean(0)
    self.value_optimizer.zero_grad()
    value_loss.backward()
    value_grad_norm = nn.utils.clip_grad_norm_(self.value.parameters(), self.config['agent.max_grad_norm'])
    self.value_optimizer.step()

    out = {}
    out['policy_grad_norm'] = policy_grad_norm
    out['value_grad_norm'] = value_grad_norm
    out['policy_loss'] = policy_loss.item()
    out['policy_entropy'] = entropies.mean().item()
    out['value_loss'] = value_loss.item()
    out['explained_variance'] = ev(y_true=numpify(old_Qs, 'float'), y_pred=numpify(Vs, 'float'))
    out['approx_kl'] = (old_logprobs - logprobs).mean(0).item()
    out['clip_frac'] = ((ratio < 1.0 - eps) | (ratio > 1.0 + eps)).float().mean(0).item()
    return out

def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    out = {}
    if kwargs['mode'] == 'train':
        dist = self.actor(obs)
        action = dist.rsample()
        out['action'] = action
        out['action_logprob'] = dist.log_prob(action)
    elif kwargs['mode'] == 'stochastic':
        with torch.no_grad():
            out['action'] = numpify(self.actor(obs).sample(), 'float')
    elif kwargs['mode'] == 'eval':
        with torch.no_grad():
            out['action'] = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
    else:
        raise NotImplementedError
    return out

def choose_action(self, x, **kwargs):
    obs = tensorify(x.observation, self.device).unsqueeze(0)
    features = self.feature_network(obs)
    action_dist = self.action_head(features)
    action = action_dist.sample()
    out = {}
    out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
    return out

def test_numpify():
    # tensor
    x = torch.tensor(2.43)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = torch.randn(10)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = torch.randn(10, 20, 30)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    # ndarray
    x = np.array(2.43)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = np.random.randn(10)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = np.random.randn(10, 20, 30)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    # raw list
    x = [2.43]
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = [1, 2, 3, 4, 5, 6]
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = [[1, 2], [3, 4], [5, 6]]
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    with torch.no_grad():
        action = numpify(self.actor(obs), 'float')
    if kwargs['mode'] == 'train':
        eps = np.random.normal(0.0, self.config['agent.action_noise'], size=action.shape)
        action = np.clip(action + eps, self.env.action_space.low, self.env.action_space.high)
    out = {}
    out['action'] = action
    return out

def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    out = {}
    features = self.feature_network(obs)
    action_dist = self.action_head(features)
    out['entropy'] = action_dist.entropy()
    action = action_dist.sample()
    out['raw_action'] = numpify(action, 'float')
    return out

def learn(self, D, **kwargs):
    replay = kwargs['replay']
    episode_length = kwargs['episode_length']
    out = {}
    out['actor_loss'] = []
    out['critic_loss'] = []
    Q_vals = []
    for i in range(episode_length):
        observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])

        Qs = self.critic(observations, actions).squeeze()
        with torch.no_grad():
            next_Qs = self.critic_target(next_observations, self.actor_target(next_observations)).squeeze()
        targets = rewards + self.config['agent.gamma']*masks*next_Qs.detach()
        critic_loss = F.mse_loss(Qs, targets)
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
        self.critic_optimizer.step()

        actor_loss = -self.critic(observations, self.actor(observations)).mean()
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        actor_loss.backward()
        actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
        self.actor_optimizer.step()

        self.polyak_update_target()

        out['actor_loss'].append(actor_loss)
        out['critic_loss'].append(critic_loss)
        Q_vals.append(Qs)
    out['actor_loss'] = torch.stack(out['actor_loss']).mean().item()
    out['actor_grad_norm'] = actor_grad_norm
    out['critic_loss'] = torch.stack(out['critic_loss']).mean().item()
    out['critic_grad_norm'] = critic_grad_norm
    describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                     axis=-1, repr_indent=1, repr_prefix='\n')
    out['Q'] = describe_it(Q_vals)
    return out

def choose_action(self, x, **kwargs):
    obs = tensorify(x.observation, self.device).unsqueeze(0)
    features = self.feature_network(obs)
    action_dist = self.action_head(features)
    V = self.V_head(features)
    action = action_dist.sample()
    out = {}
    out['action_dist'] = action_dist
    out['V'] = V
    out['entropy'] = action_dist.entropy()
    out['action'] = action
    out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
    out['action_logprob'] = action_dist.log_prob(action.detach())
    return out

def vtrace(behavior_logprobs, target_logprobs, gamma, Rs, Vs, last_V, reach_terminal,
           clip_rho=1.0, clip_pg_rho=1.0):
    behavior_logprobs = numpify(behavior_logprobs, np.float32)
    target_logprobs = numpify(target_logprobs, np.float32)
    Rs = numpify(Rs, np.float32)
    Vs = numpify(Vs, np.float32)
    last_V = numpify(last_V, np.float32)
    assert all([item.ndim == 1 for item in [behavior_logprobs, target_logprobs, Rs, Vs]])
    assert np.isscalar(gamma)

    rhos = np.exp(target_logprobs - behavior_logprobs)
    clipped_rhos = np.minimum(clip_rho, rhos)
    cs = np.minimum(1.0, rhos)
    deltas = clipped_rhos*td0_error(gamma, Rs, Vs, last_V, reach_terminal)

    vs_minus_V = []
    total = 0.0
    for delta_t, c_t in zip(deltas[::-1], cs[::-1]):
        total = delta_t + gamma*c_t*total
        vs_minus_V.append(total)
    vs_minus_V = np.asarray(vs_minus_V)[::-1]

    vs = vs_minus_V + Vs
    vs_next = np.append(vs[1:], (1. - reach_terminal)*last_V)
    clipped_pg_rhos = np.minimum(clip_pg_rho, rhos)
    As = clipped_pg_rhos*(Rs + gamma*vs_next - Vs)
    return vs, As

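# Sanity-check sketch (added here for illustration, with made-up numbers; the test name
# is hypothetical). When the target and behavior policies agree, every importance ratio
# rho and truncation term c equals 1, so the V-trace targets v_s collapse to ordinary
# bootstrapped returns, v_s = r_s + gamma*v_{s+1} with v_{T+1} = last_V.
def test_vtrace_on_policy_example():
    logp = [0.1, 0.2, 0.3]
    vs, As = vtrace(logp, logp, 0.5, Rs=[1.0, 2.0, 3.0], Vs=[10.0, 20.0, 30.0],
                    last_V=40.0, reach_terminal=False)
    # v_3 = 3 + 0.5*40 = 23, v_2 = 2 + 0.5*23 = 13.5, v_1 = 1 + 0.5*13.5 = 7.75
    assert np.allclose(vs, [7.75, 13.5, 23.0])
    # Advantages: clipped_pg_rho*(R_t + gamma*v_{t+1} - V_t) with clipped_pg_rho = 1
    assert np.allclose(As, [1.0 + 0.5*13.5 - 10.0,
                            2.0 + 0.5*23.0 - 20.0,
                            3.0 + 0.5*40.0 - 30.0])
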
def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    out = {}
    action_dist = self.policy(obs)
    out['action_dist'] = action_dist
    out['entropy'] = action_dist.entropy()
    action = action_dist.sample()
    out['action'] = action
    out['raw_action'] = numpify(action, 'float')
    out['action_logprob'] = action_dist.log_prob(action.detach())
    V = self.value(obs)
    out['V'] = V
    return out

def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    out = {}
    features = self.feature_network(obs)
    action_dist = self.action_head(features)
    out['action_dist'] = action_dist
    out['entropy'] = action_dist.entropy()
    action = action_dist.sample()
    out['action'] = action
    out['raw_action'] = numpify(action, 'float')
    out['action_logprob'] = action_dist.log_prob(action.detach())
    V = self.V_head(features)
    out['V'] = V
    return out

def choose_action(self, x, **kwargs):
    if x.first():
        self.state = self.reset(1)
    obs = tensorify(x.observation, self.device).unsqueeze(0)
    obs = obs.unsqueeze(0)  # add seq_dim
    features, [next_state] = self.feature_network(obs, [self.state])
    if 'last_info' not in kwargs:
        self.state = next_state
    features = features.squeeze(0)  # squeeze seq_dim
    action_dist = self.action_head(features)
    V = self.V_head(features)
    action = action_dist.sample()
    out = {}
    out['action_dist'] = action_dist
    out['V'] = V
    out['entropy'] = action_dist.entropy()
    out['action'] = action
    out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
    out['action_logprob'] = action_dist.log_prob(action.detach())
    return out

def bootstrapped_returns(gamma, rewards, last_V, reach_terminal):
    r"""Return (discounted) accumulated returns with bootstrapping for a batch of
    episodic transitions.

    Formally, suppose we have all rewards :math:`(r_1, \dots, r_T)`, it computes

    .. math::
        Q_t = r_t + \gamma r_{t+1} + \dots + \gamma^{T - t} r_T + \gamma^{T - t + 1} V(s_{T+1})

    .. note::
        The state values for terminal states are masked out as zero!

    """
    last_V = numpify(last_V, np.float32).item()

    if reach_terminal:
        out = geometric_cumsum(gamma, np.append(rewards, 0.0))
    else:
        out = geometric_cumsum(gamma, np.append(rewards, last_V))
    return out[0, :-1].astype(np.float32)

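# Worked example (illustrative, not from the original source; the test name is
# hypothetical): returns hand-computed from the docstring formula above.
def test_bootstrapped_returns_example():
    Qs = bootstrapped_returns(gamma=0.5, rewards=[1.0, 2.0, 3.0], last_V=40.0, reach_terminal=False)
    # Q_3 = 3 + 0.5*40 = 23, Q_2 = 2 + 0.5*23 = 13.5, Q_1 = 1 + 0.5*13.5 = 7.75
    assert np.allclose(Qs, [7.75, 13.5, 23.0])

    # When the trajectory ends in a terminal state, the bootstrap value is masked to zero.
    Qs = bootstrapped_returns(gamma=0.5, rewards=[1.0, 2.0, 3.0], last_V=40.0, reach_terminal=True)
    assert np.allclose(Qs, [2.75, 3.5, 3.0])
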
def learn(self, D, **kwargs):
    replay = kwargs['replay']
    episode_length = kwargs['episode_length']
    out = {}
    out['actor_loss'] = []
    out['critic_loss'] = []
    out['alpha_loss'] = []
    Q1_vals = []
    Q2_vals = []
    logprob_vals = []
    for i in range(episode_length):
        observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])

        Qs1, Qs2 = self.critic(observations, actions)
        Qs1, Qs2 = map(lambda x: x.squeeze(-1), [Qs1, Qs2])
        with torch.no_grad():
            out_actor = self.choose_action(next_observations, mode='train')
            next_actions = out_actor['action']
            next_actions_logprob = out_actor['action_logprob']
            next_Qs1, next_Qs2 = self.critic_target(next_observations, next_actions)
            next_Qs = torch.min(next_Qs1, next_Qs2).squeeze(-1) - self.alpha.detach()*next_actions_logprob
            Q_targets = rewards + self.config['agent.gamma']*masks*next_Qs
        critic_loss = F.mse_loss(Qs1, Q_targets.detach()) + F.mse_loss(Qs2, Q_targets.detach())
        self.optimizer_zero_grad()
        critic_loss.backward()
        critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
        self.critic_optimizer.step()

        if i % self.config['agent.policy_delay'] == 0:
            out_actor = self.choose_action(observations, mode='train')
            policy_actions = out_actor['action']
            policy_actions_logprob = out_actor['action_logprob']
            actor_Qs1, actor_Qs2 = self.critic(observations, policy_actions)
            actor_Qs = torch.min(actor_Qs1, actor_Qs2).squeeze(-1)
            actor_loss = torch.mean(self.alpha.detach()*policy_actions_logprob - actor_Qs)
            self.optimizer_zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()

            alpha_loss = torch.mean(self.log_alpha*(-policy_actions_logprob - self.target_entropy).detach())
            self.optimizer_zero_grad()
            alpha_loss.backward()
            self.log_alpha_optimizer.step()

            self.polyak_update_target()

        out['actor_loss'].append(actor_loss)
        out['alpha_loss'].append(alpha_loss)
        out['critic_loss'].append(critic_loss)
        Q1_vals.append(Qs1)
        Q2_vals.append(Qs2)
        logprob_vals.append(policy_actions_logprob)
    out['actor_loss'] = torch.tensor(out['actor_loss']).mean().item()
    out['actor_grad_norm'] = actor_grad_norm
    out['critic_loss'] = torch.tensor(out['critic_loss']).mean().item()
    out['critic_grad_norm'] = critic_grad_norm
    describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                     axis=-1, repr_indent=1, repr_prefix='\n')
    out['Q1'] = describe_it(Q1_vals)
    out['Q2'] = describe_it(Q2_vals)
    out['logprob'] = describe_it(logprob_vals)
    out['alpha_loss'] = torch.tensor(out['alpha_loss']).mean().item()
    out['alpha'] = self.alpha.item()
    return out

def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    Ts = [len(traj) for traj in D]
    behavior_logprobs = [torch.cat(traj.get_all_info('action_logprob')) for traj in D]
    out_agent = self.choose_action(np.concatenate([traj.numpy_observations[:-1] for traj in D], 0))
    logprobs = out_agent['action_logprob'].squeeze()
    entropies = out_agent['entropy'].squeeze()
    Vs = out_agent['V'].squeeze()
    with torch.no_grad():
        last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
        last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
    vs, As = [], []
    for traj, behavior_logprob, logprob, V, last_V in zip(D, behavior_logprobs,
                                                          logprobs.detach().cpu().split(Ts),
                                                          Vs.detach().cpu().split(Ts),
                                                          last_Vs):
        v, A = vtrace(behavior_logprob, logprob, self.gamma, traj.rewards, V, last_V,
                      traj.reach_terminal, self.clip_rho, self.clip_pg_rho)
        vs.append(v)
        As.append(A)

    # Metrics -> Tensor, device
    vs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [vs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean())/(As.std() + 1e-8)
    assert all([x.ndimension() == 1 for x in [logprobs, entropies, Vs, vs, As]])

    # Loss
    policy_loss = -logprobs*As
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, vs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([len(traj) for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(vs, 'float'), y_pred=numpify(Vs, 'float'))
    return out

def test_numpify():
    # Tensor
    x = torch.randn(5)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    del x, y

    x = torch.randn(5, 4)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    del x, y

    x = torch.randn(5, 4)
    y = numpify(x, dtype=np.float16)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    assert y.dtype == np.float16
    del x, y

    # Array
    x = np.random.randn(5)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    del x, y

    x = np.random.randn(5, 4)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    del x, y

    x = np.random.randn(5, 4)
    y = numpify(x, dtype=np.float16)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    assert y.dtype == np.float16
    del x, y

    # List
    x = [1, 2, 3, 4]
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert np.allclose(x, y)
    del x, y

    x = [[1.2, 2.3], [3.4, 4.5]]
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert np.allclose(x, y)
    del x, y

    # Tuple
    x = (1, 2, 3, 4)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert np.allclose(x, y)
    del x, y

    x = ((1.2, 2.3), (3.4, 4.5))
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert np.allclose(x, y)
    del x, y

    # Scalar
    x = 1
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    del x, y

    # Bool
    x = True
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    del x, y

def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    logprobs = [torch.cat(traj.get_all_info('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_all_info('V')) for traj in D]
    with torch.no_grad():
        last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
        last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
    Qs = [bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
          for traj, last_V in zip(D, last_Vs)]
    As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj, V, last_V)
          for traj, V, last_V in zip(D, Vs, last_Vs)]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(), [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean())/(As.std() + 1e-8)
    assert all([x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs*As
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.optimizer.step()
    self.total_timestep += sum([len(traj) for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
    return out

def learn(self, D, **kwargs):
    replay = kwargs['replay']
    T = kwargs['T']
    list_actor_loss = []
    list_critic_loss = []
    Q1_vals = []
    Q2_vals = []
    for i in range(T):
        observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])

        Qs1, Qs2 = self.critic(observations, actions)
        with torch.no_grad():
            next_actions = self.actor_target(next_observations)
            eps = torch.empty_like(next_actions).normal_(0.0, self.config['agent.target_noise'])
            eps = eps.clamp(-self.config['agent.target_noise_clip'], self.config['agent.target_noise_clip'])
            next_actions = torch.clamp(next_actions + eps, -self.max_action, self.max_action)
            next_Qs1, next_Qs2 = self.critic_target(next_observations, next_actions)
            next_Qs = torch.min(next_Qs1, next_Qs2)
            targets = rewards + self.config['agent.gamma']*masks*next_Qs
        critic_loss = F.mse_loss(Qs1, targets.detach()) + F.mse_loss(Qs2, targets.detach())
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
        self.critic_optimizer.step()

        if i % self.config['agent.policy_delay'] == 0:
            actor_loss = -self.critic.Q1(observations, self.actor(observations)).mean()
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()

            self.polyak_update_target()

        list_actor_loss.append(actor_loss)
        list_critic_loss.append(critic_loss)
        Q1_vals.append(Qs1)
        Q2_vals.append(Qs2)
    self.total_timestep += T

    out = {}
    out['actor_loss'] = torch.tensor(list_actor_loss).mean(0).item()
    out['actor_grad_norm'] = actor_grad_norm
    out['critic_loss'] = torch.tensor(list_critic_loss).mean(0).item()
    out['critic_grad_norm'] = critic_grad_norm
    describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                     axis=-1, repr_indent=1, repr_prefix='\n')
    out['Q1'] = describe_it(Q1_vals)
    out['Q2'] = describe_it(Q2_vals)
    return out

def learn(self, D, **kwargs):
    replay = kwargs['replay']
    T = kwargs['T']
    list_actor_loss = []
    list_critic_loss = []
    list_alpha_loss = []
    Q1_vals = []
    Q2_vals = []
    logprob_vals = []
    for i in range(T):
        observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])

        Qs1, Qs2 = self.critic(observations, actions)
        with torch.no_grad():
            action_dist = self.actor(next_observations)
            next_actions = action_dist.rsample()
            next_actions_logprob = action_dist.log_prob(next_actions).unsqueeze(-1)
            next_Qs1, next_Qs2 = self.critic_target(next_observations, next_actions)
            next_Qs = torch.min(next_Qs1, next_Qs2) - self.alpha.detach()*next_actions_logprob
            targets = rewards + self.config['agent.gamma']*masks*next_Qs
        critic_loss = F.mse_loss(Qs1, targets.detach()) + F.mse_loss(Qs2, targets.detach())
        self.optimizer_zero_grad()
        critic_loss.backward()
        critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
        self.critic_optimizer.step()

        action_dist = self.actor(observations)
        policy_actions = action_dist.rsample()
        policy_actions_logprob = action_dist.log_prob(policy_actions).unsqueeze(-1)
        actor_Qs1, actor_Qs2 = self.critic(observations, policy_actions)
        actor_Qs = torch.min(actor_Qs1, actor_Qs2)
        actor_loss = torch.mean(self.alpha.detach()*policy_actions_logprob - actor_Qs)
        self.optimizer_zero_grad()
        actor_loss.backward()
        actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
        self.actor_optimizer.step()

        alpha_loss = torch.mean(self.log_alpha*(-policy_actions_logprob - self.target_entropy).detach())
        self.optimizer_zero_grad()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

        self.polyak_update_target()

        list_actor_loss.append(actor_loss)
        list_critic_loss.append(critic_loss)
        list_alpha_loss.append(alpha_loss)
        Q1_vals.append(Qs1)
        Q2_vals.append(Qs2)
        logprob_vals.append(policy_actions_logprob)
    self.total_timestep += T

    out = {}
    out['actor_loss'] = torch.tensor(list_actor_loss).mean(0).item()
    out['actor_grad_norm'] = actor_grad_norm
    out['critic_loss'] = torch.tensor(list_critic_loss).mean(0).item()
    out['critic_grad_norm'] = critic_grad_norm
    describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                     axis=-1, repr_indent=1, repr_prefix='\n')
    out['Q1'] = describe_it(Q1_vals)
    out['Q2'] = describe_it(Q2_vals)
    out['logprob'] = describe_it(logprob_vals)
    out['alpha_loss'] = torch.tensor(list_alpha_loss).mean(0).item()
    out['alpha'] = self.alpha.item()
    return out