def loss(self, episodes, inds=None):
    """PPO surrogate loss."""
    log_ratios, advantages, values, entropy = self._forward_policy(
        episodes, ratio=True)

    # Clipped policy-gradient loss
    ratio = torch.exp(log_ratios)
    pg_loss1 = -advantages * ratio
    pg_loss2 = -advantages * torch.clamp(
        ratio, min=1.0 - self.clip_frac, max=1.0 + self.clip_frac)

    # Clipped value loss
    values_clipped = episodes.old_values + torch.clamp(
        values.squeeze() - episodes.old_values,
        min=-self.clip_frac, max=self.clip_frac)
    vf_loss1 = (values.squeeze() - episodes.returns) ** 2
    vf_loss2 = (values_clipped - episodes.returns) ** 2

    if inds is None:
        inds = np.arange(self.num_workers)
    masks = episodes.mask[:, inds]

    pg_loss = weighted_mean(torch.max(pg_loss1, pg_loss2)[:, inds],
                            dim=0, weights=masks)
    vf_loss = 0.5 * weighted_mean(torch.max(vf_loss1, vf_loss2)[:, inds],
                                  dim=0, weights=masks)
    entropy_loss = weighted_mean(entropy[:, inds], dim=0, weights=masks)

    return pg_loss + self.vf_coef * vf_loss - self.ent_coef * entropy_loss
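Written out, this is the standard PPO objective: a clipped policy-gradient term, a clipped value term, and an entropy bonus. With r_t(θ) the probability ratio, ε = clip_frac, c_v = vf_coef, and c_e = ent_coef:

    r_t(\theta) = \exp\big(\log \pi_\theta(a_t \mid s_t) - \log \pi_{\theta_\text{old}}(a_t \mid s_t)\big)
    \mathcal{L}^{PG} = \mathbb{E}_t\big[\max\big(-\hat{A}_t\, r_t(\theta),\; -\hat{A}_t\, \mathrm{clip}(r_t(\theta),\, 1 - \epsilon,\, 1 + \epsilon)\big)\big]
    \mathcal{L}^{VF} = \tfrac{1}{2}\, \mathbb{E}_t\big[\max\big((V_\theta(s_t) - R_t)^2,\; (V_\text{clipped} - R_t)^2\big)\big]
    \mathcal{L} = \mathcal{L}^{PG} + c_v\, \mathcal{L}^{VF} - c_e\, \mathbb{E}_t\big[\mathcal{H}[\pi_\theta](s_t)\big]

where the expectations are the mask-weighted means over the selected workers.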
async def surrogate_loss(self, train_futures, valid_futures, old_pi=None,
                         args=None, inner=None):
    first_order = (old_pi is not None) or self.first_order
    params = await self.adapt(train_futures,
                              first_order=first_order,
                              args=args,
                              inner=inner)

    with torch.set_grad_enabled(old_pi is None):
        valid_episodes = await valid_futures
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        log_ratio = (pi.log_prob(valid_episodes.actions)
                     - old_pi.log_prob(valid_episodes.actions))
        ratio = torch.exp(log_ratio)

        losses = -weighted_mean(ratio * valid_episodes.advantages,
                                lengths=valid_episodes.lengths)
        kls = weighted_mean(kl_divergence(pi, old_pi),
                            lengths=valid_episodes.lengths)

    return losses.mean(), kls.mean(), old_pi
async def surrogate_loss(self, train_futures, valid_futures, old_pi=None):
    first_order = (old_pi is not None) or self.first_order
    # Suspend this coroutine until adapt() finishes and returns its result;
    # the function must pause here before going any further.
    params = await self.adapt(train_futures, first_order=first_order)
    """
    Only proceed once the train_futures above have completed.
    Each train_future is paired with one valid_future, and each pair runs
    in parallel, independently of the others. The number of futures equals
    the number of tasks in each batch.
    """
    with torch.set_grad_enabled(old_pi is None):
        # Suspend the coroutine until the valid_futures awaitable finishes
        # and returns the validation episodes.
        valid_episodes = await valid_futures
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        log_ratio = (pi.log_prob(valid_episodes.actions)
                     - old_pi.log_prob(valid_episodes.actions))
        ratio = torch.exp(log_ratio)

        losses = -weighted_mean(ratio * valid_episodes.advantages,
                                lengths=valid_episodes.lengths)
        kls = weighted_mean(kl_divergence(pi, old_pi),
                            lengths=valid_episodes.lengths)

    return losses.mean(), kls.mean(), old_pi
def loss(self, episodes):
    """REINFORCE gradient with baseline [2], computed on advantages
    estimated with Generalized Advantage Estimation (GAE, [3]).
    """
    log_probs, advantages, values, entropy = self._forward_policy(episodes)

    pg_loss = -weighted_mean(log_probs * advantages,
                             dim=0, weights=episodes.mask)
    vf_loss = 0.5 * weighted_mean((values.squeeze() - episodes.returns) ** 2,
                                  dim=0, weights=episodes.mask)
    entropy_loss = weighted_mean(entropy, dim=0, weights=episodes.mask)

    return pg_loss + self.vf_coef * vf_loss - self.ent_coef * entropy_loss
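For reference, the GAE advantages used throughout this section (the `tau` argument plays the role of GAE's λ) are built from the TD residuals of the value baseline:

    \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad \hat{A}_t = \sum_{l \ge 0} (\gamma\tau)^l\, \delta_{t+l}

and the policy term above is the REINFORCE estimator -\mathbb{E}_t[\log \pi_\theta(a_t \mid s_t)\, \hat{A}_t].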
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        if self.usePPO:
            params, grad_norm = self.adapt_ppo(train_episodes)
        else:
            params = self.adapt(train_episodes)
        self.logger.info("in surrogate_loss")

        with torch.set_grad_enabled(old_pi is None):
            # With a shared critic head the policy returns (pi, values)
            if self.baseline_type == 'critic shared':
                pi, _ = self.policy(valid_episodes.observations,
                                    params=params)
            else:
                pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            if self.baseline_type == 'linear':
                values = self.baseline(valid_episodes)
            elif self.baseline_type == 'critic separate':
                values = self.baseline(valid_episodes.observations)
            elif self.baseline_type == 'critic shared':
                _, values = self.policy(valid_episodes.observations,
                                        params=params)

            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages,
                                  dim=0, weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)
def surrogate_loss(self, episodes, old_pis=None):
    """TRPO surrogate loss. `old_pis` is non-None only during the line
    search, where it holds the detached per-task policies from before the
    update and plays the role of the fixed behavior policy in the TRPO
    ratio; `pi` is then re-evaluated at the candidate parameters.
    """
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        # Adapt our policy network to the new task
        params = self.adapt(train_episodes)

        # Gradients are only needed on the first pass (old_pi is None);
        # during the line search the loss is evaluated without gradients.
        with torch.set_grad_enabled(old_pi is None):
            pi = self.policy(valid_episodes.observations, params=params)
            # The set of policies adapted to each task
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages,
                                  dim=0, weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)
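The quantity being optimized per task is the standard TRPO surrogate under an average-KL trust region; the conjugate-gradient step and line search elsewhere enforce the constraint:

    \max_\theta\; \mathbb{E}_t\Big[\frac{\pi_\theta(a_t \mid s_t)}{\pi_\text{old}(a_t \mid s_t)}\, \hat{A}_t\Big] \quad \text{s.t.} \quad \bar{D}_{KL}\big(\pi_\theta \,\|\, \pi_\text{old}\big) \le \delta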
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, action_dists, critic_losses = [], [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        policy_params, critic_params = self.adapt(train_episodes)

        with torch.set_grad_enabled(old_pi is None):
            action_dist = self.policy(valid_episodes.observations,
                                      params=policy_params)
            action_dists.append(detach_distribution(action_dist))

            if old_pi is None:
                old_pi = detach_distribution(action_dist)

            values = self.critic(valid_episodes.observations,
                                 params=critic_params)
            advantages = valid_episodes.gae(values, tau=self.tau)
            value_loss = weighted_mean(advantages.pow(2),
                                       dim=0, weights=valid_episodes.mask)
            critic_losses.append(value_loss)

            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask,
                                            epsilon=1e-5)

            log_ratio = (action_dist.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages.detach(),
                                  dim=0, weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(action_dist, old_pi),
                               dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            action_dists,
            torch.mean(torch.stack(critic_losses, dim=0)))
def kl_divergence(self, episodes, old_pis=None):
    kls = []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        self.logger.info("in kl divergence")
        if self.usePPO:
            params, grad_norm = self.adapt_ppo(train_episodes)
        else:
            params = self.adapt(train_episodes)
            grad_norm = []

        # if self.baseline_type == 'critic shared':
        #     pi, _ = self.policy(valid_episodes.observations, params=params)
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        mask = valid_episodes.mask
        if valid_episodes.actions.dim() > 2:
            mask = mask.unsqueeze(2)
        kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
        kls.append(kl)

        self.logger.info("kl:")
        self.logger.info(kls)
        self.logger.info("grad_norm:")
        self.logger.info(grad_norm)

    return torch.mean(torch.stack(kls, dim=0))
def inner_critic_loss(self, episodes, params=None):
    # Pass the adapted critic parameters through (the signature accepts
    # them, so ignoring them here would be a bug)
    values = self.critic(episodes.observations, params=params)
    advantages = episodes.gae(values, tau=self.tau)
    value_loss = weighted_mean(advantages.pow(2),
                               dim=0, weights=episodes.mask)
    return value_loss
def kl_divergence_ng(self, episodes):
    # `episodes` are the train episodes
    pi = self.policy(episodes.observations)
    pi_detach = detach_distribution(pi)

    mask = episodes.mask
    if episodes.actions.dim() > 2:
        mask = mask.unsqueeze(2)
    kl = weighted_mean(kl_divergence(pi_detach, pi), dim=0, weights=mask)
    return kl
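Detaching one argument makes this averaged KL a function of θ alone with a minimum at the current parameters, which is exactly what natural-gradient methods need: its gradient vanishes at θ₀ and its Hessian there is the Fisher information matrix, so Hessian-vector products through this scalar implement Fisher-vector products:

    \nabla_\theta\, \bar{D}_{KL}\big(\pi_{\theta_0} \,\|\, \pi_\theta\big)\Big|_{\theta = \theta_0} = 0, \qquad \nabla^2_\theta\, \bar{D}_{KL}\big(\pi_{\theta_0} \,\|\, \pi_\theta\big)\Big|_{\theta = \theta_0} = F(\theta_0)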
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)

        with torch.set_grad_enabled(True):
            pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            # PPO-style clipped objective
            loss_clipped = ratio.clamp(1.0 - self.ppo_ratio,
                                       1.0 + self.ppo_ratio) * advantages
            loss = ratio * advantages
            loss = -torch.min(loss, loss_clipped)
            loss = weighted_mean(loss, dim=0, weights=valid_episodes.mask)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(old_pi, pi), dim=0, weights=mask)
            kls.append(kl)

            # Small fixed KL penalty added to the clipped loss
            losses.append(loss + kl * 0.0005)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)
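This variant mixes the two PPO regularizers: the clipped surrogate plus a small fixed KL penalty with coefficient β = 5 × 10⁻⁴ (and ε = ppo_ratio):

    \mathcal{L} = -\mathbb{E}_t\big[\min\big(r_t \hat{A}_t,\; \mathrm{clip}(r_t,\, 1-\epsilon,\, 1+\epsilon)\, \hat{A}_t\big)\big] + \beta\, \bar{D}_{KL}\big(\pi_\text{old} \,\|\, \pi_\theta\big)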
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)

        with torch.set_grad_enabled(old_pi is None):
            pi = self.policy(valid_episodes.observations, params=params)
            # Detach the mu/scale parameters of distribution pi: no
            # gradients flow to the stored copies, so they are not updated.
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            # Initially 0; changes during the line search as pi moves
            # while old_pi stays fixed.
            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages,
                                  dim=0, weights=valid_episodes.mask)
            # The weighted-mean loss is very small, around 1e-8 in magnitude
            print('loss: ', loss)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)
def surrogate_loss(self, episodes, old_pis=None):
    """Computes the surrogate loss in TRPO: (pi(a|s) / q(a|s)) * Q(s,a),
    as in Eqn 14. Because the meta-loss looks for a theta that minimizes
    the loss after adaptation to phi, the loss is computed on the valid
    episodes.
    """
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)

        with torch.set_grad_enabled(old_pi is None):
            pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            # Convert back from log-ratio to ratio
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages,
                                  dim=0, weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)
def reinforce_loss(policy, episodes, params=None):
    # Flatten (T, B, ...) to (T * B, ...) for a single forward pass
    pi = policy(episodes.observations.view((-1, *episodes.observation_shape)),
                params=params)

    log_probs = pi.log_prob(episodes.actions.view((-1, *episodes.action_shape)))
    log_probs = log_probs.view(len(episodes), episodes.batch_size)

    losses = -weighted_mean(log_probs * episodes.advantages,
                            lengths=episodes.lengths)

    return losses.mean()
def val(args, sampler_val, policy, baseline, batch):
    start_time = time.time()
    from maml_rl.utils.torch_utils import weighted_normalize, weighted_mean

    tasks_val = sampler_val.sample_tasks()
    task_to_episodes = dict()

    for task in tasks_val:
        task_episodes = []
        sampler_val.reset_task(task)

        for i_episode in range(args.num_adapt_val + 1):
            if i_episode == 0:
                params = None
            episodes = sampler_val.sample(policy, params=params,
                                          gamma=args.gamma,
                                          device=args.device)

            # Compute the inner loss
            baseline.fit(episodes)
            values = baseline(episodes)
            advantages = episodes.gae(values, tau=args.tau)
            advantages = weighted_normalize(advantages, weights=episodes.mask)

            pi = policy(episodes.observations, params=params)
            log_probs = pi.log_prob(episodes.actions)
            if log_probs.dim() > 2:
                log_probs = torch.sum(log_probs, dim=2)
            entropy = pi.entropy().mean()
            loss = (-weighted_mean(log_probs * advantages,
                                   dim=0, weights=episodes.mask)
                    - args.entropy_coef_val * entropy)

            # Base learning rate on the first adaptation step, a separate
            # rate for all subsequent steps
            fast_lr = (args.fast_lr if i_episode == 0
                       else args.fast_lr_val_after_one)
            if i_episode <= args.num_adapt_val:
                params = policy.update_params(loss, step_size=fast_lr,
                                              first_order=True)

            task_episodes.append(episodes)

        task_to_episodes[str(task)] = task_episodes

    for i_episode in range(args.num_adapt_val + 1):
        returns = calculate_returns([
            task_episodes[i_episode].rewards
            for task_episodes in task_to_episodes.values()
        ])
        logger.logkv(f'val_return_avg_adapt{i_episode}',
                     returns.mean().item())
        logger.logkv(f'val_return_std_adapt{i_episode}',
                     returns.std().item())

    logger.logkv('val_time', time.time() - start_time)

    save_dir = os.path.join(args.log_dir, 'val')
    os.makedirs(save_dir, exist_ok=True)
    pickle.dump(task_to_episodes,
                open(os.path.join(save_dir, f'val_{batch}.pkl'), 'wb'))
def test_weighted_mean_side_effect():
    lengths = [2, 3, 7, 5, 11]

    # Inputs
    inputs_np = np.random.rand(13, 5).astype(np.float32)

    # PyTorch (torch.as_tensor shares memory with the NumPy array)
    inputs_th = torch.as_tensor(inputs_np)
    mean_th = weighted_mean(inputs_th, lengths=lengths)

    # weighted_mean zeroes the padded timesteps in place, and the side
    # effect propagates to the NumPy array through the shared memory.
    for i, length in enumerate(lengths):
        assert (inputs_th[length:, i] == 0.).all()
        assert (inputs_np[length:, i] == 0.).all()
def inner_loss(self, episodes, params=None):
    """Compute the inner loss for the one-step gradient update. The inner
    loss is REINFORCE with baseline [2], computed on advantages estimated
    with Generalized Advantage Estimation (GAE, [3]).
    """
    values = self.baseline(episodes)
    advantages = episodes.gae(values, tau=self.tau)
    advantages = weighted_normalize(advantages, weights=episodes.mask)

    pi = self.policy(episodes.observations, params=params)
    log_probs = pi.log_prob(episodes.actions)
    if log_probs.dim() > 2:
        log_probs = torch.sum(log_probs, dim=2)
    loss = -weighted_mean(log_probs * advantages, dim=0)

    return loss
def kl_divergence(self, episodes, old_pis=None):
    kls = []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        mask = valid_episodes.mask
        if valid_episodes.actions.dim() > 2:
            mask = mask.unsqueeze(2)
        kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
        kls.append(kl)

    return torch.mean(torch.stack(kls, dim=0))
def inner_loss(self, episodes, params=None):
    """Compute the inner loss for the one-step gradient update. The inner
    loss is REINFORCE with baseline [2], computed on advantages estimated
    with Generalized Advantage Estimation (GAE, [3]). The baseline is
    subtracted from the empirical return to reduce the variance of the
    optimization. Here the baseline is a linear function of a time-varying
    feature vector.
    """
    values = self.baseline(episodes)
    advantages = episodes.gae(values, tau=self.tau)
    advantages = weighted_normalize(advantages, weights=episodes.mask)

    pi = self.policy(episodes.observations, params=params)
    log_probs = pi.log_prob(episodes.actions)
    if log_probs.dim() > 2:
        log_probs = torch.sum(log_probs, dim=2)
    loss = -weighted_mean(log_probs * advantages,
                          dim=0, weights=episodes.mask)

    return loss
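The "linear function of a time-varying feature vector" in the docstring is not shown in this section. A minimal sketch in the rllab style follows; the feature map (including the /100 time scaling) is an assumption about what `self.baseline` computes, not its verified implementation:

import torch

def linear_baseline_features(episodes):
    # Hypothetical feature map for a linear baseline: observations, their
    # squares, polynomial features of the (scaled) timestep, and a bias.
    ones = episodes.mask.unsqueeze(2)
    observations = episodes.observations
    time_step = (torch.arange(len(episodes), dtype=torch.float32)
                 .view(-1, 1, 1) * ones / 100.0)
    return torch.cat([observations,
                      observations ** 2,
                      time_step,
                      time_step ** 2,
                      time_step ** 3,
                      ones], dim=2)

The baseline values would then come from a (regularized) least-squares fit of the returns on these features, refit once per batch.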
def inner_loss(self, episodes, l_params=None, h_params=None):
    """Compute the inner loss for the one-step gradient update. The inner
    loss is REINFORCE with baseline [2], computed on advantages estimated
    with Generalized Advantage Estimation (GAE, [3]).
    """
    values = self.baseline(episodes)
    advantages = episodes.gae(values, tau=self.tau)
    advantages = weighted_normalize(advantages, weights=episodes.mask)

    # First compute the latent-space actions from the higher-level policy
    # (stored in episodes); the lower-level actions are then derived from
    # the higher-level actions.
    pi_higher = self.h_policy(episodes.observations, params=h_params)

    # Log probability of the higher-level actions
    log_probs = pi_higher.log_prob(episodes.higher_level_actions)
    if log_probs.dim() > 2:
        log_probs = torch.sum(log_probs, dim=2)
    loss = -weighted_mean(log_probs * advantages, dim=0)

    return loss
def inner_loss(self, episodes, params=None):
    """Compute the inner loss for the one-step gradient update. The inner
    loss is REINFORCE with baseline [2], computed on advantages estimated
    with Generalized Advantage Estimation (GAE, [3]).
    https://pytorch.org/docs/0.3.1/distributions.html (except using
    advantages instead of rewards). Implements Eq. 4.
    """
    vf_loss = -1
    loss = 0

    if self.baseline_type == 'linear':
        values = self.baseline(episodes)
    elif self.baseline_type == 'critic separate':
        values = self.baseline(episodes.observations)
        # Value loss: RMS of (R - V(s)); note the hardcoded
        # (horizon, batch) shape of (200, 20)
        R = episodes.returns.view([200, 20, 1])
        vf_loss = (((values - R) ** 2).mean()) ** (1 / 2)

    advantages = episodes.gae(values, tau=self.tau)
    advantages_unnorm = advantages
    sum_adv = torch.sum(advantages_unnorm).numpy()
    logging.info("unnormalized advantages: " + str(sum_adv))
    logging.info("sum of returns:" + str(torch.sum(episodes.returns)))
    advantages = weighted_normalize(advantages, weights=episodes.mask)

    pi = self.policy(episodes.observations, params=params)
    log_probs = pi.log_prob(episodes.actions)
    if log_probs.dim() > 2:
        # Sum over the action dimensions
        log_probs = torch.sum(log_probs, dim=2)
    loss = loss - weighted_mean(log_probs * advantages,
                                dim=0, weights=episodes.mask)
    logging.info("inner loss: " + str(loss))

    return loss, vf_loss
def test_weighted_mean_no_dim():
    lengths = [2, 3, 5, 7, 11]

    # Inputs
    inputs_np = np.random.rand(5, 13).astype(np.float32)
    weights_np = np.zeros((5, 13), dtype=np.float32)
    for i, length in enumerate(lengths):
        inputs_np[i, length:] = 0.
        weights_np[i, :length] = 1.

    # PyTorch
    inputs_th = torch.from_numpy(inputs_np)
    weights_th = torch.from_numpy(weights_np)
    mean_th = weighted_mean(inputs_th, dim=None, weights=weights_th)

    # NumPy reference: weighted mean over all elements
    sum_np, num_np = 0., 0.
    for i in range(5):
        for j in range(13):
            sum_np += inputs_np[i, j] * weights_np[i, j]
            num_np += weights_np[i, j]
    mean_np = sum_np / num_np

    assert mean_th.dim() == 0
    assert np.allclose(mean_th.item(), mean_np)
def reinforce_loss(policy, episodes, init_std=1.0, min_std=1e-6,
                   output_size=2):
    output = policy(episodes.observations.view(
        (-1, *episodes.observation_shape)))

    # Build a diagonal Gaussian head around the policy output. Note that
    # `sigma` is a fresh nn.Parameter created on every call, so the scale
    # stays fixed at init_std and is never trained.
    min_log_std = math.log(min_std)
    sigma = nn.Parameter(torch.Tensor(output_size))
    sigma.data.fill_(math.log(init_std))
    scale = torch.exp(torch.clamp(sigma, min=min_log_std))
    pi = Independent(Normal(loc=output, scale=scale), 1)

    log_probs = pi.log_prob(episodes.actions.view(
        (-1, *episodes.action_shape)))
    log_probs = log_probs.view(len(episodes), episodes.batch_size)

    losses = -weighted_mean(log_probs * episodes.advantages,
                            lengths=episodes.lengths)

    return losses.mean()
def kl_divergence(self, episodes, old_pis=None):
    """In Trust Region Policy Optimization (TRPO, [4]), the heuristic
    approximation which considers the "average" KL divergence is used
    instead of the maximum KL divergence.
    """
    kls = []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)
        pi = self.policy(valid_episodes.observations, params=params)

        if old_pi is None:
            old_pi = detach_distribution(pi)

        mask = valid_episodes.mask
        if valid_episodes.actions.dim() > 2:
            mask = mask.unsqueeze(2)
        kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
        kls.append(kl)

    return torch.mean(torch.stack(kls, dim=0))
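Concretely, the constraint statistic is the state-averaged divergence rather than a maximum over states:

    \bar{D}_{KL} = \mathbb{E}_{s}\big[D_{KL}\big(\pi_\theta(\cdot \mid s) \,\|\, \pi_{\theta_\text{old}}(\cdot \mid s)\big)\big]

which the code estimates with the mask-weighted mean over the valid timesteps of each task, then averages across tasks.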
def test_weighted_mean_multi_dimensional():
    lengths = [2, 3, 7, 5, 11]

    # Inputs
    inputs_np = np.random.rand(13, 5, 17, 19).astype(np.float32)
    for i, length in enumerate(lengths):
        inputs_np[length:, i] = 0.

    # PyTorch
    inputs_th = torch.as_tensor(inputs_np)
    mean_th = weighted_mean(inputs_th, lengths=lengths)

    # NumPy reference: per-sequence mean over the valid timesteps
    mean_np = np.zeros((5, 17, 19), dtype=np.float32)
    for i, length in enumerate(lengths):
        for j in range(13):
            if j < length:
                mean_np[i] += inputs_np[j, i]
        mean_np[i] /= length

    assert mean_th.dim() == 3
    assert mean_th.shape == (5, 17, 19)
    np.testing.assert_allclose(mean_th.detach().numpy(), mean_np)
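weighted_mean itself is not shown in this section. A minimal sketch consistent with the three tests above (the dim/weights call convention, the lengths convention, and the in-place zeroing side effect) could look like the following; the actual upstream implementation may differ in details:

import torch

def weighted_mean(tensor, dim=None, weights=None, lengths=None):
    # weights/dim convention: mean over the entries where weights is nonzero
    if weights is not None:
        if dim is None:
            return torch.sum(tensor * weights) / torch.sum(weights)
        return (torch.sum(tensor * weights, dim=dim)
                / torch.sum(weights, dim=dim))

    # lengths convention: tensor has shape (T, B, ...); sequence i is valid
    # for its first lengths[i] timesteps. Padded timesteps are zeroed
    # *in place* (the side effect the first test checks), then each
    # sequence is averaged over its own length.
    if lengths is None:
        return torch.mean(tensor)
    for i, length in enumerate(lengths):
        tensor[length:, i].fill_(0.)
    extra_dims = tensor.dim() - 2
    lengths_th = torch.as_tensor(lengths, dtype=torch.float32)
    return torch.sum(tensor, dim=0) / lengths_th.view(-1, *([1] * extra_dims))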
def compute_ng_gradient(self, episodes, max_kl=1e-3, cg_iters=20,
                        cg_damping=1e-2, ls_max_steps=10,
                        ls_backtrack_ratio=0.5):
    ng_grads = []
    for train_episodes, valid_episodes in episodes:
        params, step_size, step = self.adapt(train_episodes)

        # Compute grad = \nabla_x J^{lvc}(x) at x = \theta - \eta U(\theta),
        # i.e. at the adapted parameters
        pi = self.policy(valid_episodes.observations, params=params)
        pi_detach = detach_distribution(pi)
        values = self.baseline(valid_episodes)
        advantages = valid_episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages,
                                        weights=valid_episodes.mask)

        log_ratio = (pi.log_prob(valid_episodes.actions)
                     - pi_detach.log_prob(valid_episodes.actions))
        if log_ratio.dim() > 2:
            log_ratio = torch.sum(log_ratio, dim=2)
        ratio = torch.exp(log_ratio)

        loss = -weighted_mean(ratio * advantages,
                              dim=0, weights=valid_episodes.mask)
        # No graph is created here
        ng_grad_0 = torch.autograd.grad(loss, self.policy.parameters())
        ng_grad_0 = parameters_to_vector(ng_grad_0)

        # Compute the inverse of the Fisher matrix at x = \theta times grad,
        # using the conjugate gradient method
        hessian_vector_product = self.hessian_vector_product_ng(
            train_episodes, damping=cg_damping)
        F_inv_grad = conjugate_gradient(hessian_vector_product, ng_grad_0,
                                        cg_iters=cg_iters)

        # Compute ng_grad_1 = \nabla^2 J^{lvc}(x) at x = \theta times
        # F_inv_grad; create the graph for the higher-order derivative
        # self.baseline.fit(train_episodes)
        loss = self.inner_loss(train_episodes)
        grad = torch.autograd.grad(loss, self.policy.parameters(),
                                   create_graph=True)
        grad = parameters_to_vector(grad)
        grad_F_inv_grad = torch.dot(grad, F_inv_grad.detach())
        ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                        self.policy.parameters())
        ng_grad_1 = parameters_to_vector(ng_grad_1)

        # Compute ng_grad_2 = the Jacobian of {F(x) U(\theta)} at
        # x = \theta, times F_inv_grad
        hessian_vector_product = self.hessian_vector_product_ng(
            train_episodes, damping=cg_damping)
        F_U = hessian_vector_product(step)
        ng_grad_2 = torch.autograd.grad(torch.dot(F_U, F_inv_grad.detach()),
                                        self.policy.parameters())
        ng_grad_2 = parameters_to_vector(ng_grad_2)

        ng_grad = ng_grad_0 - step_size * (ng_grad_1 + ng_grad_2)
        ng_grad = parameters_to_vector(ng_grad)
        ng_grads.append(ng_grad.view(len(ng_grad), 1))

    return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
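Reading the comments together, the per-task meta-gradient assembled here appears to be the following; this is a reconstruction from the comments, not a formula stated in the source (U(θ) is the inner natural-gradient step, η = step_size, F the damped Fisher matrix):

    g = g_0 - \eta \Big( \nabla^2_\theta J^{lvc}(\theta)\, F(\theta)^{-1} g_0 \;+\; \frac{\partial}{\partial x}\big[F(x)\, U(\theta)\big]\Big|_{x=\theta}^{\top} F(\theta)^{-1} g_0 \Big), \qquad g_0 = \nabla_x J^{lvc}(x)\Big|_{x = \theta - \eta U(\theta)}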
def surrogate_loss(self, episodes, old_pis=None):
    """Surrogate objective:
        E_r[ SmoothReLU( V_r^{adapted self.policy}
                         - max_{pi in self.policies[0:policy_idx - 1]} V_r^pi ) ]

    V_r^{adapted self.policy} is evaluated on the valid_episodes in
    `episodes`; max_{pi in self.policies[0:policy_idx - 1]} V_r^pi is
    precomputed in self.values_of_optimized_policies.

    :param episodes: [(episodes before adapting, episodes after adapting)
        for task in sampled tasks]
    :param old_pis: dummy parameter inherited from the superclass
    :return: mean of losses, mean of kls, pis
    """
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for episode_index in range(len(episodes)):
        (train_episodes, valid_episodes) = episodes[episode_index]
        old_pi = old_pis[episode_index]

        if self.current_policy_idx == 0:
            dominance_correction = 1
        else:
            difference_from_best_value = (
                total_rewards(valid_episodes.rewards)
                - self.values_of_optimized_policies[episode_index])
            dominance_correction = 1 - 1 / (
                1 + math.exp(difference_from_best_value))

        params = self.adapt(train_episodes)
        with torch.set_grad_enabled(old_pi is None):
            pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -dominance_correction * weighted_mean(
                ratio * advantages, dim=0, weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi),
                               dim=0, weights=mask)
            kls.append(kl)

    if len(losses) == 0 or len(kls) == 0:
        # Signal to the caller that there are no losses, avoiding taking
        # the mean of empty tensors
        return (None, None, pis)
    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)
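The dominance_correction weight can be read as the derivative of the SmoothReLU (softplus) in the stated objective: with Δ = V_r^{adapted} − max_π V_r^π,

    1 - \frac{1}{1 + e^{\Delta}} = \sigma(\Delta) = \frac{d}{d\Delta}\, \log(1 + e^{\Delta})

so scaling each task's policy-gradient term by this factor is the chain rule applied through SmoothReLU(Δ).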
def inner_loss_ppo_noUpdate(self, episodes, first_order, params=None,
                            ent_coef=0, vf_coef=0, nenvs=1):
    """Compute the inner loss for the one-step gradient update. The inner
    loss is PPO with clipped ratio = new_pi / old_pi. The clip range could
    be made adaptive. nenvs = number of workers; nsteps is defined by the
    environment.
    """
    # episodes = [num of steps, num of episodes, obs_space]
    # NOTE: the advantage calculation should eventually move to the critic.
    losses = []
    self.logger.info("cliprange: " + str(self.cliprange)
                     + "; noptepochs: " + str(self.noptepochs)
                     + "; nminibatches: " + str(self.nminibatches)
                     + "; ppo_lr: " + str(self.ppo_lr))

    # Save the old parameters
    old_policy = copy.deepcopy(self.policy)
    old_params = parameters_to_vector(old_policy.parameters())

    # Take mini-batches of sampled examples and do several gradient updates
    nepisodes = episodes.observations.shape[1]
    nsteps = episodes.observations.shape[0]
    nbatch = nenvs * nsteps * nepisodes
    nbatch_train = nbatch // self.nminibatches
    mblossvals = []

    # Flatten the episodes to [steps, observations]
    episodes_flat = BatchEpisodes(batch_size=nbatch)
    i = 0
    for ep in range(nepisodes):
        for step in range(nsteps):
            episodes_flat.append([episodes.observations[step][ep].numpy()],
                                 [episodes.actions[step][ep].numpy()],
                                 [episodes.returns[step][ep].numpy()],
                                 (i,))
            i += 1

    inds = np.arange(nbatch)
    # For the case with a linear baseline
    vf_loss = -1

    for epoch in range(self.noptepochs):
        # Randomize the indices
        # np.random.shuffle(inds)
        mb_vf_loss = torch.zeros(1)
        grad_norm = []

        # 0 to batch_size with batch_train_size step
        for start in range(0, nbatch, nbatch_train):
            mb_obs, mb_returns, mb_masks, mb_actions = [], [], [], []
            mb_episodes = BatchEpisodes(batch_size=nbatch_train)
            end = start + nbatch_train
            mbinds = inds[start:end]

            for i in range(len(mbinds)):
                mb_obs.append(episodes_flat.observations[0][mbinds[i]].numpy())
                mb_returns.append(episodes_flat.returns[0][mbinds[i]].numpy())
                mb_masks.append(episodes_flat.mask[0][mbinds[i]].numpy())
                mb_actions.append(episodes_flat.actions[0][mbinds[i]].numpy())
                mb_episodes.append([mb_obs[i]], [mb_actions[i]],
                                   [mb_returns[i]], (i,))

            if self.baseline_type == 'linear':
                values = self.baseline(mb_episodes)
            elif self.baseline_type == 'critic separate':
                values = self.baseline(mb_episodes.observations)
                # Value loss: sum of (R - V(s))^2
                R = torch.FloatTensor(np.array(mb_returns))
                mb_vf_loss = (((values - R) ** 2).mean()) + mb_vf_loss

            advantages = mb_episodes.gae(values, tau=self.tau)
            advantages_unnorm = advantages
            advantages = weighted_normalize(
                advantages.type(torch.float32),
                weights=torch.ones(1, advantages.shape[1]))

            mb_returns_sum = np.sum(mb_returns)
            self.logger.info("iter: epoch:" + str(epoch) + "; mb:"
                             + str(start / nbatch_train))
            self.logger.info("mb returns: " + str(mb_returns_sum))

            pi = self.policy(mb_episodes.observations)
            log_probs = pi.log_prob(mb_episodes.actions)

            # Reload the old policy
            vector_to_parameters(old_params, old_policy.parameters())
            pi_old = old_policy(mb_episodes.observations)
            log_probs_old = pi_old.log_prob(mb_episodes.actions)
            if log_probs.dim() > 2:
                log_probs_old = torch.sum(log_probs_old, dim=2)
                log_probs = torch.sum(log_probs, dim=2)
            ratio = torch.exp(log_probs - log_probs_old)

            self.logger.info("max pi: ")
            self.logger.info(torch.max(pi.mean))
            for x in ratio[0][:10]:
                if x > 1e5 or x < 1e-5:
                    self.logger.info("ratio too large or too small.")
                    self.logger.info(ratio[0][:10])
            self.logger.info("policy ratio: ")
            self.logger.info(ratio[0][:10])

            # PPO clipped policy-gradient loss
            pg_losses = -advantages * ratio
            pg_losses2 = -advantages * torch.clamp(
                ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
            # Final PG loss
            pg_loss = weighted_mean(torch.max(pg_losses, pg_losses2),
                                    weights=torch.ones(1, advantages.shape[1]))

            self.logger.debug("policy mu weights: ")
            self.logger.debug(self.policy.mu.weight)
            sum_adv = torch.sum(advantages_unnorm).numpy()
            self.logger.info("unnormalized advantages: " + str(sum_adv))

            # Total loss
            loss = pg_loss
            self.logger.info("max_action: " + str(np.max(mb_actions)))
            self.logger.info("max_action index: " + str(np.argmax(mb_actions)))

            # Save the old parameters
            old_params = parameters_to_vector(self.policy.parameters())
            losses.append(loss)

    self.logger.info("inner loss for each mb and epoch: ")
    self.logger.info(mblossvals)

    return torch.mean(torch.stack(losses, dim=0))