def compute_grad(self, episodes):
    ng_grads = []
    for train_episodes, valid_episodes in episodes:
        params_adapt = self.adapt_first_order(train_episodes)
        # self.baseline.fit(valid_episodes)
        loss = self.inner_loss(valid_episodes, params=params_adapt)
        # No create_graph: this gradient is not differentiated again.
        ng_grad_0 = torch.autograd.grad(loss, self.policy.parameters())
        ng_grad_0 = parameters_to_vector(ng_grad_0)
        self.baseline.fit(train_episodes)
        loss = self.inner_loss(train_episodes)
        grad = torch.autograd.grad(loss, self.policy.parameters(),
                                   create_graph=True)
        grad = parameters_to_vector(grad)
        # Hessian-vector product via the grad-of-inner-product trick.
        grad_ng_grad_0 = torch.dot(grad, ng_grad_0)
        ng_grad_1 = torch.autograd.grad(grad_ng_grad_0,
                                        self.policy.parameters())
        ng_grad_1 = parameters_to_vector(ng_grad_1)
        # 0.1 is the (hard-coded) inner step size; ng_grad is already a
        # flat vector, so no further parameters_to_vector call is needed.
        ng_grad = ng_grad_0 - 0.1 * ng_grad_1
        ng_grads.append(ng_grad.view(len(ng_grad), 1))
    return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
def learn(self):
    self.sample_batch()
    # imp_fac should be a 1-D tensor whose size matches a.size(0).
    imp_fac = self.compute_imp_fac()
    self.estimate_value()
    self.A = (self.A - self.A.mean()) / (self.A.std() + 1e-8)
    self.loss = -(imp_fac * self.A).mean() \
        - self.entropy_weight * self.compute_entropy()
    if self.value_type is not None:
        # update value
        for i in range(self.iters_v):
            self.update_value()
    self.policy.zero_grad()
    loss_grad = torch.autograd.grad(
        self.loss, self.policy.parameters(), create_graph=True)
    # loss_grad_vector is a 1-D tensor covering all parameters of self.policy.
    loss_grad_vector = parameters_to_vector([grad for grad in loss_grad])
    # Solve Ax = -g, where A is the Hessian of the KL divergence
    # (conjunction_gradient implements conjugate gradient).
    trpo_grad_direc = self.conjunction_gradient(-loss_grad_vector)
    shs = .5 * torch.sum(
        trpo_grad_direc * self.hessian_vector_product(trpo_grad_direc))
    beta = torch.sqrt(self.max_kl / shs)
    fullstep = trpo_grad_direc * beta
    gdotstepdir = -torch.sum(loss_grad_vector * trpo_grad_direc)
    theta = self.linear_search(
        parameters_to_vector(self.policy.parameters()),
        fullstep, gdotstepdir * beta)
    # update policy
    vector_to_parameters(theta, self.policy.parameters())
    self.learn_step_counter += 1
    self.cur_kl = self.mean_kl_divergence().item()
    self.policy_ent = self.compute_entropy().item()
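# A hedged sketch (added; `compute_imp_fac` is not shown in the original) of
# the importance factor consumed above: the per-sample likelihood ratio
# exp(log pi_new(a|s) - log pi_old(a|s)). All names here are illustrative,
# not the original API.
import torch

def importance_factor(new_log_probs, old_log_probs):
    # detach the behavior-policy log-probs so gradients flow only through
    # the current policy
    return torch.exp(new_log_probs - old_log_probs.detach())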
def adapt(self, episodes, first_order=False):
    """Adapt the parameters of the policy network to a new task, from
    sampled trajectories `episodes`, with a one-step gradient update [1].
    """
    # Fit the baseline to the training episodes
    self.baseline.fit(episodes)
    params = None
    info = AttrDict()
    loss = self.inner_loss(episodes, params)
    info.pre_update_loss = loss.detach().cpu().numpy()
    for _ in range(self.inner_steps):
        # Get the new parameters after a one-step gradient update
        params = self.policy.update_params(
            loss, step_size=self.fast_lr, first_order=first_order,
            params=params)
        # Get the loss on the training episodes
        loss = self.inner_loss(episodes, params)
    info.post_update_loss = loss.detach().cpu().numpy()
    info.weight_change = torch.norm(
        parameters_to_vector(self.policy.parameters())
        - parameters_to_vector(params.values())
    ).detach().cpu().numpy()
    return params, info
def _fisher_vector_product(self, vector_p_with_state_batch):
    """
    To recover x from b = Hx we would need the inverse of H, but inverting
    H is expensive. Instead, approximate the product Hx with a
    Fisher-vector product and pass that along:

        D_KL
        ∇D_KL
        (∇D_KL)^T * x
        ∇((∇D_KL)^T * x)

        Hx = ∇((∇D_KL(new θ | old θ))^T * x)
    """
    (p, s_batch) = vector_p_with_state_batch
    p = p.detach()  # detach() is not in-place; rebind the result
    # The same network is passed twice because the divergence is taken
    # with respect to the current policy.
    kl = kl_divergence(new_actor=self.actor, old_actor=self.actor,
                       s_batch=s_batch)
    kl = kl.mean()
    kl_grad = autograd.grad(kl, self.actor.parameters(), create_graph=True)
    kl_grad = parameters_to_vector(kl_grad)  # check kl_grad == 0
    kl_grad_p = (kl_grad * p).sum()
    kl_hessian_p = autograd.grad(kl_grad_p, self.actor.parameters())
    kl_hessian_p = parameters_to_vector(kl_hessian_p)
    return kl_hessian_p + self.damping_coeff * p
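# A minimal conjugate-gradient sketch (added; the solver itself is not part
# of the original) showing the intended consumer of a Fisher-vector product
# like the one above: solve F x = b from matrix-vector products alone,
# without ever materializing F.
import torch

def conjugate_gradient(fvp, b, n_iters=10, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()        # residual of F x = b at x = 0
    p = r.clone()        # current search direction
    rdotr = torch.dot(r, r)
    for _ in range(n_iters):
        Ap = fvp(p)
        alpha = rdotr / torch.dot(p, Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# e.g. (hypothetical wiring): step_dir = conjugate_gradient(
#     lambda v: self._fisher_vector_product((v, s_batch)), loss_grad)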
def optim_value_lbfgs(self, V_target, inds):
    value = self.value
    value.zero_grad()
    loss_fn = self.loss_func_v

    def V_closure():
        predicted = value(
            self.s[inds],
            other_data=self.other_data[inds]
            if self.other_data is not None else None).squeeze()
        loss = loss_fn(predicted, V_target)
        self.value_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        return loss

    old_params = parameters_to_vector(value.parameters())
    for lr in self.lr * .5 ** np.arange(10):
        optimizer = optim.LBFGS(self.value.parameters(), lr=lr)
        optimizer.step(V_closure)
        current_params = parameters_to_vector(value.parameters())
        if any(np.isnan(current_params.data.cpu().numpy())):
            print("LBFGS optimization diverged. Rolling back update...")
            vector_to_parameters(old_params, value.parameters())
        else:
            return
def step(self, episodes, max_kl=1e-3, cg_iters=10, cg_damping=1e-2,
         ls_max_steps=10, ls_backtrack_ratio=0.5):
    """Meta-optimization step (ie. update of the initial parameters), based
    on Trust Region Policy Optimization (TRPO, [4]).
    """
    old_loss, _, old_pis = self.surrogate_loss(episodes)
    grads = torch.autograd.grad(old_loss, self.policy.parameters())
    grads = parameters_to_vector(grads)
    step = grads / torch.norm(grads)
    # Save the old parameters
    old_params = parameters_to_vector(self.policy.parameters())
    # Line search
    step_size = 1.0
    for _ in range(ls_max_steps):
        vector_to_parameters(old_params - step_size * step,
                             self.policy.parameters())
        loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
        improve = loss - old_loss
        if (improve.item() < 0.0) and (kl.item() < max_kl):
            break
        step_size *= ls_backtrack_ratio
    else:
        vector_to_parameters(old_params, self.policy.parameters())
def compute_ng_gradient_test(self, episodes, max_kl=1e-3, cg_iters=20,
                             cg_damping=1e-2, ls_max_steps=10,
                             ls_backtrack_ratio=0.5):
    ng_grads = []
    for train_episodes, valid_episodes in episodes:
        params_adapt, step_size, _ = self.adapt_ng_test(train_episodes)
        # self.baseline.fit(valid_episodes)
        loss = self.inner_loss_lvc(valid_episodes, params=params_adapt)
        # No create_graph: this gradient is not differentiated again.
        ng_grad_0 = torch.autograd.grad(loss, self.policy.parameters())
        ng_grad_0 = parameters_to_vector(ng_grad_0)
        self.baseline.fit(train_episodes)
        loss = self.inner_loss_lvc(train_episodes)
        grad = torch.autograd.grad(loss, self.policy.parameters(),
                                   create_graph=True)
        grad = parameters_to_vector(grad)
        grad_F_inv_grad = torch.dot(grad, ng_grad_0)
        ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                        self.policy.parameters())
        ng_grad_1 = parameters_to_vector(ng_grad_1)
        # ng_grad is already a flat vector; no further flattening needed.
        ng_grad = ng_grad_0 - step_size * ng_grad_1
        ng_grads.append(ng_grad.view(len(ng_grad), 1))
    return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
def adapt(self, episodes, first_order=False, max_kl=1e-3, cg_iters=20,
          cg_damping=1e-2, ls_max_steps=10, ls_backtrack_ratio=0.5):
    """Adapt the parameters of the policy network to a new task, from
    sampled trajectories `episodes`, with a one-step natural gradient
    update.
    """
    # Fit the baseline to the training episodes
    self.baseline.fit(episodes)
    # Get the loss on the training episodes
    loss = self.inner_loss(episodes)
    # Get the new parameters after a one-step natural gradient update
    grads = torch.autograd.grad(loss, self.policy.parameters())
    grads = parameters_to_vector(grads)
    # Compute the step direction with Conjugate Gradient
    hessian_vector_product = self.hessian_vector_product_ng(
        episodes, damping=cg_damping)
    stepdir = conjugate_gradient(hessian_vector_product, grads,
                                 cg_iters=cg_iters)
    step = stepdir.detach()
    old_params = parameters_to_vector(self.policy.parameters())
    step_size = 1.0e-2
    params = vector_to_named_parameter_like(old_params - step_size * step,
                                            self.policy.named_parameters())
    # TODO check if params is a function of self.policy.parameters()
    return params, step_size, step
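# A hedged reconstruction (added; the helper is referenced above but not
# shown in the original) of what `vector_to_named_parameter_like`
# presumably does: slice a flat vector back into an OrderedDict shaped like
# `named_parameters`, without copying into .data, so the adapted parameters
# remain a differentiable function of the vector.
from collections import OrderedDict

def vector_to_named_parameter_like(vector, named_parameters):
    params = OrderedDict()
    offset = 0
    for name, param in named_parameters:
        numel = param.numel()
        params[name] = vector[offset:offset + numel].view_as(param)
        offset += numel
    return params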
def gradient_ascent_step(self):
    """Makes one update of the policy weights."""
    # get loss
    loss = self.surrogate_function(write_to_log=True)
    # calculate the gradient
    self.policy.optimizer.zero_grad()
    loss.backward(retain_graph=True)
    policy_gradient = parameters_to_vector(
        [v.grad for v in self.policy.parameters()]).squeeze(0)
    assert policy_gradient.nonzero().size()[0] > 0, \
        "Policy gradient is 0. Skipping update."
    # Use the conjugate gradient algorithm to determine the step direction
    # in theta space
    step_direction = self.conjugate_gradient(-policy_gradient.cpu().numpy())
    # Do a line search to determine the step size of theta in the direction
    # of step_direction
    shs = step_direction.dot(
        self.hessian_vector_product(
            Tensor(step_direction)).cpu().numpy().T) / 2
    lm = np.sqrt(shs / self.config.max_kl)
    fullstep = step_direction / lm
    # .item() replaces the deprecated .data[0] indexing
    gdotstepdir = -policy_gradient.dot(Tensor(step_direction)).item()
    theta = self.linesearch(parameters_to_vector(self.policy.parameters()),
                            fullstep, gdotstepdir / lm)
    # Update parameters of the policy model
    if any(np.isnan(theta.data.cpu().numpy())):
        raise Exception("NaN detected. Skipping update...")
    else:
        vector_to_parameters(theta, self.policy.parameters())
    kl_old_new = self.mean_kl_divergence()
    self.logger["kl_change"].append(kl_old_new.item())
def learn_htrpo(self):
    b_t = time.time()
    self.sample_batch()
    self.split_episode()
    # No valid episode is collected
    if self.n_valid_ep == 0:
        return
    self.generate_subgoals()
    if not self.using_original_data:
        self.reset_training_data()
    if self.sampled_goal_num is None or self.sampled_goal_num > 0:
        self.generate_fake_data()
    self.data_preprocess()
    self.other_data = self.goal

    # Optimize Value Estimator
    self.estimate_value()
    if self.value_type is not None:
        # update value
        for i in range(self.iters_v):
            self.update_value()

    # Optimize Policy
    # imp_fac should be a 1-D tensor whose size matches a.size(0).
    # Likelihood Ratio
    # self.estimate_value()
    imp_fac = self.compute_imp_fac()
    if self.value_type:
        # old value estimator
        self.A = self.gamma_discount * self.hratio * self.A
    else:
        self.A = self.gamma_discount * self.A
    # Here mean() and sum() / self.n_traj are equivalent, because the two
    # expressions differ only by a constant coefficient. This coefficient
    # is compensated by the step-size computation in TRPO. In vanilla PG,
    # however, there is no such compensation, so the loss needs to be in
    # the exact form of the equation in the paper.
    self.loss = -(imp_fac * self.A).mean() \
        - self.entropy_weight * self.compute_entropy()
    self.policy.zero_grad()
    loss_grad = torch.autograd.grad(
        self.loss, self.policy.parameters(), create_graph=True)
    # loss_grad_vector is a 1-D tensor covering all parameters of self.policy.
    loss_grad_vector = parameters_to_vector([grad for grad in loss_grad])
    # Solve Ax = -g, where A is the Hessian of the KL divergence
    trpo_grad_direc = self.conjunction_gradient(-loss_grad_vector)
    shs = .5 * torch.sum(
        trpo_grad_direc * self.hessian_vector_product(trpo_grad_direc))
    beta = torch.sqrt(self.max_kl / shs)
    fullstep = trpo_grad_direc * beta
    gdotstepdir = -torch.sum(loss_grad_vector * trpo_grad_direc)
    theta = self.linear_search(
        parameters_to_vector(self.policy.parameters()),
        fullstep, gdotstepdir * beta)
    vector_to_parameters(theta, self.policy.parameters())
    self.learn_step_counter += 1
    self.cur_kl = self.mean_kl_divergence().item()
    self.policy_ent = self.compute_entropy().item()
    self.update_normalizer()
    print("iteration time: {:.4f}".format(time.time() - b_t))
def step(self, H, step_size=1, closure=None):
    # Natural-gradient-style SGD step: precondition the gradient by H^{-1}
    # and rescale the step so it respects the divergence limit.
    loss = None
    if closure is not None:
        loss = closure()
    # collect parameters and their gradients
    params = [p for p in self.param_groups[0]['params']]
    grads = [p.grad for p in params]
    # convert parameters to a vector
    param_vector = parameters_to_vector(params)
    grad_vector = parameters_to_vector(grads)
    # apply rotation / contraction / expansion: solve H x = g
    # (torch.solve is deprecated in favor of torch.linalg.solve in newer
    # PyTorch)
    soln, _ = torch.solve(grad_vector.unsqueeze(1).unsqueeze(0),
                          H.unsqueeze(0))
    scaled_gradient = soln[0].reshape(-1)
    # add the characteristic scaling
    scaling = torch.dot(scaled_gradient, soln.reshape(-1))
    scaled_gradient *= step_size * torch.sqrt(
        self.divergence_limit / (scaling + self.epsilon))
    # check that the scaling is ok before updating parameters
    if scaling > 0.:
        # write the preconditioned gradient back into the .grad tensors
        vector_to_parameters(scaled_gradient, grads)
        # now we can perform the (standard SGD) update
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(
                            d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf
                p.data.add_(-group['lr'], d_p)
    return loss
def compute_ng_gradient(self, episodes, max_kl=1e-3, cg_iters=20,
                        cg_damping=1e-2, ls_max_steps=10,
                        ls_backtrack_ratio=0.5):
    ng_grads = []
    for train_episodes, valid_episodes in episodes:
        params_adapt, step_size, stepdir = self.adapt_ng(
            train_episodes, cg_iters=cg_iters, cg_damping=cg_damping)
        # compute grad = \nabla_x J^{lvc}(x) at x = \theta - \eta U(\theta)
        self.baseline.fit(valid_episodes)
        loss = self.inner_loss_lvc(valid_episodes, params=params_adapt)
        # No create_graph: this gradient is not differentiated again.
        ng_grad_0 = torch.autograd.grad(loss, self.policy.parameters())
        ng_grad_0 = parameters_to_vector(ng_grad_0)
        # compute the inverse of the Fisher matrix at x = \theta times grad,
        # with Conjugate Gradient
        hessian_vector_product = self.hessian_vector_product_ng(
            train_episodes, damping=cg_damping)
        F_inv_grad = conjugate_gradient(hessian_vector_product, ng_grad_0,
                                        cg_iters=cg_iters * 2)
        if self.verbose:
            print(torch.norm(hessian_vector_product(F_inv_grad) - ng_grad_0)
                  / torch.norm(ng_grad_0))
        # compute ng_grad_1 = \nabla^2 J^{lvc}(x) at x = \theta times
        # F_inv_grad; create the graph for the higher-order derivative
        self.baseline.fit(train_episodes)
        loss = self.inner_loss_lvc(train_episodes)
        grad = torch.autograd.grad(loss, self.policy.parameters(),
                                   create_graph=True)
        grad = parameters_to_vector(grad)
        grad_F_inv_grad = torch.dot(grad, F_inv_grad.detach())
        ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                        self.policy.parameters())
        ng_grad_1 = parameters_to_vector(ng_grad_1)
        # compute ng_grad_2 = the Jacobian of {F(x) U(\theta)} at
        # x = \theta times F_inv_grad
        hessian_vector_product = self.hessian_vector_product_ng(
            train_episodes, damping=cg_damping)
        F_U = hessian_vector_product(stepdir)
        ng_grad_2 = torch.autograd.grad(
            torch.dot(F_U, F_inv_grad.detach()), self.policy.parameters())
        ng_grad_2 = parameters_to_vector(ng_grad_2)
        # ng_grad is already a flat vector; no further flattening needed.
        ng_grad = ng_grad_0 - step_size * (ng_grad_1 - ng_grad_2)
        ng_grads.append(ng_grad.view(len(ng_grad), 1))
    return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
def update(self, trajectory: Iterable):
    """
    Updates the current policy given the trajectory of the policy.

    :param trajectory: a list of transition frames from the episode. This
        represents the trajectory of the episode.
    :type trajectory: Iterable
    :return: the loss from this update
    :rtype: float
    """
    if not isinstance(trajectory, Iterable):
        raise ValueError("trajectory must be an Iterable.")

    # Consolidate the states in the trajectory into an array.
    states = np.array(
        [np.asarray(transition.state) for transition in trajectory])

    '''
    Compute the loss as the log-likelihood of the returns.
    '''
    # Calculate the returns.
    returns = self._calculate_returns(trajectory)
    # Calculate the values using the baseline approximator.
    values = torch.Tensor([self._value_fn(state)[0] for state in states])
    # Calculate the advantage using the returns and the values.
    advantages = returns - values
    # Compute the loss of the trajectory.
    logits = torch.stack([
        self._policy.logit(np.asarray(transition.state), transition.action,
                           detach=False) for transition in trajectory
    ]).view(-1)
    loss = (-logits * advantages).mean()

    '''
    Compute the gradient and the natural policy gradient.
    '''
    # Calculate the gradient of the log-likelihood loss (the small constant
    # keeps the gradient away from exactly zero).
    gradient = self._compute_gradient(loss)
    gradient = parameters_to_vector(gradient).detach().numpy() + 1e-5
    # Calculate the natural policy gradient.
    npg = self._compute_npg(gradient, states)

    '''
    Update the policy and the baseline.
    '''
    # The learning rate to apply for the update.
    alpha = np.sqrt(
        np.abs(self.delta
               / (np.dot(gradient.T, npg.detach().numpy()) + 1e-20)))
    # The amount to change the parameters by.
    update = alpha * npg
    # Calculate and set the new parameters of the policy.
    new_params = parameters_to_vector(
        self._policy.get_params(False)) - update
    self._policy.set_params(new_params.detach().numpy())
    # Update the baseline approximator using the cumulative returns.
    self._value_fn.update(states, returns.detach().numpy().reshape(-1, 1))

    # Return the loss from the update.
    return loss.item()
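# Step-size note (added for clarity; standard natural-gradient reasoning):
# with direction npg = F^{-1} g, the second-order KL estimate for a step of
# size alpha is 0.5 * alpha^2 * g^T F^{-1} g = 0.5 * alpha^2 * (g . npg).
# Solving KL = delta gives alpha = sqrt(2 * delta / (g . npg)); the update
# above drops the factor of 2 (and guards with abs and an epsilon), which
# yields a slightly more conservative step.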
def update_params(self, loss, step_size=0.5, first_order=False):
    grads = torch.autograd.grad(
        loss,
        filter(lambda p: p.requires_grad, self.parameters()),
        create_graph=not first_order,
    )
    return (parameters_to_vector(
        filter(lambda p: p.requires_grad, self.parameters()))
        - parameters_to_vector(grads) * step_size)
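# A minimal usage sketch (added; not from the original), assuming the
# function above is available at module level and that parameters_to_vector
# is imported from torch.nn.utils. The returned flat vector holds the
# post-update weights and, with first_order=False, stays differentiable
# w.r.t. the original parameters, which is what an outer (meta) loss needs.
import torch
import torch.nn as nn

toy = nn.Linear(4, 2)  # hypothetical stand-in policy
loss = toy(torch.randn(8, 4)).pow(2).mean()
adapted = update_params(toy, loss, step_size=0.1)
assert adapted.requires_grad  # the inner step is part of the graph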
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        # HESSIAN VEC COMPUTATION
        # vectorize all gradients (this assumes the preceding backward pass
        # ran with create_graph=True, so the grads are differentiable;
        # vectorizing the raw parameters here would make grad_grad equal
        # the noise vector)
        grad_vec = parameters_to_vector([p.grad for p in group['params']])
        # create noise vector
        noise = torch.normal(mean=torch.zeros_like(grad_vec),
                             std=self.noise_factor)
        # compute the product
        grad_product = torch.sum(grad_vec * noise)
        grad_grad = torch.autograd.grad(
            grad_product, group['params'], retain_graph=True)
        # h_v_p = hessian_vec_product
        fisher_vec_prod = torch.cat(
            [g.contiguous().view(-1) for g in grad_grad])
        hessian_vec_prod = fisher_vec_prod + (self.cg_damping * noise)

        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad
            d_p = p.grad.clone().data
            # REST OF SGD STUFF
            if weight_decay != 0:
                d_p.add_(weight_decay, p.data)
            if momentum != 0:
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = \
                        torch.zeros_like(p.data)
                    buf.mul_(momentum).add_(d_p)
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(1 - dampening, d_p)
                if nesterov:
                    d_p = d_p.add(momentum, buf)
                else:
                    d_p = buf
            p.data.add_(-group['lr'], d_p)

        flattened = parameters_to_vector(group['params'])
        flattened.data.add_(group['lr'], hessian_vec_prod.data)
        vector_to_parameters(flattened, group['params'])

    return loss
def optimize(self):
    self.total_steps += self.steps_per_train
    if self.total_steps >= self.learning_start:
        experience_sample = ray.get(
            self.experience_replay.sample.remote(self.batch_size))
        state = torch.cat([
            torch.from_numpy(s.state).cuda().unsqueeze(0)
            for s in experience_sample
        ])
        next_state = torch.cat([
            torch.from_numpy(s.next_state).cuda().unsqueeze(0)
            for s in experience_sample
        ])
        terminal = (torch.tensor([s.terminal for s in experience_sample
                                  ]).cuda().unsqueeze(1))
        reward = (torch.tensor([s.reward for s in experience_sample
                                ]).cuda().unsqueeze(1))
        action = torch.tensor([s.action for s in experience_sample]).cuda()

        # Train value function
        target = (reward + self.gamma * (1 - terminal)
                  * self.target_value_fn(
                      next_state, self.target_policy(next_state))).detach()
        actual = self.online_value_fn(state, action)
        value_fn_loss = self.value_fn_criterion(target, actual)
        value_fn_loss.backward()
        self.value_fn_opt.step()
        self.online_policy.zero_grad()
        self.online_value_fn.zero_grad()

        # Train policy
        policy_loss = -self.online_value_fn(
            state, self.online_policy(state)).mean()
        policy_loss.backward()
        self.policy_opt.step()
        self.online_policy.zero_grad()
        self.online_value_fn.zero_grad()

        # Update target networks (Polyak averaging)
        v_policy = parameters_to_vector(self.online_policy.parameters())
        v_policy_targ = parameters_to_vector(
            self.target_policy.parameters())
        new_v_policy_targ = (self.polyak * v_policy_targ
                             + (1 - self.polyak) * v_policy)
        vector_to_parameters(new_v_policy_targ,
                             self.target_policy.parameters())
        v_value_fn = parameters_to_vector(
            self.online_value_fn.parameters())
        v_value_fn_targ = parameters_to_vector(
            self.target_value_fn.parameters())
        new_v_value_fn_targ = (self.polyak * v_value_fn_targ
                               + (1 - self.polyak) * v_value_fn)
        vector_to_parameters(new_v_value_fn_targ,
                             self.target_value_fn.parameters())
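# A small helper sketch (added; the original inlines this logic above and
# again in the DDPG `learn` further below): Polyak averaging of
# target-network weights via the flat-vector utilities.
import torch
from torch.nn.utils import parameters_to_vector, vector_to_parameters

def polyak_update(online, target, polyak):
    """target <- polyak * target + (1 - polyak) * online."""
    with torch.no_grad():
        v_online = parameters_to_vector(online.parameters())
        v_target = parameters_to_vector(target.parameters())
        vector_to_parameters(
            polyak * v_target + (1 - polyak) * v_online,
            target.parameters())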
def _product(vector):
    kl = self.kl_divergence(episodes, old_pis=None)
    grads = torch.autograd.grad(kl, self.policy.parameters(),
                                create_graph=True)
    flat_grad_kl = parameters_to_vector(grads)
    grad_kl_v = torch.dot(flat_grad_kl, vector)
    grad2s = torch.autograd.grad(grad_kl_v, self.policy.parameters())
    flat_grad2_kl = parameters_to_vector(grad2s)
    return flat_grad2_kl + damping * vector
def learn(self, env, max_iter, batch_size):
    for i_iter in xrange(max_iter):
        s = env.reset()
        self._noise_generator.reset()
        done = False
        add_noise = i_iter * 1.0 / max_iter < self.explore_fraction
        e_reward = 0
        while not done:
            # env.render()
            noise = torch.FloatTensor(
                self._noise_generator.generate()) if add_noise else None
            a = self.act(s, noise=noise)
            s_, r, done, info = env.step(a)
            self._replay_module.add(tuple((s, a, [r], s_, [int(done)])))
            s = s_
            e_reward += r
            if len(self._replay_module) < self.warmup_size:
                continue
            # sample batch transitions
            b_s, b_a, b_r, b_s_, b_d = self._replay_module.sample(batch_size)
            b_s = numpy.vstack(b_s)
            b_a = numpy.vstack(b_a)
            b_s, b_a, b_r, b_d = map(
                lambda ryo: Variable(torch.FloatTensor(ryo)),
                [b_s, b_a, b_r, b_d])
            b_s_ = Variable(torch.FloatTensor(b_s_), volatile=True)
            # update critic
            self._optimizer_critic.zero_grad()
            y = b_r + self.reward_gamma * self._target_critic(
                b_s_, self._target_actor(b_s_)) * (1 - b_d)
            loss = self.loss(self._critic(b_s, b_a), y)
            loss.backward()
            self._optimizer_critic.step()
            # update actor
            self._optimizer_actor.zero_grad()
            loss = -self._critic(
                b_s, self._actor(b_s)).mean()  # dpg, eq6 in [1]
            loss.backward()
            self._optimizer_actor.step()
            # update target networks (Polyak averaging)
            for target, normal in [(self._target_actor, self._actor),
                                   (self._target_critic, self._critic)]:
                target_vec = parameters_to_vector(target.parameters())
                normal_vec = parameters_to_vector(normal.parameters())
                vector_to_parameters(
                    (1 - self.tau) * target_vec + self.tau * normal_vec,
                    target.parameters())
        logger.info('Iter: {}, E_Reward: {}'.format(i_iter,
                                                    round(e_reward, 2)))
def meta_loss(self, mini_batch, mini_batch_valid):
    loss = self.loss(mini_batch)
    params_mdl = self.mdl.update_params(
        loss['model_loss'], step_size=self.fast_lr, first_order=False)
    params_kg = self.kg.update_params(
        loss['model_loss'], step_size=self.fast_lr, first_order=False)
    old_params_mdl = parameters_to_vector(self.mdl.parameters())
    old_params_kg = parameters_to_vector(
        filter(lambda p: p.requires_grad, self.kg.parameters()))
    vector_to_parameters(params_mdl, self.mdl.parameters())
    vector_to_parameters(params_kg,
                         filter(lambda p: p.requires_grad,
                                self.kg.parameters()))
    loss1 = self.loss(mini_batch_valid)
    vector_to_parameters(old_params_mdl, self.mdl.parameters())
    vector_to_parameters(old_params_kg,
                         filter(lambda p: p.requires_grad,
                                self.kg.parameters()))
    return loss1
def step(self, episodes, max_kl=1e-3, cg_iters=10, cg_damping=1e-2,
         ls_max_steps=10, ls_backtrack_ratio=0.5):
    """Meta-optimization step (ie. update of the initial parameters), based
    on Trust Region Policy Optimization (TRPO, [4]).
    """
    old_loss, _, old_pis = self.surrogate_loss(episodes)
    print('old_loss: ', old_loss)
    # even when old_loss is tiny (~1e-8), the gradients need not be small
    grads = torch.autograd.grad(old_loss, self.policy.parameters())
    grads = parameters_to_vector(grads)
    print('grads: ', grads)
    # Compute the step direction with Conjugate Gradient
    hessian_vector_product = self.hessian_vector_product(
        episodes, damping=cg_damping)
    stepdir = conjugate_gradient(hessian_vector_product, grads,
                                 cg_iters=cg_iters)
    # Compute the Lagrange multiplier
    shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))  # s^T H s
    lagrange_multiplier = torch.sqrt(shs / max_kl)
    # Does this neglect the difference between pi and old_pi? The step is
    # only calculated once, with all ratios equal to 1.
    step = stepdir / lagrange_multiplier
    print('step: ', step)
    # Save the old parameters
    old_params = parameters_to_vector(self.policy.parameters())
    # Line search
    step_size = 1.0
    for _ in range(ls_max_steps):
        # assign values to the policy network parameters;
        # step is fixed during the line search
        vector_to_parameters(old_params - step_size * step,
                             self.policy.parameters())
        # print('oldpis: ', old_pis)
        loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
        improve = loss - old_loss
        if (improve.item() < 0.0) and (kl.item() < max_kl):
            break
        step_size *= ls_backtrack_ratio
    else:
        vector_to_parameters(old_params, self.policy.parameters())
def _product(vector):
    kl = self.kl_divergence(episodes)
    grads = torch.autograd.grad(kl, self.policy.parameters(),
                                create_graph=True)
    flat_grad_kl = parameters_to_vector(grads)
    grad_kl_v = torch.dot(flat_grad_kl, vector)
    grad2s = torch.autograd.grad(grad_kl_v, self.policy.parameters())
    # make each second-order gradient contiguous before flattening
    grad2s = tuple(item.contiguous() for item in grad2s)
    flat_grad2_kl = parameters_to_vector(grad2s)
    return flat_grad2_kl + damping * vector
def _product(vector):
    kl = self.kl_divergence(episodes, inner_losses)
    grads = torch.autograd.grad(kl, self.parameters(), retain_graph=True,
                                create_graph=True)
    flat_grad_kl = parameters_to_vector(grads)
    grad_kl_v = torch.dot(flat_grad_kl, vector)
    grad2s = torch.autograd.grad(grad_kl_v, self.parameters(),
                                 retain_graph=True)
    flat_grad2_kl = parameters_to_vector(grad2s)
    return flat_grad2_kl + damping * vector
def step(self, episodes, max_kl=1e-3, cg_iters=10, cg_damping=1e-2,
         ls_max_steps=10, ls_backtrack_ratio=0.5):
    """Meta-optimization step (ie. update of the initial parameters), based
    on Trust Region Policy Optimization (TRPO, [4]).
    """
    old_loss, _, old_pis = self.surrogate_loss(episodes)
    if old_loss is None:
        # nothing needs to be done
        return
    grads = torch.autograd.grad(old_loss, self.policy.parameters())
    grads = parameters_to_vector(grads)
    # Compute the step direction with Conjugate Gradient
    hessian_vector_product = self.hessian_vector_product(
        episodes, damping=cg_damping)
    stepdir = conjugate_gradient(hessian_vector_product, grads,
                                 cg_iters=cg_iters)
    # Compute the Lagrange multiplier
    shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))
    lagrange_multiplier = torch.sqrt(shs / max_kl)
    step = stepdir / lagrange_multiplier
    # Save the old parameters
    old_params = parameters_to_vector(self.policy.parameters())
    # Line search
    step_size = 1.0
    for _ in range(ls_max_steps):
        vector_to_parameters(old_params - step_size * step,
                             self.policy.parameters())
        loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
        improve = loss - old_loss
        # accept if the new loss is smaller and the KL divergence is small
        # enough (so the new policy is not too far away)
        if (improve.item() < 0.0) and (kl.item() < max_kl):
            break
        step_size *= ls_backtrack_ratio
    else:
        vector_to_parameters(old_params, self.policy.parameters())
def step(self, episodes, max_kl=1e-3, cg_iters=10, cg_damping=1e-2,
         ls_max_steps=10, ls_backtrack_ratio=0.5):
    """Meta-optimization step (ie. update of the initial parameters), based
    on Trust Region Policy Optimization (TRPO, [4]).
    """
    old_loss, _, old_pis = self.surrogate_loss(episodes)
    grads = torch.autograd.grad(old_loss, self.policy.parameters())
    grads = parameters_to_vector(grads)
    # Compute the step direction with Conjugate Gradient
    hessian_vector_product = self.hessian_vector_product(
        episodes, damping=cg_damping)
    stepdir = conjugate_gradient(hessian_vector_product, grads,
                                 cg_iters=cg_iters)
    # Compute the Lagrange multiplier
    shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))
    lagrange_multiplier = torch.sqrt(shs / max_kl)
    step = stepdir / lagrange_multiplier
    # Save the old parameters
    old_params = parameters_to_vector(self.policy.parameters())
    # Line search (note the initial step size of 2.0 here)
    step_size = 2.0
    for _ in range(ls_max_steps):
        vector_to_parameters(old_params - step_size * step,
                             self.policy.parameters())
        loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
        improve = loss - old_loss
        # if improve.item() < 0.0:
        if (improve.item() < 0.0) and (kl.item() < max_kl):
            print("New Actor surrogate_loss: ", loss)
            break
        step_size *= ls_backtrack_ratio
    else:
        print("same actor~~~~")
        vector_to_parameters(old_params, self.policy.parameters())
    if self.policy.paramsFlag == OrderedDict(
            self.policy.named_parameters()):
        print("really same~~~~~~~~")
def optimize(self):
    # Return if no completed episodes
    if len(self.buffers['completed_rewards']) == 0:
        return
    # Convert all buffers to tensors
    num_batch_steps = len(self.buffers['completed_rewards'])
    rewards = torch.tensor(self.buffers['completed_rewards'])
    actions = torch.stack(self.buffers['actions'][:num_batch_steps])
    states = torch.stack(self.buffers['states'][:num_batch_steps])
    log_probs = torch.stack(self.buffers['log_probs'][:num_batch_steps])
    rewards, actions, states, log_probs = (rewards.to(self.device),
                                           actions.to(self.device),
                                           states.to(self.device),
                                           log_probs.to(self.device))
    # Normalize rewards over episodes
    rewards = (rewards - rewards.mean()) / rewards.std()
    # Save current parameters
    self.optim.zero_grad()
    old_policy_param = parameters_to_vector(
        [param for param in self.policy.parameters()]).detach().clone()
    old_std_param = self.logstd.detach().clone()
    # Compute regular gradient and step
    (-log_probs * rewards.view(-1, 1)).mean().backward()
    self.optim.step()
    # Find the search direction taken by Adam
    new_policy_param = parameters_to_vector(
        [param for param in self.policy.parameters()]).detach()
    policy_gradients = new_policy_param - old_policy_param
    std_gradients = self.logstd.detach() - old_std_param
    # Restore old policy
    vector_to_parameters(old_policy_param, self.policy.parameters())
    with torch.no_grad():
        self.logstd[:] = old_std_param
    # Find the new policy and std with a line search along the Adam step
    self.line_search(policy_gradients, std_gradients, states, actions,
                     log_probs, rewards)
    # Update buffers, removing processed steps
    for key, storage in self.buffers.items():
        if key != 'episode_reward':
            del storage[:num_batch_steps]
def compute_preconditioner(self, policy, state_tensor, action_tensor,
                           current_loss):
    # """ CHECK IF THE CURRENT LOSS USES A REPLAY BUFFER """
    # if current_loss.include_buffer and current_loss.buffer_init:
    #     # if so, add in the buffer states to the precompute stuff
    #     state_tensor = torch.cat([state_tensor.float(),
    #                               current_loss.replay_buffer.buffer_states])
    #     action_tensor = torch.cat([action_tensor.float(),
    #                                current_loss.replay_buffer.buffer_actions])
    """ CONVERT FORMAT """
    flat_states = torch.flatten(state_tensor, start_dim=0, end_dim=1)
    flat_actions = torch.flatten(action_tensor, start_dim=0, end_dim=1)
    """ COMPUTE FIRST STEP """
    # create copy
    policy_copy = copy.deepcopy(policy)
    # evaluate loss
    score = policy_copy(flat_states[0, :], flat_actions[0, :])
    # step
    score.backward()
    # get gradients
    grad_i = parameters_to_vector(
        [p.grad for p in policy_copy.parameters()])
    # take outer product
    H = torch.ger(grad_i, grad_i)
    # delete copy
    del policy_copy
    """ STEP THROUGH DATA AND BUILD FISHER INFO """
    for i in range(1, action_tensor.size()[0]):
        # create copy
        policy_copy = copy.deepcopy(policy)
        # zero the parameter gradients
        policy_copy.zero_grad()
        # evaluate loss
        score = policy_copy(flat_states[i, :], flat_actions[i, :])
        # step
        score.backward()
        # get gradients
        grad_i = parameters_to_vector(
            [p.grad for p in policy_copy.parameters()])
        # accumulate the outer products (the division by the sample count
        # below implies a running sum, not an overwrite)
        H = H + torch.ger(grad_i, grad_i)
        # delete copy
        del policy_copy
    """ STABILIZE FOR USE LATER """
    preconditioner = H / action_tensor.size()[0]
    preconditioner += torch.tensor(1e-4) * torch.eye(
        preconditioner.size()[0])
    """ RETURN THE PRECONDITIONED MATRIX """
    return preconditioner
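# A usage sketch (added; an assumption about how the preconditioner above is
# consumed): with F approximating the Fisher information, a natural-gradient
# direction comes from solving F x = g for the flat policy gradient g. The
# names `agent` and `policy` are illustrative.
#
#     F = agent.compute_preconditioner(policy, states, actions, loss)
#     g = parameters_to_vector([p.grad for p in policy.parameters()])
#     natural_grad = torch.linalg.solve(F, g)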
def line_search(self, gradients, states, actions, log_probs, rewards):
    step_size = (2 * self.kl_delta / gradients.dot(
        self.fisher_vector_direct(gradients, states))).sqrt()
    step_size_decay = 1.5
    line_search_attempts = 10

    # New policy
    current_parameters = parameters_to_vector(self.policy.parameters())
    new_policy = deepcopy(self.policy)
    vector_to_parameters(current_parameters + step_size * gradients,
                         new_policy.parameters())
    new_std = self.logstd.detach() + step_size * self.logstd.grad

    # Shrink the step until the KL constraint is met and the objective improves
    for attempt in range(line_search_attempts):
        # Obtain KL divergence and objective
        with torch.no_grad():
            kl_value = self.kl(new_policy, new_std, states)
            objective = self.surrogate_objective(new_policy, new_std,
                                                 states, actions,
                                                 log_probs, rewards)
        # Shrink the step if the KL constraint is not met or the reward is lower
        if kl_value > self.kl_delta or objective < 0:
            step_size /= step_size_decay
            vector_to_parameters(
                current_parameters + step_size * gradients,
                new_policy.parameters())
            new_std = self.logstd.detach() + step_size * self.logstd.grad
        # Return the new policy and std if the KL and reward criteria are met
        else:
            return new_policy, new_std.requires_grad_()

    # Return the old policy and std if the constraints are never met
    return self.policy, self.logstd
def train_mt(params):
    env_fun, iters, animate, camera, model = params
    env = env_fun(animate=False, camera=camera)
    obs_dim, act_dim = env.obs_dim, env.act_dim
    policy = NN(obs_dim, act_dim).float()
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.5)
    print("Env: {} Action space: {}, observation space: {}, N_params: {}, "
          "comments: ...".format("Ant_reach", act_dim, obs_dim, len(w)))
    sims = [mujoco_py.MjSim(model) for _ in range(es.popsize)]
    policies = [policy] * es.popsize
    ctr = 0
    try:
        while not es.stop():
            ctr += 1
            if ctr > iters:
                break
            if ctr % 1000 == 0:
                sdir = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "agents/{}.p".format(env_fun.__name__))
                vector_to_parameters(
                    torch.from_numpy(es.result.xbest).float(),
                    policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint")
            X = es.ask()
            output = mp.Queue()
            processes = []
            for i, ef, sim, policy, x in zip(range(es.popsize),
                                             [env_fun] * es.popsize,
                                             sims, policies, X):
                processes.append(
                    mp.Process(target=f_mp,
                               args=(i, ef, sim, policy, x, output)))
            # Run processes
            for p in processes:
                p.start()
            # Exit the completed processes
            for p in processes:
                p.join()
            evals = [output.get() for _ in processes]
            evals.sort(key=lambda x: x[0])
            evals = [ev[1] for ev in evals]
            es.tell(X, evals)
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")
    return es.result.fbest
def __init__(self, policy_params, trained_weights=None):
    Policy.__init__(self, policy_params)
    self.net = MLP_probs(self.ob_dim, self.ac_dim)
    # lin_policy = np.load('/home/harshit/work/ARS/trained_policies/Policy_Testerbi2/bi_policy_num_plus149.npz')
    # lin_policy = lin_policy.items()[0][1]
    self.weights = parameters_to_vector(
        self.net.parameters()).detach().double().numpy()
    if trained_weights is not None:
        self.net.load_state_dict(torch.load(trained_weights))
        # vector_to_parameters(torch.tensor(trained_weights),
        #                      self.net.parameters())
        self.weights = parameters_to_vector(
            self.net.parameters()).detach().double().numpy()
def train(params):
    env_fun, iters, animate, camera, _ = params
    env = env_fun(animate=animate, camera=camera)
    obs_dim, act_dim = env.obs_dim, env.act_dim
    policy = NN(obs_dim, act_dim).float()
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.5)
    f = f_wrapper(env, policy, animate)
    print("Env: {} Action space: {}, observation space: {}, N_params: {}, "
          "comments: ...".format(env_fun.__name__, act_dim, obs_dim, len(w)))
    it = 0
    try:
        while not es.stop():
            it += 1
            if it > iters:
                break
            if it % 1000 == 0:
                sdir = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "agents/{}.p".format(env_fun.__name__))
                vector_to_parameters(
                    torch.from_numpy(es.result.xbest).float(),
                    policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint")
            X = es.ask()
            es.tell(X, [f(x) for x in X])
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")
    return es.result.fbest
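# A hedged sketch (added; `f_wrapper` is called above but not shown) of the
# fitness closure CMA-ES minimizes: load a candidate weight vector into the
# policy, roll out one episode, and return the negative episode return. The
# gym-style env interface is an assumption.
import torch
from torch.nn.utils import vector_to_parameters

def f_wrapper(env, policy, animate=False):
    def f(w):
        vector_to_parameters(torch.from_numpy(w).float(),
                             policy.parameters())
        obs = env.reset()
        done, total_reward = False, 0.0
        while not done:
            if animate:
                env.render()
            with torch.no_grad():
                action = policy(torch.from_numpy(
                    obs.astype('float32'))).numpy()
            obs, reward, done, _ = env.step(action)
            total_reward += reward
        return -total_reward  # CMA-ES minimizes, so negate the return
    return f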