def f(w):
    reward_total = 0
    reps = 1
    vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())
    for i in range(reps):
        reward = 0
        done = False
        obs = env.reset()
        h_0 = policy.init_hidden()
        while not done:
            # Get action from policy
            with torch.no_grad():
                act, h_1 = policy((my_utils.to_tensor(obs, True), h_0))
            # Step environment
            act = act.squeeze(0).numpy()
            obs, rew, done, _ = env.step(act)
            if animate:
                env.render()
            reward += rew
            h_0 = h_1
        reward_total += reward
    return -reward_total / reps
def gradient_ascent_step(self):
    """Makes one update of the policy weights."""
    # Get the surrogate loss
    loss = self.surrogate_function(write_to_log=True)

    # Calculate the policy gradient
    self.policy.optimizer.zero_grad()
    loss.backward(retain_graph=True)
    policy_gradient = parameters_to_vector(
        [v.grad for v in self.policy.parameters()]).squeeze(0)

    assert policy_gradient.nonzero().size()[0] > 0, "Policy gradient is 0. Skipping update."

    # Use the conjugate gradient algorithm to determine the step direction in theta space
    step_direction = self.conjugate_gradient(-policy_gradient.cpu().numpy())

    # Do a line search to determine the step size of theta along step_direction
    shs = step_direction.dot(
        self.hessian_vector_product(Tensor(step_direction)).cpu().numpy().T) / 2
    lm = np.sqrt(shs / self.config.max_kl)
    fullstep = step_direction / lm
    gdotstepdir = -policy_gradient.dot(Tensor(step_direction)).data[0]
    theta = self.linesearch(parameters_to_vector(self.policy.parameters()),
                            fullstep, gdotstepdir / lm)

    # Update the parameters of the policy model
    if any(np.isnan(theta.data.cpu().numpy())):
        raise Exception("NaN detected. Skipping update...")
    else:
        vector_to_parameters(theta, self.policy.parameters())

    kl_old_new = self.mean_kl_divergence()
    self.logger["kl_change"].append(kl_old_new.item())
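# A minimal, self-contained sketch (an assumption, not the repository's own
# implementation) of the conjugate-gradient solver that a step like
# gradient_ascent_step() relies on via self.conjugate_gradient: it approximately
# solves H x = b given only a Hessian-vector product callable `hvp`.
import torch

def conjugate_gradient(hvp, b, n_iters=10, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()            # residual b - H x, with x = 0
    p = b.clone()            # search direction
    rdotr = torch.dot(r, r)
    for _ in range(n_iters):
        Hp = hvp(p)
        alpha = rdotr / torch.dot(p, Hp)
        x += alpha * p
        r -= alpha * Hp
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x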
def line_search(self, gradients, states, actions, log_probs, rewards):
    step_size = (2 * self.kl_delta / gradients.dot(
        self.fisher_vector_direct(gradients, states))).sqrt()
    step_size_decay = 1.5
    line_search_attempts = 10

    # New policy
    current_parameters = parameters_to_vector(self.policy.parameters())
    new_policy = deepcopy(self.policy)
    vector_to_parameters(current_parameters + step_size * gradients,
                         new_policy.parameters())
    new_std = self.logstd.detach() + step_size * self.logstd.grad

    # Shrink the step until the KL constraint is met and the objective improves
    for attempt in range(line_search_attempts):
        # Obtain the KL divergence and the surrogate objective
        with torch.no_grad():
            kl_value = self.kl(new_policy, new_std, states)
            objective = self.surrogate_objective(new_policy, new_std, states,
                                                 actions, log_probs, rewards)

        # Shrink the step if the KL constraint is violated or the objective got worse
        if kl_value > self.kl_delta or objective < 0:
            step_size /= step_size_decay
            vector_to_parameters(current_parameters + step_size * gradients,
                                 new_policy.parameters())
            new_std = self.logstd.detach() + step_size * self.logstd.grad
        # Return the new policy and std if both the KL and reward conditions are met
        else:
            return new_policy, new_std.requires_grad_()

    # Return the old policy and std if the constraints were never met
    return self.policy, self.logstd
def train(env, policy, config):
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, config["cma_std"])
    f = f_wrapper(env, policy)

    sdir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        f'agents/{config["session_ID"]}_ES_policy.p')
    print(f'N_params: {len(w)}')

    it = 0
    try:
        while not es.stop():
            it += 1
            if it > config["iters"]:
                break
            X = es.ask()
            es.tell(X, [f(x) for x in X])
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")

    vector_to_parameters(torch.from_numpy(es.result.xbest).float(),
                         policy.parameters())
    T.save(policy.state_dict(), sdir)
    print("Saved agent, {}".format(sdir))

    return es.result.fbest
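# Hedged sketch of the closure pattern the CMA-ES training loops in this
# collection assume: f_wrapper binds the environment and policy and returns a
# fitness function f(w) like the ones shown elsewhere here. The exact signature
# and policy interface are assumptions inferred from the call sites.
import torch
from torch.nn.utils import vector_to_parameters

def f_wrapper(env, policy, animate=False):
    def f(w):
        # Load the flat candidate vector w into the policy parameters
        vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())
        obs, reward, done = env.reset(), 0.0, False
        while not done:
            with torch.no_grad():
                act = policy(torch.tensor(obs, dtype=torch.float32).unsqueeze(0))
            obs, rew, done, _ = env.step(act.squeeze(0).numpy())
            if animate:
                env.render()
            reward += rew
        return -reward  # CMA-ES minimizes, so return the negated episode return
    return f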
def surrogate_loss(self, theta):
    """Returns the surrogate loss w.r.t. the given parameter vector theta."""
    theta = theta.detach()
    new_model = copy.deepcopy(self.model)
    vector_to_parameters(theta, new_model.policy_parameters())

    if self.continuous:
        mean_new, std_new, _ = new_model(self.obs)
        mean_old, std_old, _ = self.model(self.obs)
        dis_new = Normal(mean_new, std_new)
        dis_old = Normal(mean_old, std_old)
        log_prob_new = dis_new.log_prob(self.acts).sum(-1, keepdim=True)
        log_prob_old = dis_old.log_prob(self.acts).sum(-1, keepdim=True)
        ratio = torch.exp(log_prob_new - log_prob_old).detach()
    else:
        probs_new, _ = new_model(self.obs)
        probs_old, _ = self.model(self.obs)
        dis_new = F.softmax(probs_new, dim=1)
        dis_old = F.softmax(probs_old, dim=1)
        probs_new = dis_new.gather(1, self.acts).detach()
        probs_old = dis_old.gather(1, self.acts).detach() + 1e-8
        ratio = probs_new / probs_old

    return -torch.mean(ratio * self.advs)
def learn_htrpo(self):
    b_t = time.time()
    self.sample_batch()
    self.split_episode()

    # No valid episode was collected
    if self.n_valid_ep == 0:
        return

    self.generate_subgoals()
    if not self.using_original_data:
        self.reset_training_data()
    if self.sampled_goal_num is None or self.sampled_goal_num > 0:
        self.generate_fake_data()
    self.data_preprocess()
    self.other_data = self.goal

    # Optimize the value estimator
    self.estimate_value()
    if self.value_type is not None:
        for i in range(self.iters_v):
            self.update_value()

    # Optimize the policy
    # imp_fac (the likelihood ratio) should be a 1-D Variable or Tensor with
    # the same length as a.size(0)
    imp_fac = self.compute_imp_fac()
    if self.value_type:
        # old value estimator
        self.A = self.gamma_discount * self.hratio * self.A
    else:
        self.A = self.gamma_discount * self.A
    # Here mean() and sum() / self.n_traj are equivalent, because the two
    # expressions differ only by a constant factor. That factor is compensated
    # by the step-size computation in TRPO. In vanilla PG there is no such
    # compensation, so the loss must be in the exact form of the equation in
    # the paper.
    self.loss = -(imp_fac * self.A).mean() - self.entropy_weight * self.compute_entropy()

    self.policy.zero_grad()
    loss_grad = torch.autograd.grad(self.loss, self.policy.parameters(), create_graph=True)
    # loss_grad_vector is a 1-D Variable spanning all parameters of self.policy
    loss_grad_vector = parameters_to_vector([grad for grad in loss_grad])
    # Solve Ax = -g, where A is the Hessian matrix of the KL divergence
    trpo_grad_direc = self.conjunction_gradient(-loss_grad_vector)
    shs = .5 * torch.sum(trpo_grad_direc * self.hessian_vector_product(trpo_grad_direc))
    beta = torch.sqrt(self.max_kl / shs)
    fullstep = trpo_grad_direc * beta
    gdotstepdir = -torch.sum(loss_grad_vector * trpo_grad_direc)
    theta = self.linear_search(parameters_to_vector(self.policy.parameters()),
                               fullstep, gdotstepdir * beta)
    vector_to_parameters(theta, self.policy.parameters())

    self.learn_step_counter += 1
    self.cur_kl = self.mean_kl_divergence().item()
    self.policy_ent = self.compute_entropy().item()
    self.update_normalizer()
    print("iteration time: {:.4f}".format(time.time() - b_t))
def train(params):
    env_fun, iters, animate, camera, _ = params
    env = env_fun(animate=animate, camera=camera)
    obs_dim, act_dim = env.obs_dim, env.act_dim

    policy = NN(obs_dim, act_dim).float()
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.5)
    f = f_wrapper(env, policy, animate)

    print("Env: {} Action space: {}, observation space: {}, N_params: {}, comments: ..."
          .format(env_fun.__name__, act_dim, obs_dim, len(w)))

    it = 0
    try:
        while not es.stop():
            it += 1
            if it > iters:
                break
            if it % 1000 == 0:
                sdir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                    "agents/{}.p".format(env_fun.__name__))
                vector_to_parameters(torch.from_numpy(es.result.xbest).float(),
                                     policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint")
            X = es.ask()
            es.tell(X, [f(x) for x in X])
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")

    return es.result.fbest
def train_mt(params):
    env_fun, iters, animate, camera, model = params
    env = env_fun(animate=False, camera=camera)
    obs_dim, act_dim = env.obs_dim, env.act_dim

    policy = NN(obs_dim, act_dim).float()
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.5)

    print("Env: {} Action space: {}, observation space: {}, N_params: {}, comments: ..."
          .format("Ant_reach", act_dim, obs_dim, len(w)))

    sims = [mujoco_py.MjSim(model) for _ in range(es.popsize)]
    policies = [policy] * es.popsize

    ctr = 0
    try:
        while not es.stop():
            ctr += 1
            if ctr > iters:
                break
            if ctr % 1000 == 0:
                sdir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                    "agents/{}.p".format(env_fun.__name__))
                vector_to_parameters(torch.from_numpy(es.result.xbest).float(),
                                     policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint")

            X = es.ask()

            output = mp.Queue()
            processes = []
            for i, ef, sim, policy, x in zip(range(es.popsize),
                                             [env_fun] * es.popsize,
                                             sims, policies, X):
                processes.append(
                    mp.Process(target=f_mp, args=(i, ef, sim, policy, x, output)))

            # Start the worker processes
            for p in processes:
                p.start()

            # Wait for all workers to finish
            for p in processes:
                p.join()

            # Collect the evaluations and order them by candidate index
            evals = [output.get() for _ in processes]
            evals.sort(key=lambda x: x[0])
            evals = [ev[1] for ev in evals]

            es.tell(X, evals)
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")

    return es.result.fbest
def linesearch(self, x, fullstep, expected_improve_rate):
    """
    Returns the parameter vector found by a backtracking line search.
    input: x - Tensor, 1D, current parameters
    input: fullstep - Tensor, 1D, step direction (natural gradient), scaled to the trust region
    input: expected_improve_rate - scalar, expected improvement of the surrogate for the
           full step; it is multiplied by the current step fraction in the acceptance test
    output: new parameters - Tensor, 1D
    """
    accept_ratio = 0.1
    max_backtracks = 10

    with torch.no_grad():
        fval = self.surrogate_function().mean()

    for (_n_backtracks, stepfrac) in enumerate(0.5 ** np.arange(max_backtracks)):
        xnew = x.data.cpu().numpy() + stepfrac * fullstep
        vector_to_parameters(Tensor(xnew), self.policy.parameters())
        with torch.no_grad():
            newfval = self.surrogate_function().mean()
        actual_improve = fval - newfval
        expected_improve = expected_improve_rate * stepfrac
        ratio = actual_improve / expected_improve
        if ratio > accept_ratio and actual_improve > 0:
            self.logger["acceptance_ratio"].append(ratio)
            self.logger["expected_improvement"].append(expected_improve_rate)
            return Tensor(xnew)

    raise Exception("Line search failed to find an acceptable step")
def learn(self):
    self.sample_batch()
    # imp_fac should be a 1-D Variable or Tensor with the same length as a.size(0)
    imp_fac = self.compute_imp_fac()
    self.estimate_value()
    self.A = (self.A - self.A.mean()) / (self.A.std() + 1e-8)
    self.loss = -(imp_fac * self.A).mean() - self.entropy_weight * self.compute_entropy()

    if self.value_type is not None:
        # Update the value estimator
        for i in range(self.iters_v):
            self.update_value()

    self.policy.zero_grad()
    loss_grad = torch.autograd.grad(self.loss, self.policy.parameters(), create_graph=True)
    # loss_grad_vector is a 1-D Variable spanning all parameters of self.policy
    loss_grad_vector = parameters_to_vector([grad for grad in loss_grad])
    # Solve Ax = -g, where A is the Hessian matrix of the KL divergence
    trpo_grad_direc = self.conjunction_gradient(-loss_grad_vector)
    shs = .5 * torch.sum(trpo_grad_direc * self.hessian_vector_product(trpo_grad_direc))
    beta = torch.sqrt(self.max_kl / shs)
    fullstep = trpo_grad_direc * beta
    gdotstepdir = -torch.sum(loss_grad_vector * trpo_grad_direc)
    theta = self.linear_search(parameters_to_vector(self.policy.parameters()),
                               fullstep, gdotstepdir * beta)

    # Update the policy
    vector_to_parameters(theta, self.policy.parameters())
    self.learn_step_counter += 1
    self.cur_kl = self.mean_kl_divergence().item()
    self.policy_ent = self.compute_entropy().item()
def step(self, episodes, max_kl=1e-3, cg_iters=10, cg_damping=1e-2,
         ls_max_steps=10, ls_backtrack_ratio=0.5):
    """Meta-optimization step (i.e. update of the initial parameters), based on
    Trust Region Policy Optimization (TRPO, [4]).
    """
    old_loss, _, old_pis = self.surrogate_loss(episodes)
    grads = torch.autograd.grad(old_loss, self.policy.parameters())
    grads = parameters_to_vector(grads)
    step = grads / torch.norm(grads)

    # Save the old parameters
    old_params = parameters_to_vector(self.policy.parameters())

    # Line search
    step_size = 1.0
    for _ in range(ls_max_steps):
        vector_to_parameters(old_params - step_size * step, self.policy.parameters())
        loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
        improve = loss - old_loss
        if (improve.item() < 0.0) and (kl.item() < max_kl):
            break
        step_size *= ls_backtrack_ratio
    else:
        vector_to_parameters(old_params, self.policy.parameters())
def f(w):
    rewards = []
    done = False
    obs, _ = env.reset()
    vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())

    while not done:
        # Get action from policy
        with torch.no_grad():
            act = policy(my_utils.to_tensor(obs, True))

        # Step environment
        obs, rew, done, od = env.step(act.squeeze(0).numpy())
        if animate:
            env.render()

        rewards.append(od['rV'])

    r = 0
    for rew in rewards:
        rew_arr = np.array(rew)
        r += rew_arr.sum() - np.abs(rew_arr - rew_arr.mean()).mean()
    return -r
def get_loss(self, theta, b_s, b_a, advantage):
    # Surrogate loss: likelihood ratio of the new policy to the old policy,
    # weighted by the advantage
    prob_old = self._policy(b_s).gather(1, b_a).data
    new_model = copy.deepcopy(self._policy)
    vector_to_parameters(theta, new_model.parameters())
    prob_new = new_model(b_s).gather(1, b_a).data
    return -(prob_new / prob_old * advantage).mean()
def line_search_v2(self, theta):
    """Line search that returns the parameter vector."""
    old_loss = self.surrogate_loss(theta)
    old_loss = Variable(old_loss, requires_grad=True)
    params = torch.cat([param.view(-1) for param in self.pg_model.parameters()])
    old_loss.backward(params)
    old_loss_grad = old_loss.grad

    s = self.conjugate_gradient(old_loss_grad)
    s = torch.from_numpy(s).float()
    beta = torch.sqrt(2 * self.delta / (s * old_loss_grad).sum())

    beta_end = 0
    decay = 100
    alpha = 0.1
    for d in range(decay):
        beta = beta * math.exp(-alpha * d)  # shrink exponentially
        theta_new = theta + beta * s

        # Compute the objective and the KL divergence of the candidate
        new_loss = self.surrogate_loss(theta_new)
        new_model = deepcopy(self.pg_model)
        vector_to_parameters(theta_new, new_model.parameters())
        mean_kl, _, _ = self.get_mean_kl_divergence(new_model)

        if mean_kl <= self.delta and new_loss < old_loss:
            # Objective improved within the KL constraint
            return theta_new

    return theta
def train(params):
    env, policy, iters, animate, ID = params
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.9)
    f = f_wrapper(env, policy, animate)

    it = 0
    try:
        while not es.stop():
            it += 1
            if it > iters:
                break
            if it % 30 == 0:
                sdir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                    "agents/{}_es.p".format(ID))
                vector_to_parameters(torch.from_numpy(es.result.xbest).float(),
                                     policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint, {}".format(sdir))
                print(es.mean.min(), es.mean.max())
            X = es.ask()
            es.tell(X, [f(x) for x in X])
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")

    return es.result.fbest
def optim_value_lbfgs(self, V_target, inds):
    value = self.value
    value.zero_grad()
    loss_fn = self.loss_func_v

    def V_closure():
        predicted = value(self.s[inds],
                          other_data=self.other_data[inds]
                          if self.other_data is not None else None).squeeze()
        loss = loss_fn(predicted, V_target)
        self.value_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        return loss

    old_params = parameters_to_vector(value.parameters())
    for lr in self.lr * .5 ** np.arange(10):
        optimizer = optim.LBFGS(self.value.parameters(), lr=lr)
        optimizer.step(V_closure)
        current_params = parameters_to_vector(value.parameters())
        if any(np.isnan(current_params.data.cpu().numpy())):
            print("LBFGS optimization diverged. Rolling back update...")
            vector_to_parameters(old_params, value.parameters())
        else:
            return
def generate(self):
    if self.candidate_idx == self.pop_size:
        # Report the candidate scores and ask for a new population
        self.es.tell(self.candidates, self.candidate_scores)
        self.candidates = self.es.ask(self.pop_size)
        if self.weight_decay > 0:
            self.candidates = [self.decay(c, self.weight_decay) for c in self.candidates]
        self.candidate_scores = []
        self.candidate_idx = 0
        self.es.disp()

    candidate = self.candidates[self.candidate_idx]
    self.candidate_idx += 1
    vector_to_parameters(torch.from_numpy(candidate).float(), self.convnet.parameters())

    seed_noise = T.randn(1, self.noise_dim)
    with T.no_grad():
        mat = self.convnet(seed_noise)[0].numpy()
    mat = self.normalize_map(mat)

    # Set the map border to white and write the image to disk
    mat[0, :] = 255
    mat[:, 0] = 255
    mat[-1, :] = 255
    mat[:, -1] = 255
    cv2.imwrite(self.filename, mat)
def step(self, H, step_size=1, closure=None):
    # Re-evaluate the model and recompute the loss if a closure is provided
    loss = None
    if closure is not None:
        loss = closure()

    # Collect the parameters and their gradients
    params = [p for p in self.param_groups[0]['params']]
    grads = [p.grad for p in params]

    # Flatten parameters and gradients into single vectors
    param_vector = parameters_to_vector(params)
    grad_vector = parameters_to_vector(grads)

    # Apply rotation / contraction / expansion: solve H x = g
    soln, _ = torch.solve(grad_vector.unsqueeze(1).unsqueeze(0), H.unsqueeze(0))
    scaled_gradient = soln[0].reshape(-1)

    # Add the characteristic scaling
    scaling = torch.dot(scaled_gradient, soln.reshape(-1))
    scaled_gradient *= step_size * torch.sqrt(self.divergence_limit / (scaling + self.epsilon))

    # Check that the scaling is valid before updating the parameters
    if scaling > 0.:
        # Write the scaled gradient back into the .grad tensors
        vector_to_parameters(scaled_gradient, grads)

        # Now perform the SGD-style update
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf
                p.data.add_(-group['lr'], d_p)

    return loss
def init_param(self, init_values: to.Tensor = None, **kwargs):
    # See RNNPolicyBase
    if init_values is None:
        # Initialize the layers using the default initialization
        init_param(self.rnn_layers, **kwargs)
        init_param(self.output_layer, **kwargs)
    else:
        cp.vector_to_parameters(init_values, self.parameters())
def object_loss(self, theta):
    model = copy.deepcopy(self.policy)
    vector_to_parameters(theta, model.parameters())
    imp_fac = self.compute_imp_fac(model=model)
    loss = -(imp_fac * self.A).mean() - self.entropy_weight * self.compute_entropy()
    curkl = self.mean_kl_divergence(model=model)
    return loss, curkl
def get_best(self):
    with torch.no_grad():
        vector_to_parameters(torch.from_numpy(self.es.result.xbest).float(),
                             self.convnet.parameters())
        sol = self.convnet(np.random.randn(self.noise_dim)).squeeze(0).numpy()
    return self.normalize_map(sol)
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        # HESSIAN VEC COMPUTATION
        # Vectorize all parameters
        grad_vec = parameters_to_vector(group['params'])
        # Create a noise vector
        noise = torch.normal(means=torch.zeros_like(grad_vec), std=self.noise_factor)
        # Compute the product
        grad_product = torch.sum(grad_vec * noise)
        grad_grad = torch.autograd.grad(grad_product, group['params'], retain_graph=True)
        # hessian_vec_prod is the (damped) Hessian-vector product
        fisher_vec_prod = torch.cat([g.contiguous().view(-1) for g in grad_grad])
        hessian_vec_prod = fisher_vec_prod + (self.cg_damping * noise)

        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad
            d_p = p.grad.clone().data

            # REST OF SGD STUFF
            if weight_decay != 0:
                d_p.add_(weight_decay, p.data)
            if momentum != 0:
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                    buf.mul_(momentum).add_(d_p)
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(1 - dampening, d_p)
                if nesterov:
                    d_p = d_p.add(momentum, buf)
                else:
                    d_p = buf
            p.data.add_(-group['lr'], d_p)

        flattened = parameters_to_vector(group['params'])
        flattened.data.add_(group['lr'], hessian_vec_prod.data)
        vector_to_parameters(flattened, group['params'])

    return loss
def optimize(self):
    self.total_steps += self.steps_per_train
    if self.total_steps >= self.learning_start:
        experience_sample = ray.get(
            self.experience_replay.sample.remote(self.batch_size))
        state = torch.cat([
            torch.from_numpy(s.state).cuda().unsqueeze(0)
            for s in experience_sample
        ])
        next_state = torch.cat([
            torch.from_numpy(s.next_state).cuda().unsqueeze(0)
            for s in experience_sample
        ])
        terminal = torch.tensor([s.terminal for s in experience_sample]).cuda().unsqueeze(1)
        reward = torch.tensor([s.reward for s in experience_sample]).cuda().unsqueeze(1)
        action = torch.tensor([s.action for s in experience_sample]).cuda()

        # Train the value function
        target = (reward + self.gamma * (1 - terminal) * self.target_value_fn(
            next_state, self.target_policy(next_state))).detach()
        actual = self.online_value_fn(state, action)
        value_fn_loss = self.value_fn_criterion(target, actual)
        value_fn_loss.backward()
        self.value_fn_opt.step()
        self.online_policy.zero_grad()
        self.online_value_fn.zero_grad()

        # Train the policy
        policy_loss = -self.online_value_fn(state, self.online_policy(state)).mean()
        policy_loss.backward()
        self.policy_opt.step()
        self.online_policy.zero_grad()
        self.online_value_fn.zero_grad()

        # Update the target networks with Polyak averaging
        v_policy = parameters_to_vector(self.online_policy.parameters())
        v_policy_targ = parameters_to_vector(self.target_policy.parameters())
        new_v_policy_targ = self.polyak * v_policy_targ + (1 - self.polyak) * v_policy
        vector_to_parameters(new_v_policy_targ, self.target_policy.parameters())

        v_value_fn = parameters_to_vector(self.online_value_fn.parameters())
        v_value_fn_targ = parameters_to_vector(self.target_value_fn.parameters())
        new_v_value_fn_targ = self.polyak * v_value_fn_targ + (1 - self.polyak) * v_value_fn
        vector_to_parameters(new_v_value_fn_targ, self.target_value_fn.parameters())
def _pes_sample_one(G, param):
    """Sample one rollout with the current setting."""
    pol_param, dom_param, init_state = param
    vector_to_parameters(pol_param, G.policy.parameters())
    return rollout(G.env, G.policy,
                   reset_kwargs={'init_state': init_state, 'domain_param': dom_param})
def learn(self, env, max_iter, batch_size):
    for i_iter in xrange(max_iter):
        s = env.reset()
        self._noise_generator.reset()
        done = False
        add_noise = i_iter * 1.0 / max_iter < self.explore_fraction
        e_reward = 0
        while not done:
            noise = torch.FloatTensor(
                self._noise_generator.generate()) if add_noise else None
            a = self.act(s, noise=noise)
            s_, r, done, info = env.step(a)
            self._replay_module.add(tuple((s, a, [r], s_, [int(done)])))
            s = s_
            e_reward += r

            if len(self._replay_module) < self.warmup_size:
                continue

            # Sample a batch of transitions
            b_s, b_a, b_r, b_s_, b_d = self._replay_module.sample(batch_size)
            b_s = numpy.vstack(b_s)
            b_a = numpy.vstack(b_a)
            b_s, b_a, b_r, b_d = map(lambda ryo: Variable(torch.FloatTensor(ryo)),
                                     [b_s, b_a, b_r, b_d])
            b_s_ = Variable(torch.FloatTensor(b_s_), volatile=True)

            # Update the critic
            self._optimizer_critic.zero_grad()
            y = b_r + self.reward_gamma * self._target_critic(
                b_s_, self._target_actor(b_s_)) * (1 - b_d)
            loss = self.loss(self._critic(b_s, b_a), y)
            loss.backward()
            self._optimizer_critic.step()

            # Update the actor (deterministic policy gradient, eq. 6 in [1])
            self._optimizer_actor.zero_grad()
            loss = -self._critic(b_s, self._actor(b_s)).mean()
            loss.backward()
            self._optimizer_actor.step()

            # Update the target networks
            for target, normal in [(self._target_actor, self._actor),
                                   (self._target_critic, self._critic)]:
                target_vec = parameters_to_vector(target.parameters())
                normal_vec = parameters_to_vector(normal.parameters())
                vector_to_parameters(
                    (1 - self.tau) * target_vec + self.tau * normal_vec,
                    target.parameters())

        logger.info('Iter: {}, E_Reward: {}'.format(i_iter, round(e_reward, 2)))
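# A minimal sketch (an assumption, not part of either snippet above) of the
# soft / Polyak target-network update that both DDPG-style examples implement
# with flat parameter vectors: target <- (1 - tau) * target + tau * online.
import torch
from torch.nn.utils import parameters_to_vector, vector_to_parameters

def soft_update(target_net, online_net, tau=0.005):
    with torch.no_grad():
        target_vec = parameters_to_vector(target_net.parameters())
        online_vec = parameters_to_vector(online_net.parameters())
        vector_to_parameters((1.0 - tau) * target_vec + tau * online_vec,
                             target_net.parameters())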
def surrogate_loss(self, theta):
    """Returns the surrogate loss w.r.t. the given parameter vector theta."""
    new_model = copy.deepcopy(self.policy_model)
    vector_to_parameters(theta, new_model.parameters())
    observations_tensor = self.observations
    prob_new = new_model(observations_tensor).gather(1, torch.cat(self.actions)).data
    prob_old = self.policy_model(observations_tensor).gather(
        1, torch.cat(self.actions)).data + 1e-8
    return -torch.mean((prob_new / prob_old) * self.advantage)
def step(self, episodes, max_kl=1e-3, cg_iters=10, cg_damping=1e-2,
         ls_max_steps=10, ls_backtrack_ratio=0.5):
    """Meta-optimization step (i.e. update of the initial parameters), based on
    Trust Region Policy Optimization (TRPO, [4]).
    """
    old_loss, _, old_pis = self.surrogate_loss(episodes)
    print('old_loss: ', old_loss)

    # Although old_loss is on the order of 1e-8, the gradients are not very small
    grads = torch.autograd.grad(old_loss, self.policy.parameters())
    grads = parameters_to_vector(grads)
    print('grads: ', grads)

    # Compute the step direction with conjugate gradient
    hessian_vector_product = self.hessian_vector_product(episodes, damping=cg_damping)
    stepdir = conjugate_gradient(hessian_vector_product, grads, cg_iters=cg_iters)

    # Compute the Lagrange multiplier (shs = s^T H s)
    shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))
    lagrange_multiplier = torch.sqrt(shs / max_kl)

    # The step is computed only once with all likelihood ratios equal to 1,
    # so the difference between pi and old_pi is neglected here.
    step = stepdir / lagrange_multiplier
    print('step: ', step)

    # Save the old parameters
    old_params = parameters_to_vector(self.policy.parameters())

    # Line search: the step direction is fixed, only step_size shrinks
    step_size = 1.0
    for _ in range(ls_max_steps):
        # Assign the candidate parameters to the policy network
        vector_to_parameters(old_params - step_size * step, self.policy.parameters())
        loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
        improve = loss - old_loss
        if (improve.item() < 0.0) and (kl.item() < max_kl):
            break
        step_size *= ls_backtrack_ratio
    else:
        vector_to_parameters(old_params, self.policy.parameters())
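# Hedged sketch (not taken from the source) of how a Hessian-vector product
# closure like self.hessian_vector_product(episodes, damping=...) is commonly
# built: double backprop through the mean KL divergence of the policy on the
# sampled episodes, plus a damping term.
import torch

def make_kl_hvp(mean_kl, params, damping=1e-2):
    params = list(params)
    grads = torch.autograd.grad(mean_kl, params, create_graph=True)
    flat_grad = torch.cat([g.contiguous().view(-1) for g in grads])

    def hvp(vector):
        # d/dtheta (grad_kl . v) = H v, kept differentiable across repeated calls
        grad_dot_v = torch.dot(flat_grad, vector)
        hv = torch.autograd.grad(grad_dot_v, params, retain_graph=True)
        return torch.cat([g.contiguous().view(-1) for g in hv]) + damping * vector

    return hvp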
def _line_search(self, old_loss, loss_grad, step_vector_x, advantage_batch,
                 s_batch, old_policy, a_batch):
    old_actor = copy.deepcopy(self.actor)
    actor_flat_params = parameters_to_vector(self.actor.parameters())

    expected_improve = (loss_grad * step_vector_x).sum(0, keepdim=True)
    expected_improve = expected_improve.cpu().numpy()

    i, line_search_succeed = -1, False
    for i in range(self.backtrack_iters):
        # Update the policy via backtracking line search
        backtrack_ratio = self.backtrack_coeff ** i
        constraint_params = actor_flat_params + backtrack_ratio * step_vector_x
        vector_to_parameters(constraint_params, self.actor.parameters())

        # Recompute the surrogate mean(log pi(a|s) * A) with the modified actor
        meow, logstd, std = self.actor(s_batch)
        new_policy = self._log_density(a_batch, meow, std, logstd)
        constraint_loss = self._surrogate_loss(old_policy=old_policy,
                                               new_policy=new_policy,
                                               advantage_batch=advantage_batch)

        loss_improve = (constraint_loss - old_loss).detach().cpu().numpy()
        weighted_expected_improve = backtrack_ratio * expected_improve

        kl = kl_divergence(new_actor=self.actor, old_actor=old_actor, s_batch=s_batch)
        kl = kl.mean()

        TrainerMetadata().log(kl, 'KL', 'current_kl', compute_maxmin=True)
        TrainerMetadata().log(self.max_kl, 'KL', 'max_kl')
        TrainerMetadata().log(loss_improve / weighted_expected_improve,
                              'real / expected (improve)', 'real_ratio',
                              compute_maxmin=True)
        TrainerMetadata().log(0.5, 'real / expected (improve)', 'threshold ')

        # See https://en.wikipedia.org/wiki/Backtracking_line_search
        # TODO: why 0.5? Shouldn't the ratio need to exceed 1.0 to count as an improvement?
        # Armijo used 1/2 for both c and tau in his 1966 paper.
        if kl < self.max_kl and (loss_improve / weighted_expected_improve) > 0.5:
            line_search_succeed = True
            break

    TrainerMetadata().console_log('KL_iter', i)
    if not line_search_succeed:
        self.actor = copy.deepcopy(old_actor)
        print('policy update does not improve the surrogate')
def f(w):
    reward = 0
    done = False
    obs = env.reset()
    vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())

    while not done:
        with torch.no_grad():
            act = policy.sample_action(obs)
        obs, rew, done, _ = env.step(act)
        reward += rew

    return -reward
def step_test(self, episodes, max_kl=1e-3, cg_iters=10, cg_damping=1e-2,
              ls_max_steps=10, ls_backtrack_ratio=0.5):
    """Meta-optimization step (i.e. update of the initial parameters), based on
    Trust Region Policy Optimization (TRPO, [4]).
    """
    grads = self.compute_ng_gradient_test(episodes)
    old_params = parameters_to_vector(self.policy.parameters())
    update_params = self.adam_step(old_params, grads)
    vector_to_parameters(update_params, self.policy.parameters())