def cpd_newton(size, rank):
    dim = 3
    for datatype in BACKEND_TYPES:
        T.set_backend(datatype)

        A_list, input_tensor, loss, residual = cpd_graph(dim, size, rank)
        A, B, C = A_list
        v_A = ad.Variable(name="v_A", shape=[size, rank])
        v_B = ad.Variable(name="v_B", shape=[size, rank])
        v_C = ad.Variable(name="v_C", shape=[size, rank])

        grads = ad.gradients(loss, [A, B, C])
        Hvps = ad.hvp(output_node=loss,
                      node_list=[A, B, C],
                      vector_list=[v_A, v_B, v_C])

        executor_grads = ad.Executor([loss] + grads)
        executor_Hvps = ad.Executor(Hvps)

        A_list, input_tensor_val = init_rand_cp(dim, size, rank)
        A_val, B_val, C_val = A_list
        for i in range(100):

            def hess_fn(v):
                return executor_Hvps.run(
                    feed_dict={
                        A: A_val,
                        B: B_val,
                        C: C_val,
                        input_tensor: input_tensor_val,
                        v_A: v[0],
                        v_B: v[1],
                        v_C: v[2]
                    })

            loss_val, grad_A_val, grad_B_val, grad_C_val = executor_grads.run(
                feed_dict={
                    A: A_val,
                    B: B_val,
                    C: C_val,
                    input_tensor: input_tensor_val
                })

            delta = conjugate_gradient(
                hess_fn=hess_fn,
                grads=[grad_A_val, grad_B_val, grad_C_val],
                error_tol=1e-9,
                max_iters=250)

            A_val -= delta[0]
            B_val -= delta[1]
            C_val -= delta[2]
            print(f'At iteration {i} the loss is: {loss_val}')
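# The CPD Newton routine above (and the test_HinverseG test further below) calls a
# `conjugate_gradient` helper that solves H x = g, where H is available only through
# Hessian-vector products and x, g are *lists* of factor-matrix blocks. The actual
# helper lives in the surrounding repository; the following is only a minimal NumPy
# sketch of a solver with that signature, with the argument names (hess_fn, grads,
# error_tol, max_iters) mirrored from the calls above.
import numpy as np


def conjugate_gradient(hess_fn, grads, error_tol, max_iters=300):
    """Solve hess_fn(x) = grads for x with the conjugate gradient method."""
    x = [np.zeros_like(g) for g in grads]   # start from x0 = 0
    r = [np.copy(g) for g in grads]         # residual r0 = grads - H @ x0 = grads
    p = [np.copy(g) for g in grads]         # initial search direction
    rs_old = sum(np.sum(ri * ri) for ri in r)

    for _ in range(max_iters):
        hp = hess_fn(p)                     # one Hessian-vector product per iteration
        alpha = rs_old / sum(np.sum(pi * hpi) for pi, hpi in zip(p, hp))
        x = [xi + alpha * pi for xi, pi in zip(x, p)]
        r = [ri - alpha * hpi for ri, hpi in zip(r, hp)]
        rs_new = sum(np.sum(ri * ri) for ri in r)
        if np.sqrt(rs_new) < error_tol:
            break
        p = [ri + (rs_new / rs_old) * pi for ri, pi in zip(r, p)]
        rs_old = rs_new
    return x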
def update_agent(rollouts: List[Rollout]) -> None:
    states = torch.cat([r.states for r in rollouts], dim=0)
    actions = torch.cat([r.actions for r in rollouts], dim=0).flatten()

    advantages = [estimate_advantages(critic, states, next_states[-1], rewards)
                  for states, _, rewards, next_states in rollouts]
    advantages = normalize(torch.cat(advantages, dim=0).flatten())

    update_critic(advantages)

    distribution = actor(states)
    distribution = torch.distributions.utils.clamp_probs(distribution)
    probabilities = distribution[range(distribution.shape[0]), actions]

    # Now we have all the data we need for the algorithm.
    # We will calculate the gradient w.r.t. the new probabilities (surrogate function),
    # so the second set of probabilities should be treated as a constant.
    L = surrogate_loss(probabilities, probabilities.detach(), advantages)
    KL = kl_div(distribution, distribution)

    parameters = list(actor.parameters())

    g = flat_grad(L, actor.parameters(), retain_graph=True)
    # Create graph, because we will call backward() on it (for HVP)
    d_kl = flat_grad(KL, parameters, create_graph=True)

    def HVP(v):
        return flat_grad(d_kl @ v, parameters, retain_graph=True)

    search_dir = conjugate_gradient(HVP, g)
    max_length = torch.sqrt(2 * delta / (search_dir @ HVP(search_dir)))
    max_step = max_length * search_dir

    def criterion(step):
        apply_update(step)

        with torch.no_grad():
            distribution_new = actor(states)
            distribution_new = torch.distributions.utils.clamp_probs(distribution_new)
            probabilities_new = distribution_new[range(distribution_new.shape[0]), actions]

            L_new = surrogate_loss(probabilities_new, probabilities, advantages)
            KL_new = kl_div(distribution, distribution_new)

        L_improvement = L_new - L

        if L_improvement > 0 and KL_new <= delta:
            return True

        apply_update(-step)
        return False

    line_search(max_step, criterion, max_iterations=10)
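# `update_agent` above assumes a `flat_grad` helper that differentiates a scalar with
# respect to the policy parameters and concatenates the per-parameter gradients into a
# single flat vector, so the conjugate gradient step can treat the policy as one long
# vector. The original helper is not shown in this excerpt; the version below is a
# minimal sketch of that behaviour using torch.autograd.grad, with the keyword names
# assumed from the calls above.
import torch


def flat_grad(output, parameters, retain_graph=False, create_graph=False):
    if create_graph:
        # building a graph of the gradient implies keeping the original graph
        retain_graph = True
    grads = torch.autograd.grad(output, parameters,
                                retain_graph=retain_graph,
                                create_graph=create_graph)
    return torch.cat([g.reshape(-1) for g in grads])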
def apply_step(self, *args):
    loss_g, loss_h = args[:2]
    for x in self.params:
        g = jacobian(loss_g, x)
        h = hessian(loss_h, x)
        with torch.no_grad():
            g = g.reshape((-1, 1))
            h = h.reshape((g.shape[0], g.shape[0]))
            dx = conjugate_gradient(h, g, n_iterations=self.n_cg,
                                    tol=self.tol).reshape(x.shape)
            x.add_(dx, alpha=-self.lr)
def test_HinverseG(backendopt):
    for datatype in backendopt:
        T.set_backend(datatype)

        N = 10
        T.seed(1224)
        A = T.random([N, N])
        A = T.transpose(A) @ A
        A = A + T.identity(N)
        b = T.random([N])

        def hess_fn(x):
            return [T.einsum("ab,b->a", A, x[0])]

        error_tol = 1e-9
        x, = conjugate_gradient(hess_fn, [b], error_tol)

        assert (T.norm(T.abs(T.einsum("ab,b->a", A, x) - b)) <= 1e-4)
def train(self):
    start_time = time.time()
    self.episodes = self.env.generate_episodes(config.NUM_EPISODES, self)

    # Computing returns and estimating advantage function.
    for episode in self.episodes:
        episode["baseline"] = self.value_func.predict(episode)
        episode["returns"] = utils.discount(episode["rewards"], config.GAMMA)
        episode["advantage"] = episode["returns"] - episode["baseline"]

    # Updating policy.
    actions_dist_n = np.concatenate(
        [episode["actions_dist"] for episode in self.episodes])
    states_n = np.concatenate(
        [episode["states"] for episode in self.episodes])
    actions_n = np.concatenate(
        [episode["actions"] for episode in self.episodes])
    baseline_n = np.concatenate(
        [episode["baseline"] for episode in self.episodes])
    returns_n = np.concatenate(
        [episode["returns"] for episode in self.episodes])

    # Standardize the advantage function to have mean=0 and std=1.
    advantage_n = np.concatenate(
        [episode["advantage"] for episode in self.episodes])
    advantage_n -= advantage_n.mean()
    advantage_n /= (advantage_n.std() + 1e-8)

    # Computing baseline function for next iter.
    print(states_n.shape, actions_n.shape, advantage_n.shape,
          actions_dist_n.shape)
    feed = {
        self.policy.state: states_n,
        self.action: actions_n,
        self.advantage: advantage_n,
        self.policy.pi_theta_old: actions_dist_n
    }

    episoderewards = np.array(
        [episode["rewards"].sum() for episode in self.episodes])

    # print("\n********** Iteration %i ************" % i)
    self.value_func.fit(self.episodes)
    self.theta_old = self.current_theta()

    def fisher_vector_product(p):
        feed[self.flat_tangent] = p
        return self.session.run(self.fisher_vect_prod,
                                feed) + config.CG_DAMP * p

    self.g = self.session.run(self.surr_loss_grad, feed_dict=feed)
    self.grad_step = utils.conjugate_gradient(fisher_vector_product, -self.g)
    self.sAs = .5 * self.grad_step.dot(
        fisher_vector_product(self.grad_step))
    self.beta_inv = np.sqrt(self.sAs / config.MAX_KL)
    self.full_grad_step = self.grad_step / self.beta_inv
    self.negdot_grad_step = -self.g.dot(self.grad_step)

    def loss(th):
        self.set_theta(th)
        return self.session.run(self.surr_loss, feed_dict=feed)

    self.theta = utils.line_search(loss, self.theta_old, self.full_grad_step,
                                   self.negdot_grad_step / self.beta_inv)
    self.set_theta(self.theta)

    surr_loss_new = -self.session.run(self.surr_loss, feed_dict=feed)
    KL_old_new = self.session.run(self.KL, feed_dict=feed)
    entropy = self.session.run(self.entropy, feed_dict=feed)
    old_new_norm = np.sum((self.theta - self.theta_old)**2)

    if np.abs(KL_old_new) > 2.0 * config.MAX_KL:
        print("Keeping old theta")
        self.set_theta(self.theta_old)

    stats = {}
    stats["L2 of old - new"] = old_new_norm
    stats["Total number of episodes"] = len(self.episodes)
    stats["Average sum of rewards per episode"] = episoderewards.mean()
    stats["Entropy"] = entropy
    exp = utils.explained_variance(np.array(baseline_n), np.array(returns_n))
    stats["Baseline explained"] = exp
    stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
    stats["KL between old and new distribution"] = KL_old_new
    stats["Surrogate loss"] = surr_loss_new
    self.stats.append(stats)
    utils.write_dict(stats)

    save_path = self.saver.save(self.session, "./checkpoints/model.ckpt")
    print('Saved checkpoint to %s' % save_path)

    for k, v in stats.items():
        print(k + ": " + " " * (40 - len(k)) + str(v))
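# `utils.line_search(loss, theta_old, full_grad_step, expected_improve_rate)` used
# above is not defined in this excerpt. TRPO implementations commonly use a
# backtracking line search that halves the step until the actual decrease in the
# surrogate loss is a reasonable fraction of the first-order expected decrease; the
# sketch below shows that common pattern and is an assumption about what the utility
# does, not its actual source.
import numpy as np


def line_search(f, x, fullstep, expected_improve_rate,
                max_backtracks=10, accept_ratio=0.1):
    fval = f(x)
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        actual_improve = fval - f(xnew)                  # decrease achieved by this step
        expected_improve = expected_improve_rate * stepfrac
        if expected_improve > 0 and actual_improve / expected_improve > accept_ratio:
            return xnew
    return x  # no acceptable step found; keep the old parameters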
def test(learner, args, train_envs, test_envs, log_dir):
    learner_test = network(args.num_layers, args.num_hidden, args.num_bandits)
    batch_sampler = sampler(args.batch_size, args.num_bandits)
    max_kl = args.max_kl
    cg_iters = args.cg_iters
    cg_damping = args.cg_damping
    ls_max_steps = args.ls_max_steps
    ls_backtrack_ratio = args.ls_backtrack_ratio

    train_rew = []
    for i in range(args.num_updates):
        # print(i)
        adapt_params = []
        inner_losses = []
        adapt_episodes = []
        rew_rem = []
        for j in range(args.num_tasks_train):
            e = batch_sampler.sample(train_envs[j], learner)
            inner_loss = learner.cal_loss(e.s, e.a, e.r)
            params = learner.update_params(inner_loss, args.inner_lr,
                                           args.first_order)
            a_e = batch_sampler.sample(train_envs[j], learner, params)

            adapt_params.append(params)
            adapt_episodes.append(a_e)
            inner_losses.append(inner_loss)

            mean_rew = torch.mean(a_e.r).data.numpy()
            rew_rem.append(mean_rew)

        print(np.mean(rew_rem))
        train_rew.append(np.mean(rew_rem))

        old_loss, _, old_pis = learner.surrogate_loss(adapt_episodes,
                                                      inner_losses)
        grads = torch.autograd.grad(old_loss, learner.parameters(),
                                    retain_graph=True)
        grads = parameters_to_vector(grads)

        # Compute the step direction with Conjugate Gradient
        hessian_vector_product = learner.hessian_vector_product(
            adapt_episodes, inner_losses, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product, grads,
                                     cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))
        lagrange_multiplier = torch.sqrt(shs / max_kl)

        step = stepdir / lagrange_multiplier

        # Save the old parameters
        old_params = parameters_to_vector(learner.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step,
                                 learner.parameters())
            loss, kl, _ = learner.surrogate_loss(adapt_episodes, inner_losses,
                                                 old_pis=old_pis)
            improve = loss - old_loss
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                break
            step_size *= ls_backtrack_ratio
        else:
            vector_to_parameters(old_params, learner.parameters())

        if (i + 1) % 10 == 0:
            test_input = torch.FloatTensor([[1]])
            test_output = learner.forward(test_input).data.numpy()[0]
            plt.figure()
            plt.bar(np.arange(len(test_output)), test_output)
            plt.savefig(log_dir + 'figures/before%i.png' % i)
            plt.close()
            for j in range(args.num_tasks_train):
                test_output = learner.forward(test_input,
                                              adapt_params[j]).data.numpy()[0]
                plt.figure()
                plt.bar(np.arange(len(test_output)), test_output)
                plt.savefig(log_dir + 'figures/after%i_%i.png' % (j, i))
                plt.close()

    np.save(log_dir + 'train_rew' + str(args.inner_lr) + '.npy', train_rew)
    plt.figure()
    plt.plot(train_rew)
    plt.show()
    plt.figure()
    plt.plot(train_rew)
    plt.savefig(log_dir + 'train_rew.png')
    return
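# Both this MAML-TRPO update and `update_agent` further above call a flat-vector
# `conjugate_gradient(matvec, b, ...)` that solves F x = b given only Fisher/Hessian-
# vector products. A common PyTorch formulation is sketched below; the parameter names
# and defaults are assumptions rather than the original utility.
import torch


def conjugate_gradient(Avp_fn, b, cg_iters=10, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone().detach()      # residual for x0 = 0
    p = r.clone()               # initial search direction
    rdotr = torch.dot(r, r)
    for _ in range(cg_iters):
        Avp = Avp_fn(p).detach()            # matrix-vector product F @ p
        alpha = rdotr / torch.dot(p, Avp)
        x += alpha * p
        r -= alpha * Avp
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x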
def learn(self, paths):
    # is it possible to replace A(s,a) with Q(s,a)?
    for path in paths:
        path["baseline"] = self.vf.predict(path)
        path["returns"] = utils.discount(path["rewards"], self.args.gamma)
        path["advantage"] = path["returns"] - path["baseline"]
        # path["advantage"] = path["returns"]

    # puts all the experiences in a matrix: total_timesteps x options
    action_dist_mu = np.concatenate(
        [path["action_dists_mu"] for path in paths])
    action_dist_logstd = np.concatenate(
        [path["action_dists_logstd"] for path in paths])
    obs_n = np.concatenate([path["obs"] for path in paths])
    action_n = np.concatenate([path["actions"] for path in paths])

    # standardize to mean 0, stddev 1
    advant_n = np.concatenate([path["advantage"] for path in paths])
    advant_n -= advant_n.mean()
    advant_n /= (advant_n.std() + 1e-8)

    # train value function / baseline on rollout paths
    self.vf.fit(paths)

    feed_dict = {
        self.obs: obs_n,
        self.action: action_n,
        self.advantage: advant_n,
        self.oldaction_dist_mu: action_dist_mu,
        self.oldaction_dist_logstd: action_dist_logstd
    }

    # parameters
    thprev = self.gf()

    # computes fisher vector product: F * [self.pg]
    def fisher_vector_product(p):
        feed_dict[self.flat_tangent] = p
        return self.session.run(self.fvp, feed_dict) + p * self.args.cg_damping

    g = self.session.run(self.pg, feed_dict)

    # solve Ax = g, where A is the Fisher information matrix and g is the gradient of the parameters
    # stepdir = A_inverse * g = x
    stepdir = utils.conjugate_gradient(fisher_vector_product, -g)

    # let stepdir = change in theta / direction that theta changes in
    # KL divergence approximated by 0.5 * stepdir_transpose * [Fisher Information Matrix] * stepdir
    # where the [Fisher Information Matrix] acts like a metric
    # ([Fisher Information Matrix] * stepdir) is computed using the function,
    # and then stepdir * [above] is computed manually.
    shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))

    lm = np.sqrt(shs / self.args.max_kl)
    # if self.args.max_kl > 0.001:
    #     self.args.max_kl *= self.args.kl_anneal

    fullstep = stepdir / lm
    negative_g_dot_steppdir = -g.dot(stepdir)

    def loss(th):
        self.sff(th)
        # surrogate loss: policy gradient loss
        return self.session.run(self.losses[0], feed_dict)

    # finds best parameter by starting with a big step and working backwards
    theta = utils.linesearch(loss, thprev, fullstep,
                             negative_g_dot_steppdir / lm)
    # i guess we just take a full step no matter what
    theta = thprev + fullstep
    self.sff(theta)

    surrogate_after, kl_after, entropy_after = self.session.run(
        self.losses, feed_dict)

    episoderewards = np.array([path["rewards"].sum() for path in paths])

    stats = {}
    stats["Average sum of rewards per episode"] = episoderewards.mean()
    stats["Entropy"] = entropy_after
    stats["max KL"] = self.args.max_kl
    stats["Timesteps"] = sum([len(path["rewards"]) for path in paths])
    # stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
    stats["KL between old and new distribution"] = kl_after
    stats["Surrogate loss"] = surrogate_after

    # print(("\n********** Iteration {} ************".format(i)))
    for k, v in stats.items():
        print(k + ": " + " " * (40 - len(k)) + str(v))
    return stats["Average sum of rewards per episode"]