def train_paths(self, paths, parallel=False, linear_search=True):
    start_time = time.time()
    sample_data = self.storage.process_paths(paths)
    agent_infos = sample_data["agent_infos"]
    obs_n = sample_data["observations"]
    action_n = sample_data["actions"]
    advant_n = sample_data["advantages"]
    n_samples = len(obs_n)
    inds = np.random.choice(n_samples,
                            int(math.floor(n_samples * pms.subsample_factor)),
                            replace=False)
    # inds = range(n_samples)  # no subsampling
    obs_n = obs_n[inds]
    action_n = action_n[inds]
    advant_n = advant_n[inds]
    action_dist_means_n = np.array(
        [agent_info["mean"] for agent_info in agent_infos[inds]])
    action_dist_logstds_n = np.array(
        [agent_info["log_std"] for agent_info in agent_infos[inds]])
    feed = {self.net.obs: obs_n,
            self.net.advant: advant_n,
            self.net.old_dist_means_n: action_dist_means_n,
            self.net.old_dist_logstds_n: action_dist_logstds_n,
            self.net.action_n: action_n}
    episoderewards = np.array([path["rewards"].sum() for path in paths])
    thprev = self.gf()  # get theta_old

    def fisher_vector_product(p):
        feed[self.flat_tangent] = p
        return self.session.run(self.fvp, feed) + pms.cg_damping * p

    g = self.session.run(self.pg, feed_dict=feed)
    stepdir = krylov.cg(fisher_vector_product, -g, cg_iters=pms.cg_iters)
    shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))  # 0.5 * s^T A s
    # if shs < 0, np.sqrt returns nan and the update below is invalid
    lm = np.sqrt(shs / pms.max_kl)
    fullstep = stepdir / lm
    neggdotstepdir = -g.dot(stepdir)

    def loss(th):
        self.sff(th)
        return self.session.run(self.losses, feed_dict=feed)

    if parallel:
        theta = linesearch_parallel(loss, thprev, fullstep, neggdotstepdir / lm)
    elif linear_search:
        theta = linesearch(loss, thprev, fullstep, neggdotstepdir / lm)
    else:
        theta = thprev + fullstep
    if math.isnan(theta.mean()):
        print("nan in theta (shs was likely negative); keeping theta_old")
        theta = thprev
    stats = {}
    stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
    stats["Average sum of rewards per episode"] = episoderewards.mean()
    stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
    return stats, theta, thprev
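# linesearch() above is defined elsewhere in the repo. Below is a minimal sketch
# of the standard TRPO backtracking line search, assuming (as the call site
# suggests) that f(theta) returns the losses with the surrogate loss first.
import numpy as np

def linesearch(f, x, fullstep, expected_improve_rate,
               max_backtracks=10, accept_ratio=0.1):
    """Shrink the step by halves until the actual improvement of the surrogate
    loss is a sufficient fraction of the first-order expected improvement."""
    fval = f(x)[0]  # surrogate loss at theta_old
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        newfval = f(xnew)[0]
        actual_improve = fval - newfval
        expected_improve = expected_improve_rate * stepfrac
        if expected_improve > 0 and actual_improve / expected_improve > accept_ratio:
            return xnew
    return x  # no acceptable step found; keep theta_old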
import numpy as np
import krypy
from scipy import sparse
from scipy.sparse.linalg import LinearOperator
# `krylov` is the external Krylov-solver module used by this project;
# its cg() returns a solution object carrying the final iterate in `.xk`.


def sparse_cg(data):
    A, E, M, _, y0 = data

    # normal-equations operator: x -> A^T M^{-1} (A x) + E^T E x (SPD when M is SPD)
    def matvec(x):
        Ax = A.dot(x)
        return A.T.dot(sparse.linalg.spsolve(M, Ax)) + E.T.dot(E.dot(x))

    lop = LinearOperator((E.shape[1], E.shape[1]), dtype=float, matvec=matvec)
    ET_b = E.T.dot(y0)
    try:
        out = krylov.cg(lop, ET_b, tol=1.0e-10, maxiter=10000)
        x = out.xk
    except krypy.utils.ConvergenceError:
        x = np.nan
    return x
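# If the krypy-style solver is unavailable, the same system can be solved with
# SciPy's built-in CG. A minimal sketch: scipy.sparse.linalg.cg returns an
# (x, info) pair instead of an object with .xk, and reports non-convergence
# through info rather than by raising an exception.
import numpy as np
from scipy.sparse.linalg import cg as scipy_cg

def sparse_cg_scipy(lop, ET_b):
    # info == 0: converged; info > 0: tolerance not reached within maxiter
    x, info = scipy_cg(lop, ET_b, maxiter=10000)
    return x if info == 0 else np.nan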
def _solve_sparse_cg(A, M, E, y0):
    def matvec(x):
        Ax = A.dot(x)
        return A.T.dot(sparse.linalg.spsolve(M, Ax)) + E.T.dot(E.dot(x))

    lop = LinearOperator(shape=(E.shape[1], E.shape[1]), dtype=float, matvec=matvec)
    ET_b = E.T.dot(y0)
    x, _ = krylov.cg(lop, ET_b, tol=1.0e-10, maxiter=1000)
    # import matplotlib.pyplot as plt
    # plt.semilogy(out.resnorms)
    # plt.grid()
    # plt.show()
    return x
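# A hypothetical driver for _solve_sparse_cg with made-up data, assuming A maps
# n unknowns to m residuals, M is symmetric positive definite (so the operator
# x -> A^T M^{-1} A x + E^T E x is SPD and CG applies), and y0 has E's row dimension.
import numpy as np
from scipy import sparse
import scipy.sparse.linalg

m, n = 80, 50
A = sparse.random(m, n, density=0.1, format='csc', random_state=0)
M = sparse.eye(m, format='csc')   # SPD mass-like matrix
E = sparse.eye(n, format='csc')   # regularization operator
y0 = np.ones(E.shape[0])

x = _solve_sparse_cg(A, M, E, y0)
# residual of the normal equations A^T M^{-1} A x + E^T E x = E^T y0
res = A.T @ sparse.linalg.spsolve(M, A @ x) + E.T @ (E @ x) - E.T @ y0
print(np.linalg.norm(res))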
def learn(self):
    start_time = time.time()
    numeptotal = 0
    i = 0  # iteration counter
    while True:
        # Generating paths.
        self.get_samples(pms.paths_number)
        paths = self.storage.get_paths()
        # Computing returns and estimating advantage function.
        sample_data = self.storage.process_paths(paths)
        agent_infos = sample_data["agent_infos"]
        obs_n = sample_data["observations"]
        action_n = sample_data["actions"]
        advant_n = sample_data["advantages"]
        n_samples = len(obs_n)
        inds = np.random.choice(n_samples,
                                int(math.floor(n_samples * pms.subsample_factor)),
                                replace=False)
        obs_n = obs_n[inds]
        action_n = action_n[inds]
        advant_n = advant_n[inds]
        action_dist_means_n = np.array(
            [agent_info["mean"] for agent_info in agent_infos[inds]])
        action_dist_logstds_n = np.array(
            [agent_info["log_std"] for agent_info in agent_infos[inds]])
        feed = {
            self.network.obs: obs_n,
            self.network.advant: advant_n,
            self.network.old_dist_means_n: action_dist_means_n,
            self.network.old_dist_logstds_n: action_dist_logstds_n,
            self.network.action_dist_logstds_n: action_dist_logstds_n,
            self.network.action_n: action_n
        }
        episoderewards = np.array([path["rewards"].sum() for path in paths])
        average_episode_std = np.mean(np.exp(action_dist_logstds_n))
        for iter_num_per_train in range(pms.iter_num_per_train):
            if self.train:
                thprev = self.gf()  # get theta_old

                def fisher_vector_product(p):
                    feed[self.flat_tangent] = p
                    return self.session.run(self.fvp, feed) + pms.cg_damping * p

                g = self.session.run(self.pg, feed_dict=feed)
                # solve F * stepdir = g with conjugate gradients
                stepdir = krylov.cg(fisher_vector_product, g, cg_iters=pms.cg_iters)
                shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))  # 0.5 * s^T F s
                fullstep = stepdir * np.sqrt(2.0 * pms.max_kl / shs)
                neggdotstepdir = -g.dot(stepdir)

                def loss(th):
                    self.sff(th)
                    return self.session.run(self.losses, feed_dict=feed)

                surr_prev, kl_prev, ent_prev = loss(thprev)
                mean_advant = np.mean(advant_n)
                theta = linesearch(loss, thprev, fullstep, neggdotstepdir)
                self.sff(theta)
                surrafter, kloldnew, entnew = self.session.run(self.losses,
                                                               feed_dict=feed)
                stats = {}
                numeptotal += len(episoderewards)
                stats["average_episode_std"] = average_episode_std
                stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
                stats["Total number of episodes"] = numeptotal
                stats["Average sum of rewards per episode"] = episoderewards.mean()
                stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
                stats["KL between old and new distribution"] = kloldnew
                stats["Surrogate loss"] = surrafter
                stats["Surrogate loss prev"] = surr_prev
                stats["entropy"] = ent_prev
                stats["mean_advant"] = mean_advant
                log_data = [average_episode_std, len(episoderewards), numeptotal,
                            episoderewards.mean(), kloldnew, surrafter, surr_prev,
                            surrafter - surr_prev, ent_prev, mean_advant]
                self.master.logger.log_row(log_data)
        if self.thread_id == 1:
            self.master.save_model("iter" + str(i))
        print(episoderewards.mean())
        i += 1
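# Both learn() and train_paths() pass a *callable* matrix-vector product to
# krylov.cg rather than a matrix. A minimal sketch of such a routine -- the
# classic conjugate-gradient loop found in most TRPO implementations, assumed
# here to match the interface used above:
import numpy as np

def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b given only the product f_Ax(v) = A v."""
    x = np.zeros_like(b)
    r = b.copy()   # residual b - A x for x = 0
    p = b.copy()   # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = f_Ax(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x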
def optimize(self, sess, samples_data):
    feed_dict = {
        self.local_policy.state: samples_data['observations'],
        self.local_policy.actions: samples_data['actions'],
        self.local_policy.advantage: samples_data['advantages'],
        self.local_policy.old_mean: samples_data['mean'],
        self.local_policy.old_log_std: samples_data['log_std'],
        self.local_policy.w: samples_data['ws']
    }
    if self.verbose >= 1:
        logging.info(str(self.name) + " " + "computing loss before")
        logging.info(str(self.name) + " " + "performing update")
        logging.info(str(self.name) + " " + "computing descent direction")
    # calculate the loss gradient g from the symbolic expression
    [self.flat_gradient, loss_before] = sess.run(
        [self.local_policy.grads_flatten, self.local_policy.surr_loss],
        feed_dict=feed_dict)
    # calculate s = A^{-1} g with conjugate gradients, using only products A*v
    descent_direction = cg(self.local_policy,
                           self.flat_gradient,
                           feed_dict,
                           sess,
                           reg_coef=self.cg_reg_coef,
                           cg_iters=self.cg_iterations,
                           verbose=False)
    # calculate A*s
    A_dot_descent_direction = sess.run(
        self.local_policy.fisher_prod_x_flatten,
        feed_dict=dict(feed_dict,
                       **{self.local_policy.xs_flatten: descent_direction})) \
        + self.cg_reg_coef * descent_direction
    # line-search step size: sqrt(2 * kl / (s^T A s))
    initial_step_size = np.sqrt(
        2.0 * self.kl_step_size *
        (1. / (np.abs(descent_direction.dot(A_dot_descent_direction)) + 1e-8)))
    # initial descent step for the line search
    flat_descent_step = initial_step_size * descent_direction
    if self.verbose >= 1:
        logging.info(str(self.name) + " " + "descent direction computed")
    prev_param = sess.run(self.local_policy.get_params, feed_dict=feed_dict)
    if self.verbose >= 1:
        logging.info(str(self.name) + " " +
                     "current log_std: {0}".format(prev_param[-1]))
    # backtracking line search along flat_descent_step: keep shrinking the step
    # by backtrack_ratio until the loss improves and the KL divergence stays
    # below kl_step_size
    loss_after = 0
    kl_after = 0
    for n_iter, ratio in enumerate(self.backtrack_ratio **
                                   np.arange(self.max_backtracks)):
        cur_step = ratio * flat_descent_step
        start = 0
        cur_param = []
        for param in prev_param:
            size = param.flatten().shape[0]
            cur_param.append(param - cur_step[start:start + size].reshape(param.shape))
            start += size
        sess.run(self.local_policy.assign_op,
                 feed_dict={ph: val for ph, val in
                            zip(self.local_policy.assign_value, cur_param)})
        loss_after, kl_after, log_std = sess.run(
            [self.local_policy.surr_loss,
             self.local_policy.mean_kl,
             self.local_policy.log_std],
            feed_dict=feed_dict)
        if np.isnan(kl_after):
            import ipdb
            ipdb.set_trace()
        if loss_after < loss_before and kl_after <= self.kl_step_size:
            break
    if self.verbose >= 1:
        logging.info(str(self.name) + " " + "backtrack iters: %d" % n_iter)
        logging.info(str(self.name) + " " + "optimization finished")
    return loss_before, loss_after, kl_after, ratio * flat_descent_step, log_std
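# The cg() helper that computes descent_direction above is defined elsewhere;
# its exact signature here is an assumption. A sketch of what it plausibly does:
# build a Fisher-vector product from this class's fisher_prod_x_flatten /
# xs_flatten tensors and hand it to a standard callable-matvec CG loop such as
# the cg sketch above (referenced here as conjugate_gradient to avoid the name clash).
import numpy as np

def cg(policy, b, feed_dict, sess, reg_coef=1e-5, cg_iters=10, verbose=False):
    """Descent direction s = (A + reg_coef*I)^{-1} b via conjugate gradients,
    where A v is the Fisher-vector product evaluated in the TF graph."""
    def f_Ax(v):
        feed = dict(feed_dict, **{policy.xs_flatten: v})
        return sess.run(policy.fisher_prod_x_flatten, feed_dict=feed) + reg_coef * v

    return conjugate_gradient(f_Ax, b, cg_iters=cg_iters)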