def gps_cartpole(res_dir):
    """Run Guided Policy Search on the known-model cartpole task.

    Plots the internal, feedforward-only and full-feedback rollouts, saves
    the cost curve, and returns (x, u, cost, gps): the final rollout's
    states and actions, the per-iteration costs (rescaled back to original
    units), and the solver object.
    """
    experiment = cartpole_known
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT, experiment.MODEL)
    # zero covariance: deterministic time-indexed linear policy
    policy = TimeIndexedLinearGaussianPolicy(
        0 * np.eye(model.dim_u), experiment.N_DURATION,
        model.dim_u, model.dim_x)
    # scale the quadratic weights down so the optimizer works on
    # well-conditioned numbers; costs are scaled back up afterwards
    alpha = 1e3
    Q = experiment.INFERENCE.Q / alpha
    R = experiment.INFERENCE.R / alpha
    xg = model.xag.squeeze()

    def cost(x, u, a, x_lin):
        # quadratic cost on the observation, linearized about x_lin
        _J = cartpole_dobs(x_lin)
        _j = cartpole_obs(x_lin) - _J @ x_lin
        _x = _J @ x + _j
        return (_x - xg).T @ Q @ (_x - xg) + u.T @ R @ u

    gps = GuidedPolicySearch(
        model, cost, experiment.N_DURATION,
        kl_bound=1.0,
        u_lim=5,
        init_ctl_sigma=1.25,
        init_noise=1e-1,
        activation='all')
    cost = gps.run(200,)
    x, u, c = gps.forward_pass(gps.ctl)
    xu = np.vstack((x.mu[:, :-1], u.mu)).T
    env.plot_sim(xu, None, "gps cartpole internal", res_dir)
    # undo the weight scaling so reported costs are in original units
    cost = [c * alpha for c in cost]
    # feedforward-only rollout (no feedback gains)
    policy.K = np.zeros(policy.K.shape)
    policy.k = gps.udist.mu.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "gps cartpole feedforward", None)
    # full rollout with the optimized feedback gains
    policy.K = gps.ctl.K.reshape(policy.K.shape)
    policy.k = gps.ctl.kff.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "gps cartpole", None)
    x = xu[:, :model.dim_x]
    u = xu[:, model.dim_x:]
    gps.plot_trajectory()
    f, a = plt.subplots(2)
    a[0].plot(cost, '.-')
    # FIX: persist the cost figure like the other experiment runners
    # (previously the figure was created but never saved)
    plt.savefig(os.path.join(res_dir, "cost_gps_cartpole.png"),
                bbox_inches='tight', format='png')
    return x, u, cost, gps
def run(experiment, res_dir, weight_path):
    """Model-based RL training loop: alternate data collection, i2c message-
    passing inference, and policy evaluation.

    Args:
        experiment: experiment config module/object (N_DURATION, N_EPISODE,
            INFERENCE.*, POLICY_COVAR, ... are read from it).
        res_dir: directory all plots, metrics and models are saved into.
        weight_path: optional path of pretrained i2c dynamics-model weights
            to load before training; skipped when None.
    """
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT, experiment.MODEL)
    i2c = I2cGraph(model, experiment.N_DURATION, experiment.INFERENCE.Q,
                   experiment.INFERENCE.R, experiment.INFERENCE.ALPHA,
                   experiment.INFERENCE.alpha_update_tol,
                   experiment.INFERENCE.SIG_U, experiment.INFERENCE.msg_iter,
                   experiment.INFERENCE.msg_tol, experiment.INFERENCE.em_tol,
                   experiment.INFERENCE.backwards_contraction, res_dir)
    if weight_path is not None:
        print("Loading i2c model with {}".format(weight_path))
        i2c.sys.model.load(weight_path)
    policy = TimeIndexedLinearGaussianPolicy(experiment.POLICY_COVAR,
                                             experiment.N_DURATION,
                                             i2c.sys.dim_u, i2c.sys.dim_x)
    # collection of data
    traj_data = TrajectoryData(model.x_noise, model.y_noise, experiment.N_AUG)
    # placeholder marginal-state estimate until the first inference pass
    s_est = np.zeros((experiment.N_DURATION, i2c.sys.dim_xt))
    traj_eval = TrajectoryEvaluator(experiment.N_DURATION, i2c.QR, i2c.sg)
    traj_eval_iter = TrajectoryEvaluator(experiment.N_DURATION, i2c.QR, i2c.sg)
    # get stationary data for initial training data
    for _ in range(experiment.N_STARTING):
        x, y, _ = env.run(policy)
        traj_data.add(x, y)
    # MBRL training loop
    for j in range(experiment.N_EPISODE):
        print("Ep. {}".format(j))
        # run policy on env, get data
        with profile("Simulation", PROFILING):
            x, y, _ = env.run(policy)
            # NOTE(review): x_test/y_test are collected but never used below —
            # presumably held-out data for model evaluation; confirm intent
            x_test, y_test, _ = env.run(policy)
        env.plot_sim(x, s_est, "training {}".format(j), res_dir)
        # fit model
        x_train, y_train = traj_data.add(x, y)
        # inference
        bar = Bar('Learning', max=experiment.N_INFERENCE)
        with profile("Inference", PROFILING):
            i2c.reset_metrics()
            for i in range(experiment.N_INFERENCE):
                i2c.learn_msgs()
                # eval policy
                # zero out exploration noise for the evaluation rollout
                policy.K, policy.k, _ = i2c.get_local_linear_policy()
                policy.sigk = np.zeros(policy.sigk.shape)
                x, y, z = env.run(policy)
                z_est = i2c.get_marginal_observed_trajectory()
                traj_eval_iter.eval(z, z_est)
                # periodic diagnostics: dump plots and a model checkpoint
                if i % experiment.N_ITERS_PER_PLOT == 0:
                    # i2c.plot_traj(i, dir_name=res_dir, filename="iter_{}_{}".format(j, i))
                    i2c.plot_observed_traj(dir_name=res_dir,
                                           filename="iter_{}_{}".format(j, i))
                    i2c.plot_system_dynamics(dir_name=res_dir,
                                             filename="iter_{}_{}".format(j, i))
                    i2c.plot_uncertainty(dir_name=res_dir,
                                         filename="iter_{}_{}".format(j, i))
                    i2c.plot_controller(dir_name=res_dir,
                                        filename="{}_{}".format(j, i))
                    i2c.plot_cost(res_dir, j)
                    i2c.plot_alphas(dir_name=res_dir, filename=j)
                    i2c.plot_policy_entropy(dir_name=res_dir, filename=j)
                    policy.K, policy.k, policy.sigk = i2c.get_local_linear_policy()
                    s_est = i2c.get_marginal_trajectory()
                    env.plot_sim(x, s_est, "{} {}".format(j, i), res_dir)
                    i2c.plot_gap(res_dir, j)
                    i2c.plot_em_cost(res_dir, j)
                    traj_eval_iter.plot("over_iterations_{}".format(j), res_dir)
                    i2c.save(res_dir, "{}_{}".format(j, i))
                bar.next()
            bar.finish()
        # end-of-episode diagnostics (`i` is the last inference iteration)
        i2c.plot_traj(i, dir_name=res_dir, filename="iter_{}_{}".format(j, i))
        i2c.plot_cost_all(res_dir, j)
        i2c.plot_uncertainty(dir_name=res_dir, filename=j)
        i2c.plot_observed_traj(dir_name=res_dir,
                               filename="iter_{}_{}".format(j, i))
        i2c.plot_controller(dir_name=res_dir, filename=j)
        # update policy
        policy.K, policy.k, policy.sigk = i2c.get_local_linear_policy()
        z_est = i2c.get_marginal_observed_trajectory()
        x, y, z = env.run(policy)
        s_est = i2c.get_marginal_trajectory()
        env.plot_sim(x, s_est, "evaluation {}".format(j), res_dir)
        traj_eval.eval(z, z_est)
        traj_eval.plot("over_episodes", res_dir)
        traj_eval_iter.plot("over_iterations_{}".format(j), res_dir)
    # final evaluation after all episodes
    i2c.plot_alphas(res_dir, "final")
    i2c.plot_cost(res_dir, "cost_final")
    x_final, _, _ = env.run(policy)
    s_est = i2c.get_marginal_trajectory()
    env.plot_sim(x_final, s_est, "Final", res_dir)
    # compare against pure feedforward
    policy.zero()
    policy.k = i2c.get_marginal_input()
    x_ff, _, _ = env.run(policy)
    env.plot_sim(x_ff, s_est, "Final Feedforward", res_dir)
    # save model and data
    # (`j`/`i` deliberately reuse the final loop indices for the filename)
    i2c.sys.save(res_dir)
    save_trajectories(x_final, i2c, res_dir)
    traj_eval.save("episodic", res_dir)
    traj_eval_iter.save("iter", res_dir)
    i2c.save(res_dir, "{}_{}".format(j, i))
    env.close()
def ilqr_pendulum(res_dir):
    """Run iterative LQR on the known-model pendulum task.

    Plots the internal, feedforward-only and full-feedback rollouts, saves
    the cost curves, and returns (x, u, cost, ilqr): the solver's internal
    reference state/action trajectories, the per-iteration costs (rescaled
    back to original units), and the solver object.
    """
    experiment = pendulum_known
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT, experiment.MODEL)
    # zero covariance: deterministic time-indexed linear policy
    policy = TimeIndexedLinearGaussianPolicy(
        0 * np.eye(model.dim_u), experiment.N_DURATION,
        model.dim_u, model.dim_x)
    # scale the quadratic weights down so the optimizer works on
    # well-conditioned numbers; costs are scaled back up afterwards
    alpha = 1e4
    Q = experiment.INFERENCE.Q / alpha
    R = experiment.INFERENCE.R / alpha
    xg = model.xag.squeeze()

    def cost(x, u, a, x_lin):
        # quadratic cost on the observation, linearized about x_lin
        _J = pendulum_dobs(x_lin)
        _j = pendulum_obs(x_lin) - _J @ x_lin
        _x = _J @ x + _j
        return (_x - xg).T @ Q @ (_x - xg) + u.T @ R @ u

    ilqr = IterativeLqr(
        model, cost, experiment.N_DURATION, 2,
        alphas=np.power(10., np.linspace(0, -10, 50)),
        # mult_lmbda=1.6 is the original matlab heuristic;
        # 1.001 is good but needs 120 iterations to hit 1.7
        mult_lmbda=1.002,  # gets to 1.7 by 80
        tolgrad=1e-14,
        tolfun=1e-14,
        activation='all')
    cost, cost_real = ilqr.run(100)
    x, u, c = ilqr.forward_pass(ilqr.ctl, 0.0)
    print(x.shape, u.shape)
    xu = np.vstack((x[:, :-1], u)).T
    env.plot_sim(xu, None, "ilqr pendulum internal", res_dir)
    # undo the weight scaling so reported costs are in original units
    cost = [c * alpha for c in cost]
    cost_real = [c * alpha for c in cost_real]
    # feedforward-only rollout (no feedback gains)
    policy.K = np.zeros(policy.K.shape)
    policy.k = ilqr.uref.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "ilqr pendulum feedforward", None)
    # full state-feedback rollout: u = K x + (uref - K xref)
    policy.K = ilqr.ctl.K.reshape(policy.K.shape)
    policy.k = np.copy(ilqr.uref.reshape(policy.k.shape))
    Kx = np.einsum("jki,ki->ji", ilqr.ctl.K, ilqr.xref[:, :-1])
    policy.k += -Kx.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "ilqr pendulum", None)
    ilqr.plot_trajectory()
    f, a = plt.subplots(2)
    a[0].plot(cost, '.-')
    a[1].plot(cost_real, '.-')
    plt.savefig(os.path.join(res_dir, "cost_ilqr_pendulum.png"),
                bbox_inches='tight', format='png')
    # FIX: the dead stores x = xu[:, :model.dim_x] / u = xu[:, model.dim_x:]
    # (immediately overwritten below, never read) are removed
    x = ilqr.xref[:, :-1].T
    u = ilqr.uref.T
    return x, u, cost, ilqr
def gps_pendulum(res_dir):
    """Run Guided Policy Search on the known-model pendulum task.

    Plots the internal, feedforward-only and full-feedback rollouts, saves
    the cost curve, and returns (x, u, cost, gps): the solver's internal
    mean state/action trajectories, the per-iteration costs (rescaled back
    to original units), and the solver object.
    """
    experiment = pendulum_known
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT, experiment.MODEL)
    # zero covariance: deterministic time-indexed linear policy
    policy = TimeIndexedLinearGaussianPolicy(
        0 * np.eye(model.dim_u), experiment.N_DURATION,
        model.dim_u, model.dim_x)
    # scale the quadratic weights down so the optimizer works on
    # well-conditioned numbers; costs are scaled back up afterwards
    alpha = 1e4
    Q = experiment.INFERENCE.Q / alpha
    R = experiment.INFERENCE.R / alpha
    xg = model.xag.squeeze()

    def cost(x, u, a, x_lin):
        # quadratic cost on the observation, linearized about x_lin
        _J = pendulum_dobs(x_lin)
        _j = pendulum_obs(x_lin) - _J @ x_lin
        _x = _J @ x + _j
        return (_x - xg).T @ Q @ (_x - xg) + u.T @ R @ u

    # good values 100 iter 0.07
    gps = GuidedPolicySearch(
        model, cost, experiment.N_DURATION,
        kl_bound=0.07,
        u_lim=2,
        init_ctl_sigma=2.0,
        init_noise=1e-2,
        activation='all')
    cost = gps.run(100,)
    x, u, c = gps.forward_pass(gps.ctl)
    xu = np.vstack((x.mu[:, :-1], u.mu)).T
    env.plot_sim(xu, None, "gps pendulum internal", res_dir)
    # undo the weight scaling so reported costs are in original units
    cost = [c * alpha for c in cost]
    # feedforward-only rollout (no feedback gains)
    policy.K = np.zeros(policy.K.shape)
    policy.k = gps.udist.mu.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "gps pendulum feedforward", None)
    # full rollout with the optimized feedback gains
    policy.K = gps.ctl.K.reshape(policy.K.shape)
    policy.k = gps.ctl.kff.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "gps pendulum", None)
    gps.plot_trajectory()
    f, a = plt.subplots(2)
    a[0].plot(cost, '.-')
    plt.savefig(os.path.join(res_dir, "cost_gps_pendulum.png"),
                bbox_inches='tight', format='png')
    # FIX: the dead stores x = xu[:, :model.dim_x] / u = xu[:, model.dim_x:]
    # (immediately overwritten below, never read) are removed
    x = gps.xdist.mu[:, :-1].T
    u = gps.udist.mu.T
    return x, u, cost, gps
def ilqr_double_cartpole(res_dir):
    """Run iterative LQR on the known-model double cartpole task.

    Plots the internal, feedforward-only and full-feedback rollouts, saves
    the cost curves, and returns (x, u, cost, ilqr): the solver's internal
    reference state/action trajectories, the per-iteration costs (rescaled
    back to original units), and the solver object.
    """
    experiment = double_cartpole_known
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT, experiment.MODEL)
    # zero covariance: deterministic time-indexed linear policy
    policy = TimeIndexedLinearGaussianPolicy(
        0 * np.eye(model.dim_u), experiment.N_DURATION,
        model.dim_u, model.dim_x)
    # scale the quadratic weights down so the optimizer works on
    # well-conditioned numbers; costs are scaled back up afterwards
    alpha = 1e3
    Q = experiment.INFERENCE.Q / alpha
    R = experiment.INFERENCE.R / alpha
    xg = model.xag.squeeze()

    def cost(x, u, a, x_lin):
        # quadratic cost on the observation, linearized about x_lin
        _J = double_cartpole_dobs(x_lin)
        _j = double_cartpole_obs(x_lin) - _J @ x_lin
        _x = _J @ x + _j
        return (_x - xg).T @ Q @ (_x - xg) + u.T @ R @ u

    ilqr = IterativeLqr(
        model, cost, experiment.N_DURATION, 1e9,
        mult_lmbda=1.001,
        init_noise=1e-2,
        alphas=np.power(10., np.linspace(0, -8, 50)),
        tolgrad=1e-7,
        tolfun=1e-7,
        activation='all')
    cost, cost_real = ilqr.run(200)
    x, u, c = ilqr.forward_pass(ilqr.ctl, 0.0)
    xu = np.vstack((x[:, :-1], u)).T
    env.plot_sim(xu, None, "ilqr double cartpole internal", res_dir)
    # undo the weight scaling so reported costs are in original units
    cost = [c * alpha for c in cost]
    cost_real = [c * alpha for c in cost_real]
    # feedforward-only rollout (no feedback gains)
    policy.K = np.zeros(policy.K.shape)
    policy.k = ilqr.uref.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "ilqr double cartpole feedforward", res_dir)
    # full state-feedback rollout: u = K x + (uref - K xref)
    policy.K = ilqr.ctl.K.reshape(policy.K.shape)
    policy.k = np.copy(ilqr.uref.reshape(policy.k.shape))
    Kx = np.einsum("jki,ki->ji", ilqr.ctl.K, ilqr.xref[:, :-1])
    policy.k += -Kx.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "ilqr double cartpole", res_dir)
    ilqr.plot_trajectory()
    f, a = plt.subplots(2)
    a[0].plot(cost, '.-')
    a[1].plot(cost_real, '.-')
    plt.savefig(os.path.join(res_dir, "cost_ilqr_double_cartpole.png"),
                bbox_inches='tight', format='png')
    # FIX: the dead stores x = xu[:, :model.dim_x] / u = xu[:, model.dim_x:]
    # (immediately overwritten below, never read) are removed
    x = ilqr.xref[:, :-1].T
    u = ilqr.uref.T
    return x, u, cost, ilqr
def main():
    """Compare finite-horizon LQR against i2c on a linear system.

    Runs both controllers on the same environment, then overlays the LQR
    value-function parameters (P, p) with the corresponding i2c backward
    messages to show they coincide. Results are saved under "_linear".
    NOTE(review): reads a module-level `experiment` config and helpers
    (`plot_controller`, `matplotlib2tikz`) defined elsewhere in this file.
    """
    res_dir = "_linear"
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT, experiment.MODEL)
    experiment.N_INFERENCE = 1  # 100
    # shift the goal away from the origin and adjust the affine dynamics term
    model.xg = 10 * np.ones((env.dim_x, 1))
    model.xag = model.xg
    model.a = model.xg - model.A.dot(model.xg)
    env.a = model.a.squeeze()  # aaaaah
    policy_lqr = TimeIndexedLinearGaussianPolicy(experiment.POLICY_COVAR,
                                                 experiment.N_DURATION,
                                                 model.dim_u, model.dim_x)
    policy_i2c = TimeIndexedLinearGaussianPolicy(experiment.POLICY_COVAR,
                                                 experiment.N_DURATION,
                                                 model.dim_u, model.dim_x)
    ug = np.zeros((env.dim_u,))
    x_lqr, u_lqr, K_lqr, k_lqr, cost_lqr, P, p = finite_horizon_lqr(
        experiment.N_DURATION, model.A, model.a.squeeze(), model.B,
        experiment.INFERENCE.Q, experiment.INFERENCE.R, model.x0,
        model.xg.squeeze(), ug, model.dim_x, model.dim_u)
    policy_lqr.K, policy_lqr.k = K_lqr, k_lqr
    x, y, z = env.run(policy_lqr)
    env.plot_sim(x, None, "LQR actual")
    i2c = I2cGraph(
        model, experiment.N_DURATION, experiment.INFERENCE.Q,
        experiment.INFERENCE.R,
        1e-5,  # big alpha so control cost dominates
        experiment.INFERENCE.alpha_update_tol,
        experiment.INFERENCE.SIG_U,
        experiment.INFERENCE.msg_iter,
        experiment.INFERENCE.msg_tol,
        experiment.INFERENCE.em_tol,
        experiment.INFERENCE.backwards_contraction,
        None)
    traj_eval = TrajectoryEvaluator(experiment.N_DURATION, i2c.QR, i2c.sg)
    # getting policy runs messages
    policy_i2c.K, policy_i2c.k, _ = i2c.get_local_linear_policy()
    i2c.plot_traj("State-action Trajectories", lqr_compare=True,
                  dir_name=res_dir)
    plot_controller(i2c, dir_name=res_dir)
    s_est = i2c.get_marginal_trajectory()
    x, y, z = env.run(policy_i2c)
    z_est = i2c.get_marginal_observed_trajectory()
    traj_eval.eval(z, z_est)
    # backward-message parameters, rescaled by alpha to match LQR units
    lam_x3 = np.asarray([c.lambda_x3_b for c in i2c.cells]).squeeze() * i2c.alpha
    nu_x3 = np.asarray([c.nu_x3_b for c in i2c.cells]).squeeze() * i2c.alpha
    f, a = plt.subplots(2, 1)
    _P = P.reshape((-1, env.dim_x * 2))
    _lam_x3 = lam_x3.reshape((-1, env.dim_x * 2))
    # FIX: label strings made raw — "\m", "\L", "\o" are invalid escape
    # sequences in non-raw strings (SyntaxWarning, future error); the
    # rendered label text is unchanged
    for i in range(env.dim_x * 2):
        a[0].plot(_P[:, i], 'k+-',
                  label=r"$\mathbold{{P}}$" if i == 0 else "_")
        a[0].plot(_lam_x3[:, i], 'rx',
                  label=r"$\mathbold{{\Lambda}}_{{\overleftarrow{{x}}}}$"
                  if i == 0 else "_")
    a[0].legend()
    a[0].set_title("Value Function parameters")
    a[0].set_ylabel("Quadratic Weights")
    a[1].set_ylabel("Linear Weights")
    a[1].set_xlabel("Timesteps")
    for i in range(env.dim_x):
        a[1].plot(p[:, i], 'k+-',
                  label=r"$\mathbold{{p}}$" if i == 0 else "_")
        a[1].plot(-nu_x3[:, i], 'rx',
                  label=r"$-\mathbold{{\nu}}_{{\overleftarrow{{x}}}}$"
                  if i == 0 else "_")
    a[1].legend()
    if res_dir is not None:
        plt.savefig(os.path.join(res_dir, "value.png"),
                    bbox_inches='tight', format='png')
        matplotlib2tikz.save(os.path.join(res_dir, "value.tex"))
        plt.close(f)