Example #1
def gps_cartpole(res_dir):
    experiment = cartpole_known
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT,
                           experiment.MODEL)
    policy = TimeIndexedLinearGaussianPolicy(
        0 * np.eye(model.dim_u),
        experiment.N_DURATION, model.dim_u, model.dim_x)
    alpha = 1e3
    Q = experiment.INFERENCE.Q / alpha
    R = experiment.INFERENCE.R / alpha
    xg = model.xag.squeeze()

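    # The stage cost below is quadratic in the observed features g(x).
    # (Assumption: cartpole_obs / cartpole_dobs return g and its Jacobian
    # J = dg/dx.) g is linearized about x_lin with a first-order Taylor
    # expansion, so _x = J x + (g(x_lin) - J x_lin) approximates g(x).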
    def cost(x, u, a, x_lin):
        _J = cartpole_dobs(x_lin)
        _j = cartpole_obs(x_lin) - _J @ x_lin
        _x = _J @ x + _j
        return (_x - xg).T @ Q @ (_x - xg) + u.T @ R @ u

    gps = GuidedPolicySearch(
        model, cost, experiment.N_DURATION,
        kl_bound=1.0,
        u_lim=5,
        init_ctl_sigma=1.25,
        init_noise=1e-1,
        activation='all')
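
    # (Assumption: kl_bound is a trust-region limit on the KL divergence
    # between successive trajectory distributions, u_lim clips the controls,
    # and init_ctl_sigma / init_noise set the initial exploration noise.)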

    cost = gps.run(200)

    x, u, c = gps.forward_pass(gps.ctl)
    xu = np.vstack((x.mu[:,:-1], u.mu)).T
    env.plot_sim(xu, None, "gps cartpole internal", res_dir)

    cost = [c * alpha for c in cost]

    policy.K = np.zeros(policy.K.shape)
    policy.k = gps.udist.mu.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "gps cartpole feedforward", None)

    policy.K = gps.ctl.K.reshape(policy.K.shape)
    policy.k = gps.ctl.kff.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "gps cartpole", None)

    x = xu[:, :model.dim_x]
    u = xu[:, model.dim_x:]

    gps.plot_trajectory()

    f, a = plt.subplots(2)
    a[0].plot(cost, '.-')
    plt.savefig(os.path.join(res_dir, "cost_gps_cartpole.png"),
                bbox_inches='tight', format='png')

    return x, u, cost, gps
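
# Minimal driver sketch (hypothetical): "_results" is a placeholder for an
# existing output directory, since the plots above are written into res_dir.
#
#     x, u, cost, gps = gps_cartpole("_results")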
Example #2
def run(experiment, res_dir, weight_path):

    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT, experiment.MODEL)
    i2c = I2cGraph(model, experiment.N_DURATION, experiment.INFERENCE.Q,
                   experiment.INFERENCE.R, experiment.INFERENCE.ALPHA,
                   experiment.INFERENCE.alpha_update_tol,
                   experiment.INFERENCE.SIG_U, experiment.INFERENCE.msg_iter,
                   experiment.INFERENCE.msg_tol, experiment.INFERENCE.em_tol,
                   experiment.INFERENCE.backwards_contraction, res_dir)

    if weight_path is not None:
        print("Loading i2c model with {}".format(weight_path))
        i2c.sys.model.load(weight_path)

    policy = TimeIndexedLinearGaussianPolicy(experiment.POLICY_COVAR,
                                             experiment.N_DURATION,
                                             i2c.sys.dim_u, i2c.sys.dim_x)
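    # (Assumption: TimeIndexedLinearGaussianPolicy implements a time-varying
    # linear-Gaussian controller u_t ~ N(K_t x_t + k_t, sigk_t); its K, k and
    # sigk are overwritten with the i2c posterior policy during learning.)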

    # collection of data
    traj_data = TrajectoryData(model.x_noise, model.y_noise, experiment.N_AUG)
    s_est = np.zeros((experiment.N_DURATION, i2c.sys.dim_xt))
    traj_eval = TrajectoryEvaluator(experiment.N_DURATION, i2c.QR, i2c.sg)
    traj_eval_iter = TrajectoryEvaluator(experiment.N_DURATION, i2c.QR, i2c.sg)

    # get stationary data for initial training data
    for _ in range(experiment.N_STARTING):
        x, y, _ = env.run(policy)
        traj_data.add(x, y)

    # MBRL training loop
    for j in range(experiment.N_EPISODE):
        print("Ep. {}".format(j))

        # run policy on env, get data
        with profile("Simulation", PROFILING):
            x, y, _ = env.run(policy)
            x_test, y_test, _ = env.run(policy)
            env.plot_sim(x, s_est, "training {}".format(j), res_dir)

        # fit model
        x_train, y_train = traj_data.add(x, y)

        # inference
        bar = Bar('Learning', max=experiment.N_INFERENCE)
        with profile("Inference", PROFILING):
            i2c.reset_metrics()
            for i in range(experiment.N_INFERENCE):
                i2c.learn_msgs()
                # eval policy
                policy.K, policy.k, _ = i2c.get_local_linear_policy()
                policy.sigk = np.zeros(policy.sigk.shape)
                x, y, z = env.run(policy)
                z_est = i2c.get_marginal_observed_trajectory()
                traj_eval_iter.eval(z, z_est)

                if i % experiment.N_ITERS_PER_PLOT == 0:
                    i2c.plot_traj(i,
                                  dir_name=res_dir,
                                  filename="iter_{}_{}".format(j, i))
                    i2c.plot_observed_traj(dir_name=res_dir,
                                           filename="iter_{}_{}".format(j, i))
                    i2c.plot_system_dynamics(dir_name=res_dir,
                                             filename="iter_{}_{}".format(
                                                 j, i))
                    i2c.plot_uncertainty(dir_name=res_dir,
                                         filename="iter_{}_{}".format(j, i))
                    i2c.plot_controller(dir_name=res_dir,
                                        filename="{}_{}".format(j, i))
                    i2c.plot_cost(res_dir, j)
                    i2c.plot_alphas(dir_name=res_dir, filename=j)
                    i2c.plot_policy_entropy(dir_name=res_dir, filename=j)
                    policy.K, policy.k, policy.sigk = \
                        i2c.get_local_linear_policy()
                    s_est = i2c.get_marginal_trajectory()
                    env.plot_sim(x, s_est, "{} {}".format(j, i), res_dir)
                    i2c.plot_gap(res_dir, j)
                    i2c.plot_em_cost(res_dir, j)
                    traj_eval_iter.plot("over_iterations_{}".format(j),
                                        res_dir)
                    i2c.save(res_dir, "{}_{}".format(j, i))
                bar.next()
            bar.finish()

        i2c.plot_traj(i, dir_name=res_dir, filename="iter_{}_{}".format(j, i))
        i2c.plot_cost_all(res_dir, j)
        i2c.plot_uncertainty(dir_name=res_dir, filename=j)
        i2c.plot_observed_traj(dir_name=res_dir,
                               filename="iter_{}_{}".format(j, i))
        i2c.plot_controller(dir_name=res_dir, filename=j)

        # update policy
        policy.K, policy.k, policy.sigk = i2c.get_local_linear_policy()
        z_est = i2c.get_marginal_observed_trajectory()
        x, y, z = env.run(policy)
        s_est = i2c.get_marginal_trajectory()
        env.plot_sim(x, s_est, "evaluation {}".format(j), res_dir)
        traj_eval.eval(z, z_est)
        traj_eval.plot("over_episodes", res_dir)
        traj_eval_iter.plot("over_iterations_{}".format(j), res_dir)

    i2c.plot_alphas(res_dir, "final")
    i2c.plot_cost(res_dir, "cost_final")

    x_final, _, _ = env.run(policy)
    s_est = i2c.get_marginal_trajectory()
    env.plot_sim(x_final, s_est, "Final", res_dir)

    # compare against pure feedforward
    policy.zero()
    policy.k = i2c.get_marginal_input()
    x_ff, _, _ = env.run(policy)
    env.plot_sim(x_ff, s_est, "Final Feedforward", res_dir)

    # save model and data
    i2c.sys.save(res_dir)
    save_trajectories(x_final, i2c, res_dir)
    traj_eval.save("episodic", res_dir)
    traj_eval_iter.save("iter", res_dir)
    i2c.save(res_dir, "{}_{}".format(j, i))

    env.close()
Example #3
def ilqr_pendulum(res_dir):
    experiment = pendulum_known
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT,
                           experiment.MODEL)
    policy = TimeIndexedLinearGaussianPolicy(
        0 * np.eye(model.dim_u),
        experiment.N_DURATION, model.dim_u, model.dim_x)
    alpha = 1e4
    Q = experiment.INFERENCE.Q / alpha
    R = experiment.INFERENCE.R / alpha
    xg = model.xag.squeeze()

    def cost(x, u, a, x_lin):
        _J = pendulum_dobs(x_lin)
        _j = pendulum_obs(x_lin) - _J @ x_lin
        _x = _J @ x + _j
        return (_x - xg).T @ Q @ (_x - xg) + u.T @ R @ u

    ilqr = IterativeLqr(
        model, cost, experiment.N_DURATION, 2,
        alphas=np.power(10., np.linspace(0, -10, 50)),
        # mult_lmbda=1.6, # original matlab heuristic
        # mult_lmbda=1.001, good but needs 120 to hit 1.7
        mult_lmbda=1.002, # gets to 1.7 by 80
        tolgrad=1e-14,
        tolfun=1e-14,
        activation='all')
    # cost, cost_iter, full_cost = ilqr.run(200)
    cost, cost_real = ilqr.run(100)

    x, u, c = ilqr.forward_pass(ilqr.ctl, 0.0)
    print(x.shape, u.shape)
    xu = np.vstack((x[:,:-1], u)).T
    env.plot_sim(xu, None, "ilqr pendulum internal", res_dir)

    cost = [c * alpha for c in cost]
    cost_real = [c * alpha for c in cost_real]

    policy.K = np.zeros(policy.K.shape)
    policy.k = ilqr.uref.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "ilqr pendulum feedforward", None)


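    # Convert the iLQR controller u = uref + K (x - xref) into the
    # time-indexed form u_t = K_t x_t + k_t by folding the reference into
    # the feedforward term: k = uref - K @ xref, computed per step below.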
    policy.K = ilqr.ctl.K.reshape(policy.K.shape)
    # policy.k = ilqr.ctl.kff.reshape(policy.k.shape)
    policy.k = np.copy(ilqr.uref.reshape(policy.k.shape))
    Kx = np.einsum("jki,ki->ji", ilqr.ctl.K, ilqr.xref[:,:-1])
    policy.k += -Kx.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "ilqr pendulum", None)

    x = xu[:, :model.dim_x]
    u = xu[:, model.dim_x:]

    ilqr.plot_trajectory()

    f, a = plt.subplots(2)
    a[0].plot(cost, '.-')
    a[1].plot(cost_real, '.-')
    plt.savefig(os.path.join(res_dir, "cost_ilqr_pendulum.png"),
                bbox_inches='tight', format='png')

    x = ilqr.xref[:,:-1].T
    u = ilqr.uref.T

    return x, u, cost, ilqr
Example #4
def gps_pendulum(res_dir):
    experiment = pendulum_known
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT,
                           experiment.MODEL)
    policy = TimeIndexedLinearGaussianPolicy(
        0 * np.eye(model.dim_u),
        experiment.N_DURATION, model.dim_u, model.dim_x)
    alpha = 1e4
    Q = experiment.INFERENCE.Q / alpha
    R = experiment.INFERENCE.R / alpha
    xg = model.xag.squeeze()

    def cost(x, u, a, x_lin):
        _J = pendulum_dobs(x_lin)
        _j = pendulum_obs(x_lin) - _J @ x_lin
        _x = _J @ x + _j
        return (_x - xg).T @ Q @ (_x - xg) + u.T @ R @ u

    # good values: 100 iterations with kl_bound=0.07
    gps = GuidedPolicySearch(
        model, cost, experiment.N_DURATION,
        kl_bound=0.07,
        u_lim=2,
        init_ctl_sigma=2.0,
        init_noise=1e-2,
        activation='all')

    cost = gps.run(100)

    x, u, c = gps.forward_pass(gps.ctl)

    xu = np.vstack((x.mu[:,:-1], u.mu)).T
    env.plot_sim(xu, None, "gps pendulum internal", res_dir)

    cost = [c * alpha for c in cost]

    policy.K = np.zeros(policy.K.shape)
    policy.k = gps.udist.mu.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "gps pendulum feedforward", None)

    policy.K = gps.ctl.K.reshape(policy.K.shape)
    policy.k = gps.ctl.kff.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "gps pendulum", None)

    x = xu[:, :model.dim_x]
    u = xu[:, model.dim_x:]

    gps.plot_trajectory()

    f, a = plt.subplots(2)
    a[0].plot(cost, '.-')
    plt.savefig(os.path.join(res_dir, "cost_gps_pendulum.png"),
                bbox_inches='tight', format='png')

    x = gps.xdist.mu[:,:-1].T
    u = gps.udist.mu.T

    return x, u, cost, gps
Example #5
def ilqr_double_cartpole(res_dir):
    experiment = double_cartpole_known
    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT,
                           experiment.MODEL)
    policy = TimeIndexedLinearGaussianPolicy(
        0 * np.eye(model.dim_u),
        experiment.N_DURATION, model.dim_u, model.dim_x)
    alpha = 1e3
    Q = experiment.INFERENCE.Q / alpha
    R = experiment.INFERENCE.R / alpha
    xg = model.xag.squeeze()

    def cost(x, u, a, x_lin):
        _J = double_cartpole_dobs(x_lin)
        _j = double_cartpole_obs(x_lin) - _J @ x_lin
        _x = _J @ x + _j
        return (_x - xg).T @ Q @ (_x - xg) + u.T @ R @ u

    ilqr = IterativeLqr(
        model, cost, experiment.N_DURATION, 1e9,
        mult_lmbda=1.001,
        init_noise=1e-2,
        alphas=np.power(10., np.linspace(0, -8, 50)),
        tolgrad=1e-7,
        tolfun=1e-7,
        activation='all')
    cost, cost_real = ilqr.run(200)

    x, u, c = ilqr.forward_pass(ilqr.ctl, 0.0)
    xu = np.vstack((x[:,:-1], u)).T
    env.plot_sim(xu, None, "ilqr double cartpole internal", res_dir)

    cost = [c * alpha for c in cost]
    cost_real = [c * alpha for c in cost_real]

    policy.K = np.zeros(policy.K.shape)
    policy.k = ilqr.uref.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "ilqr double cartpole feedforward", res_dir)


    policy.K = ilqr.ctl.K.reshape(policy.K.shape)

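    # As above in ilqr_pendulum: fold the reference trajectory into the
    # feedforward term, k = uref - K @ xref.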
    policy.k = np.copy(ilqr.uref.reshape(policy.k.shape))
    Kx = np.einsum("jki,ki->ji", ilqr.ctl.K, ilqr.xref[:,:-1])
    policy.k += -Kx.reshape(policy.k.shape)
    xu, dx, z = env.run(policy)
    env.plot_sim(xu, None, "ilqr double cartpole", res_dir)

    x = xu[:, :model.dim_x]
    u = xu[:, model.dim_x:]

    ilqr.plot_trajectory()

    f, a = plt.subplots(2)
    a[0].plot(cost, '.-')
    a[1].plot(cost_real, '.-')
    plt.savefig(os.path.join(res_dir, "cost_ilqr_double_cartpole.png"),
                bbox_inches='tight', format='png')

    x = ilqr.xref[:,:-1].T
    u = ilqr.uref.T

    return x, u, cost, ilqr


def main():

    res_dir = "_linear"
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)

    env = make_env(experiment)
    model = make_env_model(experiment.ENVIRONMENT, experiment.MODEL)

    experiment.N_INFERENCE = 1  # reduced from 100
    # Pick the goal state and set the affine term so that xg is a fixed point
    # of the dynamics x' = A x + a, i.e. a = xg - A xg; the environment's
    # affine term must be kept consistent with the model's.
    model.xg = 10 * np.ones((env.dim_x, 1))
    model.xag = model.xg
    model.a = model.xg - model.A.dot(model.xg)
    env.a = model.a.squeeze()

    policy_lqr = TimeIndexedLinearGaussianPolicy(experiment.POLICY_COVAR,
                                                 experiment.N_DURATION,
                                                 model.dim_u, model.dim_x)
    policy_i2c = TimeIndexedLinearGaussianPolicy(experiment.POLICY_COVAR,
                                                 experiment.N_DURATION,
                                                 model.dim_u, model.dim_x)

    ug = np.zeros((env.dim_u, ))
    x_lqr, u_lqr, K_lqr, k_lqr, cost_lqr, P, p = finite_horizon_lqr(
        experiment.N_DURATION, model.A, model.a.squeeze(), model.B,
        experiment.INFERENCE.Q, experiment.INFERENCE.R, model.x0,
        model.xg.squeeze(), ug, model.dim_x, model.dim_u)
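    # (Assumption: finite_horizon_lqr solves the discrete-time Riccati
    # recursion; P and p are the quadratic and linear value-function
    # parameters, compared against the i2c backward messages further down.)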

    policy_lqr.K, policy_lqr.k = K_lqr, k_lqr
    x, y, z = env.run(policy_lqr)
    env.plot_sim(x, None, "LQR actual")

    i2c = I2cGraph(
        model,
        experiment.N_DURATION,
        experiment.INFERENCE.Q,
        experiment.INFERENCE.R,
        1e-5,  # big alpha so control cost dominates
        experiment.INFERENCE.alpha_update_tol,
        experiment.INFERENCE.SIG_U,
        experiment.INFERENCE.msg_iter,
        experiment.INFERENCE.msg_tol,
        experiment.INFERENCE.em_tol,
        experiment.INFERENCE.backwards_contraction,
        None)

    traj_eval = TrajectoryEvaluator(experiment.N_DURATION, i2c.QR, i2c.sg)

    # querying the policy triggers the i2c message passing
    policy_i2c.K, policy_i2c.k, _ = i2c.get_local_linear_policy()

    i2c.plot_traj("State-action Trajectories",
                  lqr_compare=True,
                  dir_name=res_dir)

    plot_controller(i2c, dir_name=res_dir)

    s_est = i2c.get_marginal_trajectory()
    x, y, z = env.run(policy_i2c)
    z_est = i2c.get_marginal_observed_trajectory()
    traj_eval.eval(z, z_est)

    lam_x3 = np.asarray([c.lambda_x3_b
                         for c in i2c.cells]).squeeze() * i2c.alpha
    nu_x3 = np.asarray([c.nu_x3_b for c in i2c.cells]).squeeze() * i2c.alpha
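
    # Sanity check: up to the alpha rescaling, the i2c backward-message
    # precision Lambda_x3_b should match the LQR value matrix P, and the
    # negated mean parameter nu_x3_b should match the linear term p; the
    # figure below overlays both.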

    f, a = plt.subplots(2, 1)
    _P = P.reshape((-1, env.dim_x * 2))
    _lam_x3 = lam_x3.reshape((-1, env.dim_x * 2))
    for i in range(env.dim_x * 2):
        a[0].plot(_P[:, i], 'k+-', label=r"$\mathbold{P}$" if i == 0 else "_")
        a[0].plot(_lam_x3[:, i], 'rx',
                  label=r"$\mathbold{\Lambda}_{\overleftarrow{x}}$"
                  if i == 0 else "_")

    a[0].legend()
    a[0].set_title("Value Function parameters")
    a[0].set_ylabel("Quadratic Weights")
    a[1].set_ylabel("Linear Weights")
    a[1].set_xlabel("Timesteps")
    for i in range(env.dim_x):
        a[1].plot(p[:, i], 'k+-', label=r"$\mathbold{p}$" if i == 0 else "_")
        a[1].plot(-nu_x3[:, i], 'rx',
                  label=r"$-\mathbold{\nu}_{\overleftarrow{x}}$"
                  if i == 0 else "_")
    a[1].legend()
    if res_dir is not None:
        plt.savefig(os.path.join(res_dir, "value.png"),
                    bbox_inches='tight',
                    format='png')
        matplotlib2tikz.save(os.path.join(res_dir, "value.tex"))
        plt.close(f)