Example #1
def backward_pass(env, X, U, tN=50, lamb=1):   
    # Value function at final timestep is known
    l, l_x, l_xx = cost_final(env, X[:, -1])
    V_x = l_x.copy()
    V_xx = l_xx.copy()
    # Allocate space for feedforward and feedback terms
    k = np.zeros((env.action_space.shape[0], tN))
    K = np.zeros((env.action_space.shape[0], env.observation_space.shape[0], tN))
    # Run a backward pass starting from the N-1 control step
    for i in range(tN-2,-1,-1):
        # pdb.set_trace()
        df_dx = approximate_A(env, X[:, i].copy(), U[:, i].copy(), simulate_dynamics_next)
        df_du = approximate_B(env, X[:, i].copy(), U[:, i].copy(), simulate_dynamics_next)
        l, l_x, l_xx, l_u, l_uu, l_ux = cost_inter(env, X[:, i].copy(), U[:, i].copy()) 

        Q_x = l_x + df_dx.T @ V_x
        Q_u = l_u + df_du.T @ V_x
        Q_xx = l_xx + df_dx.T @ V_xx @ df_dx 
        Q_ux = l_ux + df_du.T @ V_xx @ df_dx
        Q_uu = l_uu + df_du.T @ V_xx @ df_du
        # Q_uu_inv = np.linalg.pinv(Q_uu) 
        
        Q_uu_evals, Q_uu_evecs = np.linalg.eig(Q_uu)
        Q_uu_evals[Q_uu_evals < 0] = 0.0
        Q_uu_evals += lamb
        Q_uu_inv = Q_uu_evecs @ np.diag(1.0/Q_uu_evals) @ Q_uu_evecs.T

        # Calculate feedforward and feedback terms
        k[:,i] = -Q_uu_inv @ Q_u
        K[:,:,i] = -Q_uu_inv @ Q_ux
        # Update value function for next time step
        V_x = (Q_x - K[:,:,i].T @ Q_uu @ k[:,i]).copy()
        V_xx = (Q_xx - K[:,:,i].T @ Q_uu @ K[:,:,i]).copy()
    
    return k, K
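For reference, this backward pass follows the standard iLQR recursion (a summary of well-known formulas, not code from this repo):

    Q_x  = l_x  + f_x^T V'_x
    Q_u  = l_u  + f_u^T V'_x
    Q_xx = l_xx + f_x^T V'_xx f_x
    Q_ux = l_ux + f_u^T V'_xx f_x
    Q_uu = l_uu + f_u^T V'_xx f_u

    k = -Q_uu^{-1} Q_u        K = -Q_uu^{-1} Q_ux
    V_x  = Q_x  - K^T Q_uu k
    V_xx = Q_xx - K^T Q_uu K

where V'_x and V'_xx are the value-function derivatives from the next time step. The eigendecomposition block is a Levenberg-Marquardt-style regularization: negative eigenvalues of Q_uu are clipped to zero and lamb is added before inverting, which keeps the inverse positive definite and bounds the step size.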
Example #2
def ilqr_forward(U, X, d, n, sim_env, tN):
    """
    Forward pass of the iLQR algorithm.

    Builds the costs, the linearized dynamics f(x), and their derivatives for all time steps.
    
    Parameters
    ----------
    U: control sequences
    X: state sequences produced by the control sequences
    d: degrees of freedom (the same in env and sim_env)
    n: number of state dimensions
    sim_env: simulation env used to approximate A and B for f(x_{t+1}), df/dx, df/du
    tN: number of time steps 
    
    Returns
    -------
    f_u_s: list of df/du (for all time steps)
    f_x_s: list of df/dx
    l_u_s: list of dl/du
    l_uu_s: list of d^2l/du^2
    l_ux_s: list of d(dl/du)/dx
    l_x_s: list of dl/dx
    l_xx_s: list of d^2l/dx^2
    l_s: list of costs
    """
    # ===== calculate all derivatives =====
    f_x_s = np.zeros((tN, n, n))
    f_u_s = np.zeros((tN, n, d))
    l_s = np.zeros((tN, 1))
    l_x_s = np.zeros((tN, n))
    l_xx_s = np.zeros((tN, n, n))
    l_u_s = np.zeros((tN, d))
    l_uu_s = np.zeros((tN, d, d))
    l_ux_s = np.zeros((tN, d, n))

    # populate costs and derivatives
    for t in range(tN - 1):
        # A and B are the Jacobians of the linearized continuous dynamics xdot ≈ A*x_t + B*u_t
        A = ctl.approximate_A(sim_env, X[t], U[t])
        B = ctl.approximate_B(sim_env, X[t], U[t])
        f_x_s[t] = A * sim_env.dt + np.eye(n)  # Euler discretization: df/dx = I + A*dt
        f_u_s[t] = B * sim_env.dt

        # then the running costs and their derivatives
        l, l_x, l_xx, l_u, l_uu, l_ux = cost_inter(sim_env, X[t], U[t])
        l_s[t] = l * sim_env.dt
        l_x_s[t] = l_x * sim_env.dt
        l_xx_s[t] = l_xx * sim_env.dt
        l_u_s[t] = l_u * sim_env.dt
        l_uu_s[t] = l_uu * sim_env.dt
        l_ux_s[t] = l_ux * sim_env.dt

    # and separately for final state
    l_s[-1], l_x_s[-1], l_xx_s[-1] = cost_final(sim_env, X[tN - 1])

    return f_u_s, f_x_s, l_u_s, l_uu_s, l_ux_s, l_x_s, l_xx_s, l_s
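The f_x_s and f_u_s assignments come from a forward-Euler discretization of the linearized continuous dynamics (a standard identity, restated here for clarity):

    x_{t+1} ≈ x_t + (A x_t + B u_t) dt = (I + A dt) x_t + (B dt) u_t

so df/dx = I + A*dt and df/du = B*dt. The running costs and their derivatives are multiplied by dt for the same reason: each term approximates the integral of the continuous-time cost over one interval of length dt.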
Example #3
def calc_ilqr_input(env, sim_env, tN=50, max_iter=1e6):
    """Calculate the optimal control input for the given state.


    Parameters
    ----------
    env: gym.core.Env
      This is the true environment you will execute the computed
      commands on. Use this environment to get the Q and R values as
      well as the state.
    sim_env: gym.core.Env
      A copy of the env class. Use this to simulate the dynamics when
      doing finite differences.
    tN: number of control steps you are going to execute
    max_iter: max iterations for optimization

    Returns
    -------
    U: np.array
      The SEQUENCE of commands to execute. The size should be (tN, #parameters)
    """
    x0 = env.state.copy()
    Q = env.Q
    R = env.R
    # U = np.array([env.action_space.sample() for _ in range(tN)])
    U = np.zeros((tN, 2))
    m = x0.shape[0]
    n = U[0].shape[0]
    dt = 1e-3
    cost = 0
    reg = np.eye(n) * 1.0
    costs = []

    for i in range(int(max_iter)):
        # Get state trajectory
        X = simulate(sim_env, x0, U)
        assert U.shape[0] == tN
        assert X.shape[0] == tN + 1

        # Initialize placeholders
        l = np.zeros((tN + 1, ))
        l_x = np.zeros((tN + 1, m))
        l_xx = np.zeros((tN + 1, m, m))
        l_u = np.zeros((tN, n))
        l_uu = np.zeros((tN, n, n))
        l_ux = np.zeros((tN, n, m))
        f_x = np.zeros((tN, m, m))
        f_u = np.zeros((tN, m, n))
        V_x = np.zeros((tN + 1, m))
        V_xx = np.zeros((tN + 1, m, m))
        k = np.zeros((tN, n))
        K = np.zeros((tN, n, m))

        # Calculate all costs and partial derivatives
        for t in range(tN):
            x, u = X[t], U[t]

            (l[t], l_x[t, :], l_xx[t, :, :], l_u[t, :], l_uu[t, :, :],
             l_ux[t, :, :]) = cost_inter(sim_env, x, u)

            # Approximate xdot(t) = A x(t) + B u(t), and x(t+1) = x(t) + xdot(t) * dt
            # So later x(t+1) = x(t) + (A x(t) + B u(t)) * dt
            A = approximate_A(sim_env, x, u)
            B = approximate_B(sim_env, x, u)

            # Dynamics is x(t+1) = f(x(t), u(t))
            # Partial derivatives of f wrt x = I + A * dt
            f_x[t, :, :] = np.eye(m) + A * dt
            # Partial derivatives of f wrt u = 0 + B * dt
            f_u[t, :, :] = B * dt

        l *= dt
        l_x *= dt
        l_xx *= dt
        l_u *= dt
        l_uu *= dt
        l_ux *= dt
        l[tN], l_x[tN, :], l_xx[tN, :, :] = cost_final(sim_env, X[-1])

        # Check for early convergence
        # ===========================
        curr_cost = l.sum()
        costs.append(curr_cost)
        if cost != 0:
            diff_perc = np.abs((curr_cost - cost) / cost)
            # print(f"Iter ({i}): Old Cost: {cost:.2f} Curr Cost: {curr_cost:.2f} Diff Perc: {diff_perc:.4f}")
            if diff_perc < 1e-3:
                print(f"Exiting early at iteration {i}")
                return U, costs
        cost = curr_cost

        # Start Dynamic Programming for Backpass
        # ======================================

        # Initial values from the back
        V_x[tN, :] = l_x[tN, :].copy()
        V_xx[tN, :, :] = l_xx[tN, :, :].copy()

        for t in reversed(range(tN)):
            Q_x = l_x[t] + f_x[t].T @ V_x[t + 1]
            Q_u = l_u[t] + f_u[t].T @ V_x[t + 1]
            Q_xx = l_xx[t] + f_x[t].T @ V_xx[t + 1] @ f_x[t]
            Q_ux = l_ux[t] + f_u[t].T @ V_xx[t + 1] @ f_x[t]
            Q_uu = l_uu[t] + f_u[t].T @ V_xx[t + 1] @ f_u[t]

            # Safe inverse with regularization
            Q_uu_inv = pinv(Q_uu + reg)
            k[t, :] = -Q_uu_inv @ Q_u
            K[t, :, :] = -Q_uu_inv @ Q_ux

            # Value-function gradient and Hessian for the previous timestep
            V_x[t] = Q_x - K[t].T @ Q_uu @ k[t]
            V_xx[t] = Q_xx - K[t].T @ Q_uu @ K[t]

        # Forward Pass
        # ============
        updated_U = np.zeros_like(U)
        updated_X = np.zeros_like(X)
        updated_X[0, :] = x0.copy()

        for t in range(tN):
            new_x = updated_X[t]
            new_u = U[t] + K[t] @ (new_x - X[t]) + k[t]
            next_x = simulate_dynamics_next(sim_env, new_x, new_u)

            updated_U[t, :] = new_u
            updated_X[t + 1, :] = next_x

        X = updated_X.copy()
        U = updated_U.copy()
        final_l = l.copy()

    return U, costs
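A minimal sketch of how this routine might be driven end to end. The environment id, the deepcopy step, and the horizon below are assumptions for illustration, not part of the original code:

import copy
import gym
import numpy as np

env = gym.make('TwoLinkArm-v0')   # hypothetical env id with .state, .Q, .R attributes
env.reset()
sim_env = copy.deepcopy(env)      # copy used only for finite-difference dynamics

U, costs = calc_ilqr_input(env, sim_env, tN=50, max_iter=100)
for u in U:                       # execute the planned command sequence on the real env
    obs, reward, done, info = env.step(u)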
Example #4
def calc_ilqr_input(env, sim_env, tN=100, max_iter=1e5):
    """Calculate the optimal control input for the given state.


    Parameters
    ----------
    env: gym.core.Env
      This is the true environment you will execute the computed
      commands on. Use this environment to get the Q and R values as
      well as the state.
    sim_env: gym.core.Env
      A copy of the env class. Use this to simulate the dynamics when
      doing finite differences.
    tN: number of control steps you are going to execute
    max_iter: max iterations for optimization

    Returns
    -------
    U: np.array
      The SEQUENCE of commands to execute. The size should be (tN, #parameters)
    """
    x0 = env.state
    sim_env.state = x0.copy()

    dof = 2
    num_states = 4
    cost_list = []

    U = np.zeros((tN - 1, dof))
    # U = np.random.uniform(-10, 10, (tN - 1, dof))

    X, cost = simulate(sim_env, x0, U)
    cost_list.append(cost)
    for ii in range(int(max_iter)):

        f_x = np.zeros((tN - 1, num_states, num_states))
        f_u = np.zeros((tN - 1, num_states, dof))
        l = np.zeros((tN, 1))
        l_x = np.zeros((tN, num_states))
        l_xx = np.zeros((tN, num_states, num_states))
        l_u = np.zeros((tN, dof))
        l_uu = np.zeros((tN, dof, dof))
        l_ux = np.zeros((tN, dof, num_states))

        for t in range(tN - 1):
            A = approximate_A(sim_env, X[t], U[t])
            B = approximate_B(sim_env, X[t], U[t])
            # A = approximate_A_discrete(sim_env, X[t], U[t])
            # B = approximate_B_discrete(sim_env, X[t], U[t])
            f_x[t] = np.eye(num_states) + A * dt
            f_u[t] = B * dt

            l[t], l_x[t], l_xx[t], l_u[t], l_uu[t], l_ux[t] = cost_inter(
                sim_env, X[t], U[t])
            l[t] *= dt
            l_x[t] *= dt
            l_xx[t] *= dt
            l_u[t] *= dt
            l_uu[t] *= dt
            l_ux[t] *= dt
        l[-1], l_x[-1], l_xx[-1] = cost_final(sim_env, X[-1])
        l[-1] *= dt
        l_x[-1] *= dt
        l_xx[-1] *= dt

        V_x = l_x[-1].copy()  # dV / dx
        V_xx = l_xx[-1].copy()  # d^2 V / dx^2
        k = np.zeros((tN - 1, dof))
        K = np.zeros((tN - 1, dof, num_states))

        for t in range(tN - 2, -1, -1):

            Q_x = l_x[t] + np.dot(f_x[t].T, V_x)
            Q_u = l_u[t] + np.dot(f_u[t].T, V_x)

            Q_xx = l_xx[t] + np.dot(f_x[t].T, np.dot(V_xx, f_x[t]))
            Q_ux = l_ux[t] + np.dot(f_u[t].T, np.dot(V_xx, f_x[t]))
            Q_uu = l_uu[t] + np.dot(f_u[t].T, np.dot(V_xx, f_u[t]))

            Q_uu_inv = np.linalg.pinv(Q_uu + 1e-2 * np.eye(2))
            # Q_uu_inv = inv_stable(Q_uu)

            k[t] = -np.dot(Q_uu_inv, Q_u)
            K[t] = -np.dot(Q_uu_inv, Q_ux)

            V_x = Q_x - np.dot(K[t].T, np.dot(Q_uu, k[t]))
            V_xx = Q_xx - np.dot(K[t].T, np.dot(Q_uu, K[t]))
        U_ = np.zeros((tN - 1, dof))
        x_ = x0.copy()  # 7a)

        for t in range(tN - 1):
            U_[t] = U[t] + k[t] + np.dot(K[t], x_ - X[t])
            x_ = simulate_dynamics_next(sim_env, x_, U_[t])

        Xnew, costnew = simulate(sim_env, x0, U_)
        # print(cost)
        # print(costnew)
        cost_list.append(costnew)

        X = np.copy(Xnew)
        U = np.copy(U_)
        oldcost = np.copy(cost)
        cost = np.copy(costnew)

        if abs(oldcost - cost) < EPS_CONVERGE:
            break
    # print(cost)
    return U, cost, cost_list
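The commented-out inv_stable call above hints at a regularized inverse of Q_uu. A minimal sketch of such a helper, modeled on the eigenvalue clipping used in Example #1 (the name and default lamb are assumptions inferred from the call site):

def inv_stable(Q_uu, lamb=1.0):
    # Q_uu is symmetric, so use the symmetric eigendecomposition
    evals, evecs = np.linalg.eigh(Q_uu)
    # clip negative eigenvalues and add lamb so the inverse stays positive definite
    evals[evals < 0] = 0.0
    evals += lamb
    return evecs @ np.diag(1.0 / evals) @ evecs.T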
Example #5
def calc_ilqr_input(env, sim_env, x0, u_seq, tN=50, max_iter=1e6):
    """Calculate the optimal control input for the given state.


    Parameters
    ----------
    env: gym.core.Env
      This is the true environment you will execute the computed
      commands on. Use this environment to get the Q and R values as
      well as the state.
    sim_env: gym.core.Env
      A copy of the env class. Use this to simulate the dynamics when
      doing finite differences.
    tN: number of control steps you are going to execute
    max_iter: max iterations for optimization

    Returns
    -------
    U: np.array
      The SEQUENCE of commands to execute. The size should be (tN, #parameters)
    """
    
    state_dim = x0.shape[0]
    action_dim = u_seq.shape[1]
    
    lamb = 1.0
    lamb_fac = 2.0
    lamb_max = 1000

    iter_num = 0
    gen_new_traj = True 


    while iter_num < max_iter:
    
        if gen_new_traj:
            x_seq, cost, reward = simulate(sim_env, x0, u_seq)
            old_cost = cost * 1.0
            env.state = x0 * 1.0
            
            l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u = reset_cost_dynamics(tN,state_dim, action_dim)

            l[-1], l_x[-1], l_xx[-1] = cost_final(sim_env, x_seq[-1])
            
            for i in range(tN-1, -1, -1):
                x = x_seq[i] * 1.0
                u = u_seq[i] * 1.0

                l[i], l_x[i], l_xx[i], l_u[i], l_uu[i], l_ux[i] = cost_inter(sim_env, x, u)
                 
                A = approximate_A(sim_env, simulate_dynamics_next, x, u)
                B = approximate_B(sim_env, simulate_dynamics_next, x, u)

                f_x[i] = A * 1.0
                f_u[i] = B * 1.0

                gen_new_traj = False
            
            k = np.zeros((tN, action_dim))
            K = np.zeros((tN, action_dim, state_dim))

            k, K = backward_recursion(k, K, l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u, lamb)

    return np.zeros((50, 2))
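Examples #5 and #6 delegate the backward pass to a backward_recursion helper that is not shown on this page. A minimal sketch of what it could look like, inferred from the call site and from the recursion in Example #1 (this body is an assumption, not the repo's actual implementation):

def backward_recursion(k, K, l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u, lamb):
    tN = k.shape[0]
    # start from the final-cost derivatives and sweep backwards in time
    V_x = l_x[-1].copy()
    V_xx = l_xx[-1].copy()
    for t in range(tN - 1, -1, -1):
        Q_x = l_x[t] + f_x[t].T @ V_x
        Q_u = l_u[t] + f_u[t].T @ V_x
        Q_xx = l_xx[t] + f_x[t].T @ V_xx @ f_x[t]
        Q_ux = l_ux[t] + f_u[t].T @ V_xx @ f_x[t]
        Q_uu = l_uu[t] + f_u[t].T @ V_xx @ f_u[t]
        # regularize Q_uu with lamb before inverting (Levenberg-Marquardt style)
        Q_uu_inv = np.linalg.pinv(Q_uu + lamb * np.eye(Q_uu.shape[0]))
        k[t] = -Q_uu_inv @ Q_u
        K[t] = -Q_uu_inv @ Q_ux
        V_x = Q_x - K[t].T @ Q_uu @ k[t]
        V_xx = Q_xx - K[t].T @ Q_uu @ K[t]
    return k, K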
Example #6
def calc_ilqr_input(env, sim_env, x0, u_seq, tN=50, max_iter=1e6):
    """Calculate the optimal control input for the given state.


    Parameters
    ----------
    env: gym.core.Env
      This is the true environment you will execute the computed
      commands on. Use this environment to get the Q and R values as
      well as the state.
    sim_env: gym.core.Env
      A copy of the env class. Use this to simulate the dynamics when
      doing finite differences.
    tN: number of control steps you are going to execute
    max_iter: max iterations for optimization

    Returns
    -------
    U: np.array
      The SEQUENCE of commands to execute. The size should be (tN, #parameters)
    """

    state_dim = x0.shape[0]
    action_dim = u_seq.shape[1]

    lamb = 1.0
    lamb_fac = 2.0
    lamb_max = 1000
    alpha = 1.0

    iter_num = 0

    gen_new_traj = True
    total_cost = []
    while iter_num < max_iter:
        # print("iter: {}".format(iter_num))

        if gen_new_traj:

            x_seq, cost = simulate(sim_env, x0, u_seq)
            old_cost = cost * 1.0
            env.state = x0 * 1.0

            l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u = reset_cost_dynamics(
                tN, state_dim, action_dim)

            l[-1], l_x[-1], l_xx[-1] = cost_final(sim_env, x_seq[-1])

            for i in range(tN - 1, -1, -1):
                x = x_seq[i] * 1.0
                u = u_seq[i] * 1.0

                l[i], l_x[i], l_xx[i], l_u[i], l_uu[i], l_ux[i] = cost_inter(
                    sim_env, x, u)

                A = approximate_A(sim_env, simulate_dynamics_next, x, u)
                B = approximate_B(sim_env, simulate_dynamics_next, x, u)

                f_x[i] = A * 1.0
                f_u[i] = B * 1.0
                gen_new_traj = False

        k = np.zeros((tN, action_dim))
        K = np.zeros((tN, action_dim, state_dim))

        # k: open loop term; K: feedback gain term
        k, K = backward_recursion(k, K, l, l_x, l_xx, l_u, l_uu, l_ux, f_x,
                                  f_u, lamb)

        u_seq_new = np.zeros((tN, action_dim))

        x_new = x_seq[0] * 1.0
        sim_env.state = x_new * 1.0

        for i in range(0, tN - 1):
            u_seq_new[i] = u_seq[i] + alpha * k[i] + np.matmul(
                K[i], x_new - x_seq[i])
            x_new, _, _, _ = sim_env.step(u_seq_new[i])

        x_seq_new, new_cost = simulate(env, x0, u_seq_new)

        total_cost.append(new_cost * 1.0)
        if new_cost < cost:
            gen_new_traj = True
            lamb /= lamb_fac
            x_seq = x_seq_new * 1.0
            u_seq = u_seq_new * 1.0
            alpha = 1.0
            old_cost = cost * 1.0
            cost = new_cost * 1.0
            if (iter_num > 0 and abs(old_cost - cost) < 0.001 * cost):
                # print("new_cost: {}".format(cost))
                # cost = new_cost * 1.0
                break
            # cost = new_cost*1.0

        else:
            lamb *= lamb_fac
            if lamb > lamb_max:
                # print("cost: {}".format(cost))
                break

        iter_num += 1
    return u_seq, total_cost
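A minimal usage sketch for this variant, which also takes the initial state and an initial control sequence. The environment setup and shapes below are assumptions for illustration, not part of the original code:

import copy
import numpy as np

sim_env = copy.deepcopy(env)        # env assumed to be a gym-style arm environment with a .state attribute
x0 = env.state.copy()
u_seq0 = np.zeros((50, 2))          # tN = 50 steps, 2 actuators

u_seq, total_cost = calc_ilqr_input(env, sim_env, x0, u_seq0, tN=50, max_iter=200)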