import numpy as np


def backward_pass(env, X, U, tN=50, lamb=1):
    # Value function at the final timestep is known from the terminal cost
    l, l_x, l_xx = cost_final(env, X[:, -1])
    V_x = l_x.copy()
    V_xx = l_xx.copy()

    # Allocate space for the feedforward (k) and feedback (K) terms
    k = np.zeros((env.action_space.shape[0], tN))
    K = np.zeros((env.action_space.shape[0], env.observation_space.shape[0], tN))

    # Run a backward pass from the N-1 control step
    for i in range(tN - 2, -1, -1):
        df_dx = approximate_A(env, X[:, i].copy(), U[:, i].copy(), simulate_dynamics_next)
        df_du = approximate_B(env, X[:, i].copy(), U[:, i].copy(), simulate_dynamics_next)
        l, l_x, l_xx, l_u, l_uu, l_ux = cost_inter(env, X[:, i].copy(), U[:, i].copy())

        Q_x = l_x + df_dx.T @ V_x
        Q_u = l_u + df_du.T @ V_x
        Q_xx = l_xx + df_dx.T @ V_xx @ df_dx
        Q_ux = l_ux + df_du.T @ V_xx @ df_dx
        Q_uu = l_uu + df_du.T @ V_xx @ df_du

        # Levenberg-Marquardt style regularization: clamp negative eigenvalues
        # to zero and add lamb before inverting, so the inverse is always PD.
        # eigh (not eig) is appropriate since Q_uu is symmetric.
        Q_uu_evals, Q_uu_evecs = np.linalg.eigh(Q_uu)
        Q_uu_evals[Q_uu_evals < 0] = 0.0
        Q_uu_evals += lamb
        Q_uu_inv = Q_uu_evecs @ np.diag(1.0 / Q_uu_evals) @ Q_uu_evecs.T

        # Calculate feedforward and feedback terms
        k[:, i] = -Q_uu_inv @ Q_u
        K[:, :, i] = -Q_uu_inv @ Q_ux

        # Update the value function for the next (earlier) time step
        V_x = (Q_x - K[:, :, i].T @ Q_uu @ k[:, i]).copy()
        V_xx = (Q_xx - K[:, :, i].T @ Q_uu @ K[:, :, i]).copy()

    return k, K
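# backward_pass() above linearizes the dynamics through approximate_A /
# approximate_B. Their signatures vary between the variants in this section;
# a minimal sketch is given below, assuming this variant's signature
# approximate_A(env, x, u, simulate_dynamics_next), where
# simulate_dynamics_next(env, x, u) returns the next state (so the result is
# the discrete Jacobian, used directly as f_x; the later variants instead
# assume a continuous-time A and form I + A*dt themselves). The step size
# delta is a hypothetical choice, not taken from the original code.
def approximate_A(env, x, u, simulate_dynamics_next, delta=1e-5):
    """Finite-difference Jacobian df/dx of the discrete dynamics."""
    n = x.shape[0]
    A = np.zeros((n, n))
    for j in range(n):
        x_plus, x_minus = x.copy(), x.copy()
        x_plus[j] += delta
        x_minus[j] -= delta
        # Central difference along the j-th state coordinate
        A[:, j] = (simulate_dynamics_next(env, x_plus, u)
                   - simulate_dynamics_next(env, x_minus, u)) / (2 * delta)
    return A


def approximate_B(env, x, u, simulate_dynamics_next, delta=1e-5):
    """Finite-difference Jacobian df/du of the discrete dynamics."""
    n, m = x.shape[0], u.shape[0]
    B = np.zeros((n, m))
    for j in range(m):
        u_plus, u_minus = u.copy(), u.copy()
        u_plus[j] += delta
        u_minus[j] -= delta
        # Central difference along the j-th control coordinate
        B[:, j] = (simulate_dynamics_next(env, x, u_plus)
                   - simulate_dynamics_next(env, x, u_minus)) / (2 * delta)
    return B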
def ilqr_forward(U, X, d, n, sim_env, tN):
    """Forward pass of the iLQR algorithm.

    Builds the costs, the linearized dynamics f(x), and their derivatives
    for all time steps.

    Parameters
    ----------
    U: control sequence
    X: state sequence produced by the control sequence
    d: degrees of freedom (the same in env and sim_env)
    n: state dimension
    sim_env: simulation environment used to approximate A and B, i.e. to
        compute f(x_{t+1}), df/dx, and df/du
    tN: number of time steps

    Returns
    -------
    f_u_s: list of df/du (for all time steps)
    f_x_s: list of df/dx
    l_u_s: list of dl/du
    l_uu_s: list of d^2l/du^2
    l_ux_s: list of d(dl/du)/dx
    l_x_s: list of dl/dx
    l_xx_s: list of d^2l/dx^2
    l_s: list of costs
    """
    # ===== calculate all derivatives =====
    f_x_s = np.zeros((tN, n, n))
    f_u_s = np.zeros((tN, n, d))
    l_s = np.zeros((tN, 1))
    l_x_s = np.zeros((tN, n))
    l_xx_s = np.zeros((tN, n, n))
    l_u_s = np.zeros((tN, d))
    l_uu_s = np.zeros((tN, d, d))
    l_ux_s = np.zeros((tN, d, n))

    # populate costs and derivatives
    for t in range(tN - 1):
        # We need A and B for the linearization xdot = A x_t + B u_t
        A = ctl.approximate_A(sim_env, X[t], U[t])
        B = ctl.approximate_B(sim_env, X[t], U[t])
        # Euler discretization: x_{t+1} = x_t + xdot * dt,
        # so df/dx = I + A*dt and df/du = B*dt
        f_x_s[t] = A * sim_env.dt + np.eye(n)
        f_u_s[t] = B * sim_env.dt

        # Running costs and their derivatives, scaled by dt
        l, l_x, l_xx, l_u, l_uu, l_ux = cost_inter(sim_env, X[t], U[t])
        l_s[t] = l * sim_env.dt
        l_x_s[t] = l_x * sim_env.dt
        l_xx_s[t] = l_xx * sim_env.dt
        l_u_s[t] = l_u * sim_env.dt
        l_uu_s[t] = l_uu * sim_env.dt
        l_ux_s[t] = l_ux * sim_env.dt

    # and separately for the final state
    l_s[-1], l_x_s[-1], l_xx_s[-1] = cost_final(sim_env, X[tN - 1])
    return f_u_s, f_x_s, l_u_s, l_uu_s, l_ux_s, l_x_s, l_xx_s, l_s
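# cost_inter / cost_final are not defined in this section. A minimal sketch,
# assuming the usual quadratic cost built from the environment's Q and R
# matrices (as the variants below hint at via env.Q and env.R). The env.goal
# attribute and the exact weighting are assumptions for illustration only.
def cost_inter(env, x, u):
    """Running cost l = u'Ru and its first/second derivatives."""
    R = env.R
    l = u.T @ R @ u
    l_x = np.zeros_like(x)
    l_xx = np.zeros((x.shape[0], x.shape[0]))
    l_u = 2 * R @ u
    l_uu = 2 * R
    l_ux = np.zeros((u.shape[0], x.shape[0]))
    return l, l_x, l_xx, l_u, l_uu, l_ux


def cost_final(env, x):
    """Terminal cost l = (x - goal)'Q(x - goal) and its derivatives."""
    Q = env.Q
    err = x - env.goal  # env.goal is a hypothetical attribute
    l = err.T @ Q @ err
    l_x = 2 * Q @ err
    l_xx = 2 * Q
    return l, l_x, l_xx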
def calc_ilqr_input(env, sim_env, tN=50, max_iter=1e6):
    """Calculate the optimal control input for the given state.

    Parameters
    ----------
    env: gym.core.Env
        The true environment you will execute the computed commands on.
        Use this environment to get the Q and R values as well as the state.
    sim_env: gym.core.Env
        A copy of the env class. Use this to simulate the dynamics when
        doing finite differences.
    tN: number of control steps you are going to execute
    max_iter: max iterations for optimization

    Returns
    -------
    U: np.array
        The SEQUENCE of commands to execute. The size should be
        (tN, #parameters).
    """
    x0 = env.state.copy()
    Q = env.Q
    R = env.R
    U = np.zeros((tN, 2))
    m = x0.shape[0]
    n = U[0].shape[0]
    dt = 1e-3
    cost = 0
    reg = np.eye(n) * 1.0
    costs = []

    for i in range(int(max_iter)):
        # Get the state trajectory for the current control sequence
        X = simulate(sim_env, x0, U)
        assert U.shape[0] == tN
        assert X.shape[0] == tN + 1

        # Initialize placeholders
        l = np.zeros((tN + 1,))
        l_x = np.zeros((tN + 1, m))
        l_xx = np.zeros((tN + 1, m, m))
        l_u = np.zeros((tN, n))
        l_uu = np.zeros((tN, n, n))
        l_ux = np.zeros((tN, n, m))
        f_x = np.zeros((tN, m, m))
        f_u = np.zeros((tN, m, n))
        V_x = np.zeros((tN + 1, m))
        V_xx = np.zeros((tN + 1, m, m))
        k = np.zeros((tN, n))
        K = np.zeros((tN, n, m))

        # Calculate all costs and partial derivatives
        for t in range(tN):
            x, u = X[t], U[t]
            l[t], l_x[t], l_xx[t], l_u[t], l_uu[t], l_ux[t] = cost_inter(sim_env, x, u)
            # Approximate xdot(t) = A x(t) + B u(t), so that
            # x(t+1) = x(t) + (A x(t) + B u(t)) * dt
            A = approximate_A(sim_env, x, u)
            B = approximate_B(sim_env, x, u)
            # Dynamics is x(t+1) = f(x(t), u(t));
            # partial derivative of f wrt x is I + A * dt
            f_x[t] = np.eye(m) + A * dt
            # partial derivative of f wrt u is B * dt
            f_u[t] = B * dt

        l *= dt
        l_x *= dt
        l_xx *= dt
        l_u *= dt
        l_uu *= dt
        l_ux *= dt
        l[tN], l_x[tN], l_xx[tN] = cost_final(sim_env, X[-1])

        # Check for early convergence
        # ===========================
        curr_cost = l.sum()
        costs.append(curr_cost)
        if cost != 0:
            diff_perc = np.abs((curr_cost - cost) / cost)
            if diff_perc < 1e-3:
                print(f"Exiting early at iteration {i}")
                return U, costs
        cost = curr_cost

        # Start dynamic programming for the backward pass
        # ===============================================
        # Initial values from the back
        V_x[tN] = l_x[tN].copy()
        V_xx[tN] = l_xx[tN].copy()
        for t in reversed(range(tN)):
            Q_x = l_x[t] + f_x[t].T @ V_x[t + 1]
            Q_u = l_u[t] + f_u[t].T @ V_x[t + 1]
            Q_xx = l_xx[t] + f_x[t].T @ V_xx[t + 1] @ f_x[t]
            Q_ux = l_ux[t] + f_u[t].T @ V_xx[t + 1] @ f_x[t]
            Q_uu = l_uu[t] + f_u[t].T @ V_xx[t + 1] @ f_u[t]

            # Safe inverse with regularization
            Q_uu_inv = np.linalg.pinv(Q_uu + reg)
            k[t] = -Q_uu_inv @ Q_u
            K[t] = -Q_uu_inv @ Q_ux

            # Value-function gradients for the previous timestep
            V_x[t] = Q_x - K[t].T @ Q_uu @ k[t]
            V_xx[t] = Q_xx - K[t].T @ Q_uu @ K[t]

        # Forward pass
        # ============
        updated_U = np.zeros_like(U)
        updated_X = np.zeros_like(X)
        updated_X[0] = x0.copy()
        for t in range(tN):
            new_x = updated_X[t]
            new_u = U[t] + K[t] @ (new_x - X[t]) + k[t]
            updated_X[t + 1] = simulate_dynamics_next(sim_env, new_x, new_u)
            updated_U[t] = new_u
        X = updated_X.copy()
        U = updated_U.copy()

    return U, costs
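# The variant above asserts X.shape[0] == tN + 1, so its simulate() helper
# must roll the controls out through simulate_dynamics_next and return the
# full state trajectory. A minimal sketch under that assumption (the variants
# below use a simulate() that additionally returns the rollout cost):
def simulate(sim_env, x0, U):
    """Roll out controls U from x0; returns states of shape (tN+1, m)."""
    tN = U.shape[0]
    X = np.zeros((tN + 1, x0.shape[0]))
    X[0] = x0.copy()
    for t in range(tN):
        X[t + 1] = simulate_dynamics_next(sim_env, X[t], U[t])
    return X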
def calc_ilqr_input(env, sim_env, tN=100, max_iter=1e5):
    """Calculate the optimal control input for the given state.

    Parameters
    ----------
    env: gym.core.Env
        The true environment you will execute the computed commands on.
        Use this environment to get the Q and R values as well as the state.
    sim_env: gym.core.Env
        A copy of the env class. Use this to simulate the dynamics when
        doing finite differences.
    tN: number of control steps you are going to execute
    max_iter: max iterations for optimization

    Returns
    -------
    U: np.array
        The SEQUENCE of commands to execute. The size should be
        (tN, #parameters).
    """
    x0 = env.state
    sim_env.state = x0.copy()
    dof = 2
    num_states = 4
    dt = sim_env.dt  # integration step; assumed to be exposed by the env
    cost_list = []

    U = np.zeros((tN - 1, dof))
    X, cost = simulate(sim_env, x0, U)
    cost_list.append(cost)

    for ii in range(int(max_iter)):
        f_x = np.zeros((tN - 1, num_states, num_states))
        f_u = np.zeros((tN - 1, num_states, dof))
        l = np.zeros((tN, 1))
        l_x = np.zeros((tN, num_states))
        l_xx = np.zeros((tN, num_states, num_states))
        l_u = np.zeros((tN, dof))
        l_uu = np.zeros((tN, dof, dof))
        l_ux = np.zeros((tN, dof, num_states))

        for t in range(tN - 1):
            A = approximate_A(sim_env, X[t], U[t])
            B = approximate_B(sim_env, X[t], U[t])
            f_x[t] = np.eye(num_states) + A * dt
            f_u[t] = B * dt
            l[t], l_x[t], l_xx[t], l_u[t], l_uu[t], l_ux[t] = cost_inter(sim_env, X[t], U[t])
            l[t] *= dt
            l_x[t] *= dt
            l_xx[t] *= dt
            l_u[t] *= dt
            l_uu[t] *= dt
            l_ux[t] *= dt

        l[-1], l_x[-1], l_xx[-1] = cost_final(sim_env, X[-1])
        l[-1] *= dt
        l_x[-1] *= dt
        l_xx[-1] *= dt

        V_x = l_x[-1].copy()    # dV / dx
        V_xx = l_xx[-1].copy()  # d^2 V / dx^2
        k = np.zeros((tN - 1, dof))
        K = np.zeros((tN - 1, dof, num_states))

        for t in range(tN - 2, -1, -1):
            Q_x = l_x[t] + np.dot(f_x[t].T, V_x)
            Q_u = l_u[t] + np.dot(f_u[t].T, V_x)
            Q_xx = l_xx[t] + np.dot(f_x[t].T, np.dot(V_xx, f_x[t]))
            Q_ux = l_ux[t] + np.dot(f_u[t].T, np.dot(V_xx, f_x[t]))
            Q_uu = l_uu[t] + np.dot(f_u[t].T, np.dot(V_xx, f_u[t]))

            # Tikhonov-regularized pseudo-inverse
            Q_uu_inv = np.linalg.pinv(Q_uu + 1e-2 * np.eye(dof))
            # Q_uu_inv = inv_stable(Q_uu)  # alternative, sketched after this function
            k[t] = -np.dot(Q_uu_inv, Q_u)
            K[t] = -np.dot(Q_uu_inv, Q_ux)
            V_x = Q_x - np.dot(K[t].T, np.dot(Q_uu, k[t]))
            V_xx = Q_xx - np.dot(K[t].T, np.dot(Q_uu, K[t]))

        # Forward pass: apply feedforward and feedback corrections
        U_ = np.zeros((tN - 1, dof))
        x_ = x0.copy()
        for t in range(tN - 1):
            U_[t] = U[t] + k[t] + np.dot(K[t], x_ - X[t])
            x_ = simulate_dynamics_next(sim_env, x_, U_[t])

        Xnew, costnew = simulate(sim_env, x0, U_)
        cost_list.append(costnew)
        X = np.copy(Xnew)
        U = np.copy(U_)
        oldcost = cost
        cost = costnew
        if abs(oldcost - cost) < EPS_CONVERGE:
            break

    return U, cost, cost_list
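# A sketch of the inv_stable() alternative referenced in the comment above:
# the same eigenvalue-clamped, Levenberg-Marquardt style inverse used in
# backward_pass() at the top of this section. The default lamb value is an
# assumption.
def inv_stable(Q_uu, lamb=1.0):
    """Invert Q_uu after clamping negative eigenvalues and adding lamb."""
    evals, evecs = np.linalg.eigh(Q_uu)  # Q_uu is symmetric, so eigh applies
    evals[evals < 0] = 0.0
    evals += lamb
    return evecs @ np.diag(1.0 / evals) @ evecs.T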
def calc_ilqr_input(env, sim_env, x0, u_seq, tN=50, max_iter=1e6):
    """Calculate the optimal control input for the given state.

    Parameters
    ----------
    env: gym.core.Env
        The true environment you will execute the computed commands on.
        Use this environment to get the Q and R values as well as the state.
    sim_env: gym.core.Env
        A copy of the env class. Use this to simulate the dynamics when
        doing finite differences.
    x0: initial state
    u_seq: initial control sequence
    tN: number of control steps you are going to execute
    max_iter: max iterations for optimization

    Returns
    -------
    U: np.array
        The SEQUENCE of commands to execute. The size should be
        (tN, #parameters).
    """
    state_dim = x0.shape[0]
    action_dim = u_seq.shape[1]
    lamb = 1.0
    lamb_fac = 2.0
    lamb_max = 1000
    iter_num = 0
    gen_new_traj = True

    while iter_num < max_iter:
        if gen_new_traj:
            x_seq, cost, reward = simulate(sim_env, x0, u_seq)
            old_cost = cost * 1.0
            env.state = x0 * 1.0
            l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u = reset_cost_dynamics(tN, state_dim, action_dim)
            for i in range(tN - 1, -1, -1):
                x = x_seq[i] * 1.0
                u = u_seq[i] * 1.0
                l[i], l_x[i], l_xx[i], l_u[i], l_uu[i], l_ux[i] = cost_inter(sim_env, x, u)
                A = approximate_A(sim_env, simulate_dynamics_next, x, u)
                B = approximate_B(sim_env, simulate_dynamics_next, x, u)
                f_x[i] = A * 1.0
                f_u[i] = B * 1.0
            # Terminal cost set after the loop so it is not overwritten
            l[-1], l_x[-1], l_xx[-1] = cost_final(sim_env, x_seq[-1])
            gen_new_traj = False

        k = np.zeros((tN, action_dim))
        K = np.zeros((tN, action_dim, state_dim))
        k, K = backward_recursion(k, K, l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u, lamb)

        # Incomplete draft: the forward pass, line search, and lamb update
        # are still missing; the next variant completes them. Placeholder:
        return np.zeros((50, 2))
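# reset_cost_dynamics() is assumed to simply allocate the zeroed cost and
# linearized-dynamics arrays consumed above; a minimal sketch matching the
# shapes used at the call sites in this section:
def reset_cost_dynamics(tN, state_dim, action_dim):
    """Allocate zeroed cost-derivative and dynamics-Jacobian arrays."""
    l = np.zeros((tN, 1))
    l_x = np.zeros((tN, state_dim))
    l_xx = np.zeros((tN, state_dim, state_dim))
    l_u = np.zeros((tN, action_dim))
    l_uu = np.zeros((tN, action_dim, action_dim))
    l_ux = np.zeros((tN, action_dim, state_dim))
    f_x = np.zeros((tN, state_dim, state_dim))
    f_u = np.zeros((tN, state_dim, action_dim))
    return l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u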
def calc_ilqr_input(env, sim_env, x0, u_seq, tN=50, max_iter=1e6):
    """Calculate the optimal control input for the given state.

    Parameters
    ----------
    env: gym.core.Env
        The true environment you will execute the computed commands on.
        Use this environment to get the Q and R values as well as the state.
    sim_env: gym.core.Env
        A copy of the env class. Use this to simulate the dynamics when
        doing finite differences.
    x0: initial state
    u_seq: initial control sequence
    tN: number of control steps you are going to execute
    max_iter: max iterations for optimization

    Returns
    -------
    U: np.array
        The SEQUENCE of commands to execute. The size should be
        (tN, #parameters).
    """
    state_dim = x0.shape[0]
    action_dim = u_seq.shape[1]
    lamb = 1.0
    lamb_fac = 2.0
    lamb_max = 1000
    alpha = 1.0
    iter_num = 0
    gen_new_traj = True
    total_cost = []

    while iter_num < max_iter:
        if gen_new_traj:
            x_seq, cost = simulate(sim_env, x0, u_seq)
            old_cost = cost * 1.0
            env.state = x0 * 1.0
            l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u = reset_cost_dynamics(tN, state_dim, action_dim)
            for i in range(tN - 1, -1, -1):
                x = x_seq[i] * 1.0
                u = u_seq[i] * 1.0
                l[i], l_x[i], l_xx[i], l_u[i], l_uu[i], l_ux[i] = cost_inter(sim_env, x, u)
                A = approximate_A(sim_env, simulate_dynamics_next, x, u)
                B = approximate_B(sim_env, simulate_dynamics_next, x, u)
                f_x[i] = A * 1.0
                f_u[i] = B * 1.0
            # Terminal cost set after the loop so it is not overwritten
            l[-1], l_x[-1], l_xx[-1] = cost_final(sim_env, x_seq[-1])
            gen_new_traj = False

        k = np.zeros((tN, action_dim))
        K = np.zeros((tN, action_dim, state_dim))
        # k: open-loop (feedforward) term; K: feedback gain term
        k, K = backward_recursion(k, K, l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u, lamb)

        # Forward pass with the updated controller
        u_seq_new = np.zeros((tN, action_dim))
        x_new = x_seq[0] * 1.0
        sim_env.state = x_new * 1.0
        for i in range(0, tN - 1):
            u_seq_new[i] = u_seq[i] + alpha * k[i] + np.matmul(K[i], x_new - x_seq[i])
            x_new, _, _, _ = sim_env.step(u_seq_new[i])
        x_seq_new, new_cost = simulate(env, x0, u_seq_new)
        total_cost.append(new_cost * 1.0)

        if new_cost < cost:
            # Improvement: accept the new trajectory and relax regularization
            gen_new_traj = True
            lamb /= lamb_fac
            x_seq = x_seq_new * 1.0
            u_seq = u_seq_new * 1.0
            alpha = 1.0
            old_cost = cost * 1.0
            cost = new_cost * 1.0
            if iter_num > 0 and abs(old_cost - cost) < 0.001 * cost:
                break
        else:
            # No improvement: increase regularization and try again
            lamb *= lamb_fac
            if lamb > lamb_max:
                break
        iter_num += 1

    return u_seq, total_cost
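# backward_recursion() is called by the two variants above but not defined in
# this section. A minimal sketch consistent with backward_pass() at the top
# of the section (time-indexed arrays, Levenberg-Marquardt lamb on Q_uu,
# reusing the inv_stable() sketch given earlier); the signature is taken from
# the call sites. l is accepted for signature compatibility but unused, and
# k[-1], K[-1] stay zero since the forward pass only uses indices below tN-1.
def backward_recursion(k, K, l, l_x, l_xx, l_u, l_uu, l_ux, f_x, f_u, lamb):
    """Backward pass: fill the feedforward k and feedback K gain arrays."""
    tN = k.shape[0]
    # Value function at the final timestep comes from the terminal cost
    V_x = l_x[-1].copy()
    V_xx = l_xx[-1].copy()
    for t in range(tN - 2, -1, -1):
        Q_x = l_x[t] + f_x[t].T @ V_x
        Q_u = l_u[t] + f_u[t].T @ V_x
        Q_xx = l_xx[t] + f_x[t].T @ V_xx @ f_x[t]
        Q_ux = l_ux[t] + f_u[t].T @ V_xx @ f_x[t]
        Q_uu = l_uu[t] + f_u[t].T @ V_xx @ f_u[t]
        Q_uu_inv = inv_stable(Q_uu, lamb)  # eigenvalue-clamped inverse
        k[t] = -Q_uu_inv @ Q_u
        K[t] = -Q_uu_inv @ Q_ux
        V_x = Q_x - K[t].T @ Q_uu @ k[t]
        V_xx = Q_xx - K[t].T @ Q_uu @ K[t]
    return k, K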