import numpy as np

# helper objects for quadratic value functions, analytically linearized
# dynamics and costs, and time-varying linear controllers; the exact import
# path is an assumption based on the surrounding project layout
from trajopt.elqr.objects import QuadraticStateValue
from trajopt.elqr.objects import AnalyticalLinearDynamics, AnalyticalQuadraticCost
from trajopt.elqr.objects import LinearControl


# alternative constructor: draws the initial state from the environment and
# supports a mask that activates the cost only on selected time steps
def __init__(self, env, nb_steps, activation=range(-1, 0)):
    self.env = env

    # expose necessary functions
    self.env_dyn = self.env.unwrapped.dynamics
    self.env_inv_dyn = self.env.unwrapped.inverse_dynamics
    self.env_cost = self.env.unwrapped.cost
    self.env_init = self.env.unwrapped.init
    self.env_goal = self.env.unwrapped.goal

    self.ulim = self.env.action_space.high

    self.dm_state = self.env.observation_space.shape[0]
    self.dm_act = self.env.action_space.shape[0]
    self.nb_steps = nb_steps

    # reference trajectory
    self.xref = np.zeros((self.dm_state, self.nb_steps + 1))
    self.xref[..., 0] = self.env_init()[0]
    self.uref = np.zeros((self.dm_act, self.nb_steps))

    # cost-to-go and cost-to-come value functions
    self.gocost = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
    self.comecost = QuadraticStateValue(self.dm_state, self.nb_steps + 1)

    self.gocost.V[..., 0] += np.eye(self.dm_state) * 1e-16
    self.comecost.V[..., 0] += np.eye(self.dm_state) * 1e-16

    # linearizations of the forward and inverse dynamics
    self.dyn = AnalyticalLinearDynamics(self.env_init, self.env_dyn,
                                        self.dm_state, self.dm_act, self.nb_steps)
    self.idyn = AnalyticalLinearDynamics(self.env_init, self.env_inv_dyn,
                                         self.dm_state, self.dm_act, self.nb_steps)

    # forward and inverse linear policies
    self.ctl = LinearControl(self.dm_state, self.dm_act, self.nb_steps)
    self.ctl.kff = np.random.randn(self.dm_act, self.nb_steps)

    self.ictl = LinearControl(self.dm_state, self.dm_act, self.nb_steps)
    self.ictl.kff = 1e-2 * np.random.randn(self.dm_act, self.nb_steps)

    # activation of cost function
    self.activation = np.zeros((self.nb_steps + 1, ), dtype=np.int64)
    self.activation[-1] = 1  # last step always active
    self.activation[activation] = 1

    self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                        self.dm_act, self.nb_steps + 1)

    self.last_objective = -np.inf

class eLQR:

    def __init__(self, env, nb_steps, init_state):
        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_inv_dyn = self.env.unwrapped.inverse_dynamics
        self.env_cost = self.env.unwrapped.cost
        self.env_goal = self.env.unwrapped.g
        self.env_init = init_state

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.nb_steps = nb_steps

        # reference trajectory
        self.xref = np.zeros((self.dm_state, self.nb_steps + 1))
        self.xref[..., 0] = self.env_init
        self.uref = np.zeros((self.dm_act, self.nb_steps))

        # cost-to-go and cost-to-come value functions
        self.gocost = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.comecost = QuadraticStateValue(self.dm_state, self.nb_steps + 1)

        self.gocost.V[..., 0] += np.eye(self.dm_state) * 1e-16
        self.comecost.V[..., 0] += np.eye(self.dm_state) * 1e-16

        # linearizations of the forward and inverse dynamics
        self.dyn = AnalyticalLinearDynamics(self.env_dyn, self.dm_state,
                                            self.dm_act, self.nb_steps)
        self.idyn = AnalyticalLinearDynamics(self.env_inv_dyn, self.dm_state,
                                             self.dm_act, self.nb_steps)

        # forward and inverse linear policies
        self.ctl = LinearControl(self.dm_state, self.dm_act, self.nb_steps)
        self.ctl.kff = np.random.randn(self.dm_act, self.nb_steps)

        self.ictl = LinearControl(self.dm_state, self.dm_act, self.nb_steps)
        self.ictl.kff = 1e-2 * np.random.randn(self.dm_act, self.nb_steps)

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                            self.dm_act, self.nb_steps + 1)

        self.last_objective = -np.inf

    def forward_pass(self, ctl):
        state = np.zeros((self.dm_state, self.nb_steps + 1))
        action = np.zeros((self.dm_act, self.nb_steps))
        cost = np.zeros((self.nb_steps + 1, ))

        state[..., 0] = self.env_init
        for t in range(self.nb_steps):
            action[..., t] = ctl.action(state[..., t], t)
            cost[..., t] = self.cost.evalf(state[..., t], action[..., t])
            state[..., t + 1] = self.dyn.evalf(state[..., t], action[..., t])

        # terminal cost evaluated with a zero dummy action
        cost[..., -1] = self.cost.evalf(state[..., -1], np.zeros((self.dm_act, )))
        return state, action, cost

    def forward_lqr(self, state):
        for t in range(self.nb_steps):
            _action = self.ctl.action(state, t)
            _state_n = self.dyn.evalf(state, _action)

            # linearize inverse discrete dynamics around the next state
            _A, _B, _c = self.idyn.taylor_expansion(_state_n, _action)

            # quadratize cost
            _Cxx, _Cuu, _Cxu, _cx, _cu, _c0 =\
                self.cost.taylor_expansion(state, _action)

            # forward (cost-to-come) value update
            _Qxx = _A.T @ (_Cxx + self.comecost.V[..., t]) @ _A
            _Quu = _B.T @ (_Cxx + self.comecost.V[..., t]) @ _B\
                   + _B.T @ _Cxu + _Cxu.T @ _B + _Cuu
            _Qux = _B.T @ (_Cxx + self.comecost.V[..., t]) @ _A + _Cxu.T @ _A

            _qx = _A.T @ (_Cxx + self.comecost.V[..., t]) @ _c\
                  + _A.T @ (_cx + self.comecost.v[..., t])
            _qu = _B.T @ (_Cxx + self.comecost.V[..., t]) @ _c + _Cxu.T @ _c\
                  + _B.T @ (_cx + self.comecost.v[..., t]) + _cu
            _q0 = 0.5 * _c.T @ (_Cxx + self.comecost.V[..., t]) @ _c\
                  + _c.T @ (_cx + self.comecost.v[..., t])\
                  + _c0 + self.comecost.v0[..., t]

            # inverse controller from the minimization over u
            self.ictl.K[..., t] = -np.linalg.inv(_Quu) @ _Qux
            self.ictl.kff[..., t] = -np.linalg.inv(_Quu) @ _qu

            self.comecost.V[..., t + 1] = _Qxx - _Qux.T @ np.linalg.inv(_Quu) @ _Qux
            self.comecost.v[..., t + 1] = _qx - _Qux.T @ np.linalg.inv(_Quu) @ _qu
            self.comecost.v0[..., t + 1] = _q0 - 0.5 * _qu.T @ np.linalg.inv(_Quu) @ _qu

            # store matrices
            self.idyn.A[..., t] = _A
            self.idyn.B[..., t] = _B
            self.idyn.c[..., t] = _c
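            # the next nominal state is chosen as the minimizer of the sum of
            # the quadratic cost-to-go and cost-to-come approximations,
            #   x* = argmin_x 0.5 * x.T @ (Vgo + Vcome) @ x + (vgo + vcome).T @ x
            #      = -inv(Vgo + Vcome) @ (vgo + vcome),
            # which the expression below evaluates at time t + 1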
            state = -np.linalg.inv(self.gocost.V[..., t + 1] + self.comecost.V[..., t + 1]) @\
                    (self.gocost.v[..., t + 1] + self.comecost.v[..., t + 1])

        return state

    def backward_lqr(self, state):
        # quadratize last cost with a zero dummy action
        _Cxx, _Cuu, _Cxu, _cx, _cu, _c0 =\
            self.cost.taylor_expansion(state, np.zeros((self.dm_act, )))

        self.gocost.V[..., -1] = _Cxx
        self.gocost.v[..., -1] = _cx
        self.gocost.v0[..., -1] = _c0

        state = -np.linalg.inv(self.gocost.V[..., -1] + self.comecost.V[..., -1]) @\
                (self.gocost.v[..., -1] + self.comecost.v[..., -1])

        for t in range(self.nb_steps - 1, -1, -1):
            _action = self.ictl.action(state, t)
            _state_n = self.idyn.evalf(state, _action)

            # linearize (forward) discrete dynamics around the previous state
            _A, _B, _c = self.dyn.taylor_expansion(_state_n, _action)

            # quadratize cost
            _Cxx, _Cuu, _Cxu, _cx, _cu, _c0 =\
                self.cost.taylor_expansion(_state_n, _action)

            # backward (cost-to-go) value update
            _Qxx = _Cxx + _A.T @ self.gocost.V[..., t + 1] @ _A
            _Quu = _Cuu + _B.T @ self.gocost.V[..., t + 1] @ _B
            _Qux = _Cxu.T + _B.T @ self.gocost.V[..., t + 1] @ _A

            _qx = _cx + _A.T @ self.gocost.V[..., t + 1] @ _c\
                  + _A.T @ self.gocost.v[..., t + 1]
            _qu = _cu + _B.T @ self.gocost.V[..., t + 1] @ _c\
                  + _B.T @ self.gocost.v[..., t + 1]
            _q0 = _c0 + self.gocost.v0[..., t + 1]\
                  + 0.5 * _c.T @ self.gocost.V[..., t + 1] @ _c\
                  + _c.T @ self.gocost.v[..., t + 1]

            # forward controller from the minimization over u
            self.ctl.K[..., t] = -np.linalg.inv(_Quu) @ _Qux
            self.ctl.kff[..., t] = -np.linalg.inv(_Quu) @ _qu

            self.gocost.V[..., t] = _Qxx - _Qux.T @ np.linalg.inv(_Quu) @ _Qux
            self.gocost.v[..., t] = _qx - _Qux.T @ np.linalg.inv(_Quu) @ _qu
            self.gocost.v0[..., t] = _q0 - 0.5 * _qu.T @ np.linalg.inv(_Quu) @ _qu

            # store matrices
            self.dyn.A[..., t] = _A
            self.dyn.B[..., t] = _B
            self.dyn.c[..., t] = _c

            # reset the nominal state to the minimizer of the combined
            # cost-to-go and cost-to-come at time t
            state = -np.linalg.inv(self.gocost.V[..., t] + self.comecost.V[..., t]) @\
                    (self.gocost.v[..., t] + self.comecost.v[..., t])

        return state

    def plot(self):
        import matplotlib.pyplot as plt

        plt.figure()

        t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
        for k in range(self.dm_state):
            plt.subplot(self.dm_state + self.dm_act, 1, k + 1)
            plt.plot(t, self.xref[k, :], '-b')

        t = np.linspace(0, self.nb_steps, self.nb_steps)
        for k in range(self.dm_act):
            plt.subplot(self.dm_state + self.dm_act, 1, self.dm_state + k + 1)
            plt.plot(t, self.uref[k, :], '-g')

        plt.show()

    def run(self, nb_iter=10):
        _trace = []

        # forward pass to get the reference trajectory
        self.xref, self.uref, _cost = self.forward_pass(self.ctl)

        # return (total cost) along the current trajectory
        _trace.append(np.sum(_cost))

        _state = self.env_init
        for _ in range(nb_iter):
            # forward lqr sweep
            _state = self.forward_lqr(_state)

            # backward lqr sweep
            _state = self.backward_lqr(_state)

            # forward pass to get the updated reference trajectory
            self.xref, self.uref, _cost = self.forward_pass(self.ctl)

            # return (total cost) along the current trajectory
            _trace.append(np.sum(_cost))

        return _trace
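

# A minimal usage sketch, not part of the original module. It assumes a
# gym-style environment whose `unwrapped` object exposes `dynamics`,
# `inverse_dynamics`, `cost`, and a goal attribute `g`, as eLQR above
# requires; the environment id, horizon, and iteration count are
# illustrative assumptions only.
if __name__ == "__main__":
    import gym

    env = gym.make("Pendulum-TO-v0")  # hypothetical trajectory-optimization env id
    x0 = env.reset()  # assumed to return the initial state vector

    solver = eLQR(env, nb_steps=60, init_state=x0)

    # each entry of the trace is the total cost of the reference
    # trajectory after one forward/backward eLQR sweep
    trace = solver.run(nb_iter=10)
    print(trace)

    solver.plot()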