import copy

import gym
import numpy as np
import scipy as sp
import scipy.linalg  # Ensures sp.linalg is available on all SciPy versions.

# Project-local imports. The exact module paths are assumptions and may
# differ across forks of the GPS codebase.
from gps.algorithm.dynamics.dynamics_utils import guess_dynamics
from gps.algorithm.policy.config import INIT_LG, INIT_LG_PD, INIT_LG_LQR
from gps.algorithm.policy.csa_policy import CSAPolicy
from gps.algorithm.policy.lin_gauss_policy import LinearGaussianPolicy
from gps.agent.lto.cmaes_world import CMAESWorld
from gps.agent.lto.lto_world import LTOWorld
from gps.utility.general_utils import is_goal_based


def init_gym_pol(hyperparams):
    """Generates a random initial policy for gym experiments."""
    env = gym.make(hyperparams['env'])
    if is_goal_based(env):
        dX = (env.observation_space.spaces['observation'].shape[0] +
              env.observation_space.spaces['desired_goal'].shape[0])
    else:
        dX = env.observation_space.shape[0]
    dU = env.action_space.shape[0]
    T = hyperparams['T']

    low = np.asarray(env.action_space.low)
    high = np.asarray(env.action_space.high)

    K = np.zeros((T, dU, dX))
    k = np.empty((T, dU))
    PSig = np.empty((T, dU, dU))
    cholPSig = np.empty((T, dU, dU))
    inv_pol_covar = np.empty((T, dU, dU))
    for t in range(T):
        # Center actions in the action space and match the variance of a
        # uniform distribution over [low, high], i.e. (high - low)**2 / 12.
        k[t] = (low + high) / 2
        PSig[t] = np.diag(
            np.square(high - low) / 12) * hyperparams['init_var_scale']
        cholPSig[t] = np.linalg.cholesky(PSig[t])
        inv_pol_covar[t] = np.linalg.inv(PSig[t])
    return LinearGaussianPolicy(K, k, PSig, cholPSig, inv_pol_covar)
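# A minimal usage sketch for init_gym_pol. The environment id and
# 'init_var_scale' value are illustrative only; newer Gym releases may use
# 'Pendulum-v1' instead.
def _example_init_gym_pol():
    return init_gym_pol({'env': 'Pendulum-v0', 'T': 100,
                         'init_var_scale': 1.0})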
def init_pd(hyperparams):
    """
    Initializes the linear-Gaussian controller as a proportional-derivative
    (PD) controller with Gaussian noise. Position gains are set by
    pos_gains; velocity gains are pos_gains * vel_gains_mult.
    """
    config = copy.deepcopy(INIT_LG_PD)
    config.update(hyperparams)

    dU, dQ, dX = config['dU'], config['dQ'], config['dX']
    x0, T = config['x0'], config['T']

    # PD gains: unit position gain, scaled velocity gain.
    Kp = 1.0
    Kv = config['vel_gains_mult']
    if dU < dQ:
        # Assumes the state is [q, qdot] with dX == 2 * dQ.
        K = -config['pos_gains'] * np.tile(
            np.hstack([
                np.eye(dU) * Kp, np.zeros((dU, dQ-dU)),
                np.eye(dU) * Kv, np.zeros((dU, dQ-dU))
            ]), [T, 1, 1]
        )
    else:
        K = -config['pos_gains'] * np.tile(
            np.hstack([
                np.eye(dU) * Kp, np.eye(dU) * Kv,
                np.zeros((dU, dX - dU*2))
            ]), [T, 1, 1]
        )
    # Bias term drives the controller toward the initial state x0.
    k = np.tile(-K[0, :, :].dot(x0), [T, 1])
    PSig = config['init_var'] * np.tile(np.eye(dU), [T, 1, 1])
    cholPSig = np.sqrt(config['init_var']) * np.tile(np.eye(dU), [T, 1, 1])
    invPSig = (1.0 / config['init_var']) * np.tile(np.eye(dU), [T, 1, 1])

    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)
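# A minimal sketch of calling init_pd, assuming a state layout of
# [positions, velocities] with dQ == dU; the dimensions and gains below are
# illustrative, not taken from any original experiment.
def _example_init_pd():
    dU = 7
    return init_pd({
        'dU': dU, 'dQ': dU, 'dX': 2 * dU,
        'x0': np.zeros(2 * dU), 'T': 100,
        'pos_gains': 10.0, 'vel_gains_mult': 0.01, 'init_var': 1.0,
    })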
def init_pol_ctr(hyperparams):
    """Generates an initial policy by loading linear controllers from a
    file."""
    dU, dX = hyperparams['dU'], hyperparams['dX']
    T = hyperparams['T']

    data = np.load(hyperparams['ctr_file'])
    K = np.empty((T, dU, dX))
    k = np.empty((T, dU))
    prc = np.empty((T, dU, dU))
    # The stored controller covers the first T-1 steps; pad the final step
    # with zero gains and identity precision.
    K[:-1] = data['K'][0]
    K[-1] = np.zeros((dU, dX))
    k[:-1] = data['k'][0]
    k[-1] = np.zeros(dU)
    prc[:-1] = data['prc'][0]
    prc[-1] = np.eye(dU)

    PSig = np.empty((T, dU, dU))
    cholPSig = np.empty((T, dU, dU))
    inv_pol_covar = np.empty((T, dU, dU))
    for t in range(T):
        PSig[t] = np.linalg.inv(prc[t])
        cholPSig[t] = np.linalg.cholesky(PSig[t])
        inv_pol_covar[t] = prc[t]  # The precision is already the inverse.
    return LinearGaussianPolicy(K, k, PSig, cholPSig, inv_pol_covar)
def init_superball(hyperparams):
    """Loads initial linear-Gaussian controllers from a .mat file, falling
    back to a zero-gain constant policy with isotropic noise."""
    config = copy.deepcopy(INIT_LG_PD)
    config.update(hyperparams)
    T, dU, dX = config['T'], config['dU'], config['dX']

    if config.get('init'):
        filename = hyperparams.get('file', 'init_controllers.mat')
        from scipy import io
        s = config['init']
        controllers = io.loadmat(filename)
        K, k = controllers['K' + s], controllers['k' + s]
        PSig = controllers['PS' + s]
        cholPSig = controllers['cPS' + s]
        invPSig = controllers['iPS' + s]
        if config.get('init_with_var'):
            # Override the loaded covariances with isotropic noise.
            identity = np.stack(
                [np.eye(PSig.shape[1]) for _ in range(PSig.shape[0])])
            PSig = identity * config['init_with_var']
            cholPSig = identity * np.sqrt(config['init_with_var'])
            invPSig = identity / config['init_with_var']
    else:
        K = np.zeros((T, dU, dX))
        k = np.zeros((T, dU))
        PSig = config['init_var'] * np.tile(np.eye(dU), [T, 1, 1])
        cholPSig = np.sqrt(config['init_var']) * np.tile(np.eye(dU),
                                                         [T, 1, 1])
        invPSig = (1.0 / config['init_var']) * np.tile(np.eye(dU),
                                                       [T, 1, 1])
    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)
def init_cmaes_controller(hyperparams, agent):
    """Builds an initial linear-Gaussian controller for CMA-ES step-size
    adaptation by averaging the actions of a CSA baseline policy over
    several rollouts."""
    config = copy.deepcopy(INIT_LG)
    config.update(hyperparams)

    dX, dU = config['dX'], config['dU']
    T = config['T']
    cur_cond_idx = config['cur_cond_idx']
    history_len = agent.history_len
    fcn = agent.fcns[cur_cond_idx]
    popsize = agent.popsize
    fcn_obj = fcn.get('fcn_obj')
    hpolib = 'hpolib' in fcn
    benchmark = fcn.get('benchmark')

    # Create a new world to avoid changing the state of the original world.
    world = CMAESWorld(dim=fcn['dim'], init_loc=fcn['init_loc'],
                       init_sigma=fcn['init_sigma'], init_popsize=popsize,
                       history_len=history_len, fcn=fcn_obj, hpolib=hpolib,
                       benchmark=benchmark)

    if config['verbose']:
        print("Finding Initial Linear Gaussian Controller")

    # Roll out the CSA baseline policy several times, recording the action
    # taken at each time step.
    action_samples = []
    for _ in range(25):
        f_values = []
        cur_policy = CSAPolicy(T=T)
        world.reset_world()
        world.run()
        for t in range(T):
            X_t = agent.get_vectorized_state(world.get_state(),
                                             cur_cond_idx)
            es = world.es
            f_vals = world.func_values
            U_t = cur_policy.act(X_t, None, t, np.zeros((dU,)), es, f_vals)
            world.run_next(U_t)
            f_values.append(U_t)
        action_samples.append(f_values)

    # The per-step sample mean becomes the constant bias; the per-step
    # sample variance becomes the noise covariance (np.var, since PSig is a
    # covariance). Zero variances are replaced with init_var.
    mean_actions = np.mean(action_samples, axis=0)
    var_actions = np.var(action_samples, axis=0)
    np.place(var_actions, var_actions == 0, config['init_var'])

    # The reshapes below assume dU == 1 (the step size is the only action).
    K = np.zeros((T, dU, dX))  # Controller gains matrix (no feedback).
    k = mean_actions.reshape((T, 1))  # Controller bias term.
    PSig = var_actions.reshape((T, 1, 1))
    cholPSig = np.sqrt(var_actions).reshape((T, 1, 1))
    invPSig = (1.0 / var_actions).reshape((T, 1, 1))
    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)
def fit_emp_controller(demo_x, demo_u):
    """
    Fit a time-varying linear-Gaussian controller to a set of
    demonstrations, including the conditional covariance of u given x.
    """
    N, T, dU = demo_u.shape
    dX = demo_x.shape[2]

    K = np.zeros((T, dU, dX))
    k = np.zeros((T, dU))
    pol_covar = np.zeros((T, dU, dU))
    chol_pol_covar = np.zeros((T, dU, dU))
    inv_pol_covar = np.zeros((T, dU, dU))
    traj = LinearGaussianPolicy(K, k, pol_covar, chol_pol_covar,
                                inv_pol_covar)
    for t in range(T):
        # Least-squares fit of u = K x + k at time t (bias column appended).
        X = np.hstack((np.squeeze(demo_x[:, t, :]), np.ones((N, 1))))
        U = np.squeeze(demo_u[:, t, :])
        result = np.linalg.pinv(X).dot(U).T
        traj.K[t, :, :] = result[:, 0:dX]
        traj.k[t, :] = result[:, -1]
        # Empirical covariance of the residuals, with a small regularizer
        # to keep the Cholesky factorization well defined.
        sig = np.zeros((dU, dU))
        for n in range(N):
            diff = (demo_u[n, t, :] -
                    (traj.K[t, :, :].dot(demo_x[n, t, :]) +
                     traj.k[t, :])).reshape(dU, -1)
            sig += diff.dot(diff.T)
        traj.pol_covar[t, :, :] = sig / N + 1e-12 * np.eye(dU)
        traj.chol_pol_covar[t, :, :] = np.linalg.cholesky(
            traj.pol_covar[t, :, :])
        traj.inv_pol_covar[t, :, :] = np.linalg.pinv(
            traj.pol_covar[t, :, :])
    return traj
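# A small self-check for fit_emp_controller on synthetic demonstrations
# drawn from a known linear controller plus Gaussian noise; all sizes and
# the noise scale are illustrative.
def _example_fit_emp_controller(seed=0):
    rng = np.random.RandomState(seed)
    N, T, dX, dU = 50, 5, 3, 2
    K_true = rng.randn(dU, dX)
    k_true = rng.randn(dU)
    demo_x = rng.randn(N, T, dX)
    demo_u = (np.einsum('ux,ntx->ntu', K_true, demo_x) + k_true +
              0.1 * rng.randn(N, T, dU))
    traj = fit_emp_controller(demo_x, demo_u)
    # traj.K[t] should be close to K_true, and traj.pol_covar[t] close to
    # 0.01 * I, at every time step.
    return traj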
def traj_distr(self):
    """ Create a trajectory distribution object from policy info. """
    T, dU, dX = self.pol_K.shape
    # Compute inverse policy covariances from the Cholesky factors.
    inv_pol_S = np.empty_like(self.chol_pol_S)
    for t in range(T):
        inv_pol_S[t, :, :] = np.linalg.solve(
            self.chol_pol_S[t, :, :],
            np.linalg.solve(self.chol_pol_S[t, :, :].T, np.eye(dU))
        )
    return LinearGaussianPolicy(self.pol_K, self.pol_k, self.pol_S,
                                self.chol_pol_S, inv_pol_S)
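# A small numeric check of the solve ordering used in traj_distr, assuming
# chol_pol_S stores upper-triangular Cholesky factors (the default of
# sp.linalg.cholesky, as in init_lqr): for S = U^T U, solving U^T y = I and
# then U x = y gives x = U^{-1} U^{-T} = S^{-1}. Sizes are illustrative.
def _example_chol_inverse(seed=0):
    rng = np.random.RandomState(seed)
    A = rng.randn(4, 4)
    S = A.dot(A.T) + 4 * np.eye(4)   # Symmetric positive definite.
    U = sp.linalg.cholesky(S)        # Upper factor: S = U^T U.
    S_inv = np.linalg.solve(U, np.linalg.solve(U.T, np.eye(4)))
    assert np.allclose(S_inv, np.linalg.inv(S))
    return S_inv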
def init_pol(hyperparams):
    """Generates an initial policy of constant actions with added noise."""
    dU, dX = hyperparams['dU'], hyperparams['dX']
    T = hyperparams['T']

    K = np.zeros((T, dU, dX))
    k = np.empty((T, dU))
    PSig = np.empty((T, dU, dU))
    cholPSig = np.empty((T, dU, dU))
    inv_pol_covar = np.empty((T, dU, dU))
    for t in range(T):
        k[t] = hyperparams['init_const']
        # 'init_var' is used as a full dU x dU covariance matrix here.
        PSig[t] = hyperparams['init_var']
        cholPSig[t] = np.linalg.cholesky(PSig[t])
        inv_pol_covar[t] = np.linalg.inv(PSig[t])
    return LinearGaussianPolicy(K, k, PSig, cholPSig, inv_pol_covar)
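# A minimal sketch of calling init_pol. Since 'init_var' is consumed as a
# full covariance matrix, it must be positive definite; the values below
# are illustrative.
def _example_init_pol():
    dU, dX, T = 2, 4, 50
    return init_pol({'dU': dU, 'dX': dX, 'T': T,
                     'init_const': np.zeros(dU),
                     'init_var': 0.1 * np.eye(dU)})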
def init_lqr(hyperparams):
    """
    Return initial gains for a time-varying linear Gaussian controller
    that tries to hold the initial position.
    """
    config = copy.deepcopy(INIT_LG_LQR)
    config.update(hyperparams)

    x0, dX, dU = config['x0'], config['dX'], config['dU']
    dt, T = config['dt'], config['T']

    #TODO: Use packing instead of assuming which indices are the joint
    #      angles.

    # Notation notes:
    # L = loss, Q = q-function (dX+dU dimensional),
    # V = value function (dX dimensional), F = dynamics.
    # Vectors are lower-case, matrices are upper-case.
    # Derivatives: x = state, u = action, t = state+action (trajectory).
    # The time index is denoted by _t after the above.
    # Ex. Ltt_t = Loss, 2nd derivative (w.r.t. trajectory), indexed by
    # time t.

    # Constants.
    idx_x = slice(dX)  # Slices out state.
    idx_u = slice(dX, dX+dU)  # Slices out actions.

    if len(config['init_acc']) == 0:
        config['init_acc'] = np.zeros(dU)
    if len(config['init_gains']) == 0:
        config['init_gains'] = np.ones(dU)

    # Set up simple linear dynamics model.
    Fd, fc = guess_dynamics(config['init_gains'], config['init_acc'],
                            dX, dU, dt)

    # Set up a cost function based on stiffness.
    # Ltt = (dX+dU) by (dX+dU) - Hessian of loss with respect to
    # trajectory at a single timestep.
    Ltt = np.diag(np.hstack([
        config['stiffness'] * np.ones(dU),
        config['stiffness'] * config['stiffness_vel'] * np.ones(dU),
        np.zeros(dX - dU*2),
        np.ones(dU)
    ]))
    Ltt = Ltt / config['init_var']  # Cost function - quadratic term.
    lt = -Ltt.dot(np.r_[x0, np.zeros(dU)])  # Cost function - linear term.

    # Perform dynamic programming.
    K = np.zeros((T, dU, dX))  # Controller gains matrix.
    k = np.zeros((T, dU))  # Controller bias term.
    PSig = np.zeros((T, dU, dU))  # Covariance of noise.
    cholPSig = np.zeros((T, dU, dU))  # Cholesky decomposition.
    invPSig = np.zeros((T, dU, dU))  # Inverse of covariance.
    vx_t = np.zeros(dX)  # Vx = dV/dX. Derivative of value function.
    Vxx_t = np.zeros((dX, dX))  # Vxx = ddV/dXdX.

    #TODO: A lot of this code is repeated with traj_opt_lqr_python.py
    #      backward pass.
    for t in range(T - 1, -1, -1):
        # Compute the Q function at this step.
        if t == (T - 1):
            Ltt_t = config['final_weight'] * Ltt
            lt_t = config['final_weight'] * lt
        else:
            Ltt_t = Ltt
            lt_t = lt
        # Qtt = (dX+dU) by (dX+dU) 2nd derivative of the Q-function with
        # respect to the trajectory (dX+dU).
        Qtt_t = Ltt_t + Fd.T.dot(Vxx_t).dot(Fd)
        # qt = (dX+dU) 1st derivative of the Q-function with respect to
        # the trajectory (dX+dU).
        qt_t = lt_t + Fd.T.dot(vx_t + Vxx_t.dot(fc))

        # Compute the preceding value function.
        U = sp.linalg.cholesky(Qtt_t[idx_u, idx_u])
        L = U.T

        invPSig[t, :, :] = Qtt_t[idx_u, idx_u]
        PSig[t, :, :] = sp.linalg.solve_triangular(
            U, sp.linalg.solve_triangular(L, np.eye(dU), lower=True)
        )
        cholPSig[t, :, :] = sp.linalg.cholesky(PSig[t, :, :])
        K[t, :, :] = -sp.linalg.solve_triangular(
            U, sp.linalg.solve_triangular(L, Qtt_t[idx_u, idx_x],
                                          lower=True)
        )
        k[t, :] = -sp.linalg.solve_triangular(
            U, sp.linalg.solve_triangular(L, qt_t[idx_u], lower=True)
        )
        Vxx_t = Qtt_t[idx_x, idx_x] + Qtt_t[idx_x, idx_u].dot(K[t, :, :])
        vx_t = qt_t[idx_x] + Qtt_t[idx_x, idx_u].dot(k[t, :])
        Vxx_t = 0.5 * (Vxx_t + Vxx_t.T)

    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)
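# A minimal sketch of calling init_lqr, assuming the INIT_LG_LQR defaults
# supply 'stiffness', 'stiffness_vel', 'final_weight', 'init_acc',
# 'init_gains', and 'init_var'; the dimensions and dt are illustrative.
def _example_init_lqr():
    dU, dX, T = 7, 14, 100
    return init_lqr({'x0': np.zeros(dX), 'dX': dX, 'dU': dU,
                     'dt': 0.05, 'T': T})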
def init_lto_controller(hyperparams, agent):
    """Finds an initial linear-Gaussian controller for learning-to-optimize
    experiments via grid search over momentum and learning-rate values."""
    config = copy.deepcopy(INIT_LG)
    config.update(hyperparams)

    dX, dU = config['dX'], config['dU']
    T = config['T']
    cur_cond_idx = config['cur_cond_idx']
    history_len = agent.history_len
    fcn = agent.fcns[cur_cond_idx]

    # Create a new world to avoid changing the state of the original world.
    world = LTOWorld(fcn['fcn_obj'], fcn['dim'], fcn['init_loc'],
                     history_len)

    # Compute initial state.
    world.reset_world()
    world.run()
    x0 = agent.get_vectorized_state(world.get_state(), cur_cond_idx)

    best_momentum = None
    best_learning_rate = None
    min_obj_val = float('inf')

    if config['verbose']:
        print("Finding Initial Linear Gaussian Controller")

    for i in range(config['all_possible_momentum_params'].shape[0]):
        cur_momentum = config['all_possible_momentum_params'][i]
        for j in range(config['all_possible_learning_rates'].shape[0]):
            cur_learning_rate = config['all_possible_learning_rates'][j]

            cur_Kt = np.zeros((dU, dX))  # K matrix for a single time step.
            # Equivalent to Kt[:, sensor_start_idx:sensor_end_idx] = np.eye(dU):
            #agent.pack_data_x(cur_Kt, np.eye(dU), data_types=[CUR_GRAD])
            # Oldest gradients come first:
            #agent.pack_data_x(cur_Kt, np.tile(np.eye(dU), (1, history_len)) * (cur_momentum ** np.ravel(np.tile(np.arange(history_len, 0, -1)[:, None], (1, dU))))[None, :], data_types=[PAST_GRADS])
            cur_Kt = -cur_learning_rate * cur_Kt
            cur_kt = np.dot(cur_Kt, x0)

            cur_policy = LinearGaussianPolicy(cur_Kt[None, :, :],
                                              cur_kt[None, :],
                                              np.zeros((1, dU, dU)),
                                              np.zeros((1, dU, dU)),
                                              np.zeros((1, dU, dU)))
            world.reset_world()
            world.run()
            for t in range(T):
                X_t = agent.get_vectorized_state(world.get_state(),
                                                 cur_cond_idx)
                U_t = cur_policy.act(X_t, None, 0, np.zeros((dU,)))
                world.run_next(U_t)

            fcn['fcn_obj'].new_sample(batch_size="all")
            cur_obj_val = fcn['fcn_obj'].evaluate(world.cur_loc)

            if config['verbose']:
                print("Learning Rate: %.4f, Momentum: %.4f, "
                      "Final Objective Value: %.4f" %
                      (cur_learning_rate, cur_momentum, cur_obj_val))

            if cur_obj_val < min_obj_val:
                min_obj_val = cur_obj_val
                best_momentum = cur_momentum
                best_learning_rate = cur_learning_rate

    if config['verbose']:
        print("")
        print("Best Final Objective Value: %.4f" % min_obj_val)
        print("Best Momentum: %.4f" % best_momentum)
        print("Best Learning Rate: %.4f" % best_learning_rate)
        print("------------------------------------------------------")

    Kt = np.zeros((dU, dX))  # K matrix for a single time step.
    # Equivalent to Kt[:, sensor_start_idx:sensor_end_idx] = np.eye(dU):
    #agent.pack_data_x(Kt, np.eye(dU), data_types=[CUR_GRAD])
    # Oldest gradients come first:
    #agent.pack_data_x(Kt, np.tile(np.eye(dU), (1, history_len)) * (best_momentum ** np.ravel(np.tile(np.arange(history_len, 0, -1)[:, None], (1, dU))))[None, :], data_types=[PAST_GRADS])
    Kt = -best_learning_rate * Kt
    kt = np.dot(Kt, x0)

    K = np.tile(Kt[None, :, :], (T, 1, 1))  # Controller gains matrix.
    k = np.tile(kt[None, :], (T, 1))  # Controller bias term.
    PSig = np.tile((config['init_var'] * np.eye(dU))[None, :, :],
                   (T, 1, 1))
    cholPSig = np.tile((np.sqrt(config['init_var']) * np.eye(dU))[None, :, :],
                       (T, 1, 1))
    invPSig = np.tile(((1. / config['init_var']) * np.eye(dU))[None, :, :],
                      (T, 1, 1))
    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)