def load_pilco(path, controller=None, reward=None, sparse=False):
    X = np.loadtxt(path + 'X.csv', delimiter=',')
    Y = np.loadtxt(path + 'Y.csv', delimiter=',')
    if not sparse:
        pilco = PILCO((X, Y), controller=controller, reward=reward)
    else:
        with open(path + 'n_ind.txt', 'r') as f:
            n_ind = int(f.readline())
        pilco = PILCO((X, Y), num_induced_points=n_ind,
                      controller=controller, reward=reward)
    # NOTE: while the assign() calls below stay commented out, the saved
    # parameter dictionaries are loaded but never applied to the models.
    params = np.load(path + "pilco_values.npy", allow_pickle=True).item()
    # pilco.assign(params)
    for i, m in enumerate(pilco.mgpr.models):
        values = np.load(path + "model_" + str(i) + ".npy",
                         allow_pickle=True).item()
        # m.assign(values)
    return pilco
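# For symmetry, a minimal save_pilco sketch that would produce the files
# load_pilco reads above. The parameter-extraction calls (read_values(),
# GPflow 1.x style) and the num_induced_points attribute are assumptions
# for illustration, not confirmed API of this codebase.
def save_pilco(path, X, Y, pilco, sparse=False):
    np.savetxt(path + 'X.csv', X, delimiter=',')
    np.savetxt(path + 'Y.csv', Y, delimiter=',')
    if sparse:
        with open(path + 'n_ind.txt', 'w') as f:
            f.write('%d' % pilco.mgpr.num_induced_points)  # attribute name assumed
    np.save(path + "pilco_values.npy", pilco.read_values())  # assumed extraction helper
    for i, m in enumerate(pilco.mgpr.models):
        np.save(path + "model_" + str(i) + ".npy", m.read_values())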
X_, Y_, rewards = rollout(env=env, pilco=None, random=True, timesteps=40)
X = np.vstack((X, X_))
Y = np.vstack((Y, Y_))
random_rewards.append(sum(rewards))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=5)
# controller = LinearController(state_dim=state_dim, control_dim=control_dim)

pilco = PILCO(X, Y, controller=controller, horizon=40)

# Example of a user-provided reward function, setting a custom target state
# R = ExponentialReward(state_dim=state_dim, t=np.array([0.1, 0, 0, 0]))
# pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)

# Example of fixing a parameter, optional, for a linear controller only
# pilco.controller.b = np.array([[0.0]])
# pilco.controller.b.trainable = False

for rollouts in range(3):
    pilco.optimize_models()
    pilco.optimize_policy()
    X_new, Y_new, _ = rollout(env=env, pilco=pilco, timesteps=100)
    # Update dataset
    X = np.vstack((X, X_new))
# Initial random rollouts to generate a dataset
X, Y = rollout(env, None, timesteps=T, random=True, SUBS=SUBS)
for i in range(1, J):
    X_, Y_ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                     verbose=True)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim

controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=bf, max_action=max_action)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
pilco = PILCO(X, Y, controller=controller, horizon=T, reward=R,
              m_init=m_init, S_init=S_init)

# For numerical stability, fix the likelihood (observation noise) variance.
for model in pilco.mgpr.models:
    # model.kern.lengthscales.prior = gpflow.priors.Gamma(1, 10)  # priors have to be included
    # model.kern.variance.prior = gpflow.priors.Gamma(1.5, 2)     # before the model gets compiled
    model.likelihood.variance = 0.001
    model.likelihood.variance.trainable = False

for rollouts in range(N):
    print("**** ITERATION no", rollouts, " ****")
    pilco.optimize_models(maxiter=maxiter, restarts=2)
    pilco.optimize_policy(maxiter=maxiter, restarts=2)
    X_new, Y_new = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS)
def safe_swimmer_run(seed=0, logging=False):
    env = SwimmerWrapper()
    state_dim = 9
    control_dim = 2
    SUBS = 2
    maxiter = 60
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
    S_init = 0.05 * np.eye(state_dim)
    J = 1
    N = 12
    T = 25
    bf = 30
    T_sim = 100

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[0] = 0.5
    max_ang = (100 / 180 * np.pi) * 0.95
    R1 = LinearReward(state_dim, weights_l)

    C1 = SingleConstraint(1, low=-max_ang, high=max_ang, inside=False)
    C2 = SingleConstraint(2, low=-max_ang, high=max_ang, inside=False)
    C3 = SingleConstraint(3, low=-max_ang, high=max_ang, inside=False)
    R = CombinedRewards(state_dim, [R1, C1, C2, C3],
                        coefs=[1.0, -10.0, -10.0, -10.0])
    th = 0.2

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                         verbose=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                               verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R,
                  m_init=m_init, S_init=S_init)

    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    new_data = True
    eval_runs = T_sim
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=1, restarts=2)

        # Propagate the state distribution over the horizon and evaluate the
        # per-timestep probability of violating each constraint.
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risk1 = np.zeros(T)
        predicted_risk2 = np.zeros(T)
        predicted_risk3 = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risk1[h], _ = C1.compute_reward(m_h, S_h)
            predicted_risk2[h], _ = C2.compute_reward(m_h, S_h)
            predicted_risk3[h], _ = C3.compute_reward(m_h, S_h)
        # Probability of at least one violation, per constraint and overall.
        estimate_risk1 = 1 - np.prod(1.0 - predicted_risk1)
        estimate_risk2 = 1 - np.prod(1.0 - predicted_risk2)
        estimate_risk3 = 1 - np.prod(1.0 - predicted_risk3)
        overall_risk = 1 - (1 - estimate_risk1) * (1 - estimate_risk2) * (1 - estimate_risk3)

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim,
                                         verbose=True, SUBS=SUBS)
            new_data = True
            # Update dataset
            X = np.vstack((X, X_new[:T, :]))
            Y = np.vstack((Y, Y_new[:T, :]))
            pilco.mgpr.set_data((X, Y))
            # Relax the penalty on constraints that look comfortably safe.
            if estimate_risk1 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 0.75, 1.0, 1.0])
            if estimate_risk2 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 0.75, 1.0])
            if estimate_risk3 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 0.75])
        else:
            print("*********CHANGING***********")
            # Tighten the penalty on constraints that drive the risk up.
            if estimate_risk1 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.5, 1.0, 1.0])
            if estimate_risk2 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.5, 1.0])
            if estimate_risk3 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 1.5])
        _, _, r = pilco.predict(m_init, S_init, T)
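# The episode-level risk estimates above treat the per-timestep violation
# probabilities as independent: the chance of at least one violation is one
# minus the product of the per-step survival probabilities. A minimal
# numeric sketch (the values are illustrative, not from a real run):
import numpy as np

predicted_risk = np.array([0.01, 0.02, 0.05])     # per-step violation probabilities
episode_risk = 1 - np.prod(1.0 - predicted_risk)  # ~0.0783, just below the 0.08 sum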
m_init = np.transpose(X[0, :-1, None])
S_init = 0.1 * np.eye(state_dim)
max_yerror = 0.3
np.save(main_path + "/SAVED/data_mean_kn", data_mean)
np.save(main_path + "/SAVED/data_std_kn", data_std)
W = np.diag([1e-2, 1e-2, 1e-3, 1e-3])  # weights of the states
T_sim = 600
target = np.divide([0.0, 0.0, 0.0, 0.0] - data_mean, data_std)
R, controller = pilcotrac.get_RewardFun_and_Controller(
    state_dim, control_dim, data_mean, data_std, max_yerror, W)
pilco = PILCO(X, Y, num_induced_points=100, controller=controller,
              horizon=T, reward=R)

for model in pilco.mgpr.models:
    # model.clear()
    # model.kern.lengthscales.prior = gpflow.priors.Gamma(3, 20)  # priors have to be included
    # model.kern.variance.prior = gpflow.priors.Gamma(1.5, 4)     # before the model gets compiled
    model.likelihood.variance = 0.1
    model.likelihood.variance.trainable = False
    # model.compile()
    # vals = model.read_values()
    # print(vals)
                     render=True)
X = np.vstack((X, X_))
Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim

controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=bf, max_action=max_action)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R,
              m_init=m_init, S_init=S_init)

# For numerical stability, we can set the likelihood variance parameters of the GP models
for model in pilco.mgpr.models:
    model.likelihood.variance.assign(0.001)
    set_trainable(model.likelihood.variance, False)

r_new = np.zeros((T, 1))
for rollouts in range(N):
    print("**** ITERATION no", rollouts, " ****")
    pilco.optimize_models(maxiter=maxiter, restarts=2)
    pilco.optimize_policy(maxiter=maxiter, restarts=2)
    X_new, Y_new, _, _ = rollout(env,
Y = np.divide(Y1, np.std(X1[:, :2], 0))
state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
m_init = np.transpose(X[0, :-1, None])
S_init = 0.5 * np.eye(state_dim)

controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=25)
# Target and weights are expressed in the normalised state space.
R = ExponentialReward(state_dim=state_dim,
                      t=np.divide([0.5, 0.0] - env.m, env.std),
                      W=np.diag([0.5, 0.1]))
pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R,
              m_init=m_init, S_init=S_init)

best_r = 0
all_Rs = np.zeros((X.shape[0], 1))
for i in range(len(all_Rs)):
    all_Rs[i, 0] = R.compute_reward(X[i, None, :-1],
                                    0.001 * np.eye(state_dim))[0]

ep_rewards = np.zeros((len(X) // T, 1))
for i in range(len(ep_rewards)):
    ep_rewards[i] = sum(all_Rs[i * T:i * T + T])

for model in pilco.mgpr.models:
# Initial random rollouts to generate a dataset
X, Y = rollout(policy=random_policy, timesteps=40)
for i in range(1, 3):
    X_, Y_ = rollout(policy=random_policy, timesteps=40)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=5)
# controller = LinearController(state_dim=state_dim, control_dim=control_dim)

pilco = PILCO(X, Y, controller=controller, horizon=20)

# Example of fixing a parameter, optional, for a linear controller only
# pilco.controller.b = np.array([[0.0]])
# pilco.controller.b.trainable = False

for rollouts in range(3):
    pilco.optimize()
    # Pause between iterations for inspection (remove when not debugging).
    import pdb
    pdb.set_trace()
    X_new, Y_new = rollout(policy=pilco_policy, timesteps=100)
    # Update dataset
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_XY(X, Y)
for i in range(1, 4):
    # No action sampling; u := 0
    X_, Y_ = rollout(env=env, pilco=None, random=None, timesteps=80,
                     SUBS=subs, render=False)
    X = np.vstack((X, X_)).astype(np.float64)
    Y = np.vstack((Y, Y_)).astype(np.float64)

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=18,
                           max_action=env.action_space.high)
# controller = LinearController(state_dim=state_dim, control_dim=control_dim)
# print(X)
# pilco = PILCO(X, Y, controller=controller, horizon=40, num_induced_points=72)

# Example of a user-provided reward function, setting a custom target state
R = ExponentialReward(state_dim=state_dim, t=target)
pilco = PILCO(X, Y, controller=controller, horizon=20, reward=R,
              num_induced_points=72, m_init=m_init)

# Example of fixing a parameter, optional, for a linear controller only
# pilco.controller.b = np.array([[0.0]])
# pilco.controller.b.trainable = False

plot_model = False  # keep plotting off until iteration 5 is reached

for rollouts in range(20):
    print("Optimizing models")
    pilco.optimize_models()
    print("Optimizing controller")
    pilco.optimize_policy(maxiter=20)
    # import pdb; pdb.set_trace()
    X_new, Y_new = rollout(env=env, pilco=pilco, timesteps=120, SUBS=subs,
                           render=False)
    print("No of ops:", len(tf.get_default_graph().get_operations()))
    # Update dataset
    X = np.vstack((X, X_new)).astype(np.float64)
def pilco_run(env, N, J, safe=False, name='', seed=0, cont=None, rew=None,
              SUBS=1, sim_timesteps=50, plan_timesteps=30, restarts=1,
              maxiter=100, m_init=None, S_init=None, fixed_noise=None,
              logging=False, eval_runs=5, eval_max_timesteps=None,
              variable_episode_length=False):
    np.random.seed(seed)

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env=env, pilco=None, timesteps=sim_timesteps,
                         random=True, SUBS=SUBS)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env=env, pilco=None, timesteps=sim_timesteps,
                               random=True, SUBS=SUBS)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    if cont is None:
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=5)
    elif cont['type'] == 'rbf':
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=cont['basis_functions'],
                                   max_action=cont.get('max_action', 1.0))
    elif cont['type'] == 'linear':
        controller = LinearController(state_dim=state_dim,
                                      control_dim=control_dim,
                                      max_action=cont.get('max_action', 1.0))
    else:
        raise ValueError('Invalid Controller')

    if rew is None:
        reward = None
    elif rew['type'] == 'exp':
        reward = ExponentialReward(state_dim=state_dim, t=rew['t'], W=rew['W'])
    else:
        raise ValueError('This function only handles Exponential rewards for now')

    pilco = PILCO((X, Y), controller=controller, reward=reward,
                  horizon=plan_timesteps, m_init=m_init, S_init=S_init)

    if fixed_noise is not None:
        for model in pilco.mgpr.models:
            model.likelihood.variance.assign(fixed_noise)
            set_trainable(model.likelihood.variance, False)

    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    if name == '':
        from datetime import datetime
        current_time = datetime.now()
        name = current_time.strftime("%d_%m_%Y_%H_%M_%S")

    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models()
        pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

        X_new, Y_new, _, _ = rollout(env, pilco, timesteps=sim_timesteps,
                                     SUBS=SUBS, verbose=True)
        cur_rew = 0
        X = np.vstack((X, X_new[:plan_timesteps, :]))
        Y = np.vstack((Y, Y_new[:plan_timesteps, :]))
        pilco.mgpr.set_data((X, Y))

        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [X_eval_, _,
                 evaluation_returns_sampled[rollouts, k],
                 evaluation_returns_full[rollouts, k]] = rollout(
                     env, pilco, timesteps=eval_max_timesteps,
                     verbose=False, SUBS=SUBS, render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists("results/" + name):
                os.makedirs("results/" + name)
            # NOTE: name is used as a raw prefix below, so it should end with
            # a separator for the files to land inside results/<name>/.
            np.savetxt("results/" + name + "X_" + str(seed) + ".csv",
                       X, delimiter=',')
            np.savetxt("results/" + name + "X_eval_" + str(seed) + ".csv",
                       X_eval, delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_sampled_"
                       + str(seed) + ".csv",
                       evaluation_returns_sampled, delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_full_"
                       + str(seed) + ".csv",
                       evaluation_returns_full, delimiter=',')
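# An illustrative invocation of pilco_run: the environment choice and the
# exponential-reward target/weights are placeholders, but the dict keys
# match those parsed above.
import gym
import numpy as np

env = gym.make('InvertedPendulum-v2').env  # any continuous-control task; a guess here
cont = {'type': 'rbf', 'basis_functions': 30, 'max_action': 1.0}
rew = {'type': 'exp', 't': np.zeros(4), 'W': np.eye(4)}
pilco_run(env, N=10, J=3, cont=cont, rew=rew, SUBS=2,
          sim_timesteps=50, plan_timesteps=30, logging=False)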
state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
# m_init = np.transpose(X[0, :-1, None])
S_init = 0.05 * np.eye(state_dim)
np.save(main_path + "/SAVED/v4_data_mean_5", data_mean)
np.save(main_path + "/SAVED/v4_data_std_5", data_std)
W = np.diag(np.array([1.0, 0.5, 0.1, 0.1]))  # weights of the states
T_sim = 600
target = np.divide([0.0, 0.0, 0.0, 0.0] - data_mean, data_std)
R, controller = pilcotrac.get_RewardFun_and_Controller(
    state_dim, control_dim, data_mean, data_std, max_yerror, W)
# num_induced_points: number of pseudo-inputs for the sparse GP approximation
pilco = PILCO((X, Y), num_induced_points=100, controller=controller,
              horizon=T, reward=R, m_init=m_init, S_init=S_init)

for model in pilco.mgpr.models:
    # model.clear()
    # model.kern.lengthscales.prior = gpflow.priors.Gamma(3, 20)  # priors have to be included
    # model.kern.variance.prior = gpflow.priors.Gamma(1.5, 4)     # before the model gets compiled
    # Use a large noise variance to improve numerical stability
    # (avoids Cholesky decomposition failures).
    model.likelihood.variance.assign(0.05)
    set_trainable(model.likelihood.variance, False)
    # model.compile()
    # vals = model.read_values()
    # print(vals)
             render=False)
for i in range(1, J):
    X_, Y_, _, _ = rollout(env, None, timesteps=T, verbose=False, random=True,
                           SUBS=SUBS, render=False)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim

controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=10, max_action=max_action)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R,
              m_init=m_init, S_init=S_init)

pilco.optimize_models(maxiter=100)
pilco.optimize_policy(maxiter=20)
X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T, SUBS=SUBS, render=False)
class PilcoDynamicsLearner(DynamicsLearnerInterface):

    def __init__(self, history_length, prediction_horizon, ninducing_points,
                 nsampled_training_points=None, averaging=True):
        super().__init__(history_length, prediction_horizon,
                         averaging=averaging)
        self.ninducing_points = ninducing_points
        self.nsampled_training_points = nsampled_training_points

    def _learn(self, training_inputs, training_targets):
        assert self.ninducing_points

        # Subsampling the data if required.
        if self.nsampled_training_points:
            training_inputs, training_targets = self._subsample_training_set(
                training_inputs, training_targets)

        # Full GP if no inducing points are provided.
        self.pilco_ = PILCO(training_inputs, training_targets,
                            self.ninducing_points)
        self.pilco_.optimize_models(disp=True)

    def _predict(self, inputs):
        assert self.pilco_, "a trained model must be available"
        prediction = []
        for model in self.pilco_.mgpr.models:
            means, _ = model.predict_f(inputs)
            prediction.append(means)
        return np.hstack(prediction)

    def _subsample_training_set(self, training_inputs, training_targets):
        assert self.nsampled_training_points
        total_size = training_inputs.shape[0]
        permutation = np.random.permutation(
            total_size)[:self.nsampled_training_points]
        return training_inputs[permutation], training_targets[permutation]

    def name(self):
        return "pilco"

    def load(self, filename):
        params_dict = np.load(filename)
        for k in params_dict.keys():
            print(k, params_dict[k].shape)
        raise NotImplementedError  # TODO: parse the hyperparameters.

    def save(self, filename):
        """
        Stores the hyperparameters of the GPs, which include the inducing
        points in the case of sparse approximations.
        """
        params_dict = defaultdict(list)
        for model in self.pilco_.mgpr.models:
            params = model.read_trainables()
            for key in params.keys():
                params_dict[key].append(params[key])
        np.savez(filename, **params_dict)
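# A hypothetical usage sketch with dummy data; the constructor values and
# array shapes are placeholders, and in practice _learn/_predict would be
# reached through the DynamicsLearnerInterface entry points rather than
# called directly.
import numpy as np

learner = PilcoDynamicsLearner(history_length=1, prediction_horizon=1,
                               ninducing_points=50,
                               nsampled_training_points=200)
training_inputs = np.random.randn(500, 12)   # stacked state/action histories
training_targets = np.random.randn(500, 9)   # one GP per target dimension
learner._learn(training_inputs, training_targets)
predictions = learner._predict(training_inputs[:10])  # stacked per-model means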
        X.append(np.hstack((x, u)))
        Y.append(x_new - x)
        x = x_new
    return np.stack(X), np.stack(Y)


# Initial random rollouts to generate a dataset
def random_policy(x):
    return env.action_space.sample() / 5

X1, Y1 = rollout(policy=random_policy, timesteps=40)
X2, Y2 = rollout(policy=random_policy, timesteps=40)
X = np.vstack((X1, X2))
Y = np.vstack((Y1, Y2))

# Define PILCO on the two random rollouts
pilco = PILCO(X, Y, horizon=50)

# Example of fixing a parameter
pilco.controller.b = np.array([[0.0]])
pilco.controller.b.trainable = False

def pilco_policy(x):
    return pilco.compute_action(x[None, :])[0, :]

for rollouts in range(2):
    pilco.optimize()
    import pdb; pdb.set_trace()
    X_new, Y_new = rollout(policy=pilco_policy, timesteps=150)
    # Update dataset
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_XY(X, Y)
def swimmer_run(name, seed):
    env = gym.make('Swimmer-v2').env
    # env = SwimmerWrapper()
    state_dim = 8
    control_dim = 2
    SUBS = 5
    maxiter = 120
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
    S_init = 0.005 * np.eye(state_dim)
    J = 10
    N = 15
    T = 15
    bf = 40
    T_sim = 50

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[3] = 1.0
    max_ang = 95 / 180 * np.pi
    t1 = np.zeros(state_dim)
    t1[2] = max_ang
    w1 = 1e-6 * np.eye(state_dim)
    w1[2, 2] = 10
    t2 = np.zeros(state_dim)
    t2[1] = max_ang
    w2 = 1e-6 * np.eye(state_dim)
    w2[1, 1] = 10
    t5 = np.zeros(state_dim)
    # t5[0] = max_ang
    # w3 = 1e-6 * np.eye(state_dim)
    # w3[0, 0] = 5
    t3 = np.zeros(state_dim)
    t3[2] = -max_ang
    t4 = np.zeros(state_dim)
    t4[1] = -max_ang
    # t6 = np.zeros(state_dim); t6[0] = -max_ang
    R2 = LinearReward(state_dim, weights_l)
    R3 = ExponentialReward(state_dim, W=w1, t=t1)
    R4 = ExponentialReward(state_dim, W=w2, t=t2)
    R5 = ExponentialReward(state_dim, W=w1, t=t3)
    R6 = ExponentialReward(state_dim, W=w2, t=t4)
    # R7 = ExponentialReward(state_dim, W=w3, t=t5)
    # R8 = ExponentialReward(state_dim, W=w3, t=t6)
    Rew = CombinedRewards(state_dim, [R2, R3, R4, R5, R6],
                          coefs=[1.0, -1.0, -1.0, -1.0, -1.0])
    # Rew = R2

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                         verbose=True, render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                               verbose=True, render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim,
    #                               max_action=max_action)
    pilco = PILCO((X, Y), controller=controller, horizon=T, reward=Rew,
                  m_init=m_init, S_init=S_init)

    # for model in pilco.mgpr.models:
    #     model.likelihood.variance = 0.0001
    #     model.likelihood.variance.trainable = False

    logging = True  # to save results in .csv files, set this flag to True
    eval_runs = 10
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    eval_max_timesteps = 1000 // SUBS
    X_eval = False
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim,
                                     verbose=True, SUBS=SUBS, render=False)

        cur_rew = 0
        for t in range(0, len(X_new)):
            cur_rew += Rew.compute_reward(
                X_new[t, 0:state_dim, None].transpose(),
                0.0001 * np.eye(state_dim))[0]
            if t == T:
                print('On this episode, on the planning horizon, '
                      'PILCO reward was: ', cur_rew)
        print('On this episode PILCO reward was ', cur_rew)

        gym_steps = 1000
        T_eval = gym_steps // SUBS

        # Update dataset
        X = np.vstack((X, X_new[:T, :]))
        Y = np.vstack((Y, Y_new[:T, :]))
        pilco.mgpr.set_data((X, Y))

        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = T_sim  # fallback; already set above
            for k in range(0, eval_runs):
                [X_eval_, _,
                 evaluation_returns_sampled[rollouts, k],
                 evaluation_returns_full[rollouts, k]] = rollout(
                     env, pilco, timesteps=eval_max_timesteps,
                     verbose=False, SUBS=SUBS, render=False)
                if rollouts == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists(name):
                os.makedirs(name)
            np.savetxt(name + "X_" + str(seed) + ".csv", X, delimiter=',')
            np.savetxt(name + "X_eval_" + str(seed) + ".csv", X_eval,
                       delimiter=',')
            np.savetxt(name + "evaluation_returns_sampled_" + str(seed) + ".csv",
                       evaluation_returns_sampled, delimiter=',')
            np.savetxt(name + "evaluation_returns_full_" + str(seed) + ".csv",
                       evaluation_returns_full, delimiter=',')
             random=True, timesteps=T_sim, render=False, verbose=verbose)
X = np.vstack((X, X_))
Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean

# controller = RbfController(state_dim=state_dim, control_dim=control_dim,
#                            num_basis_functions=40)
controller = LinearController(state_dim=state_dim, control_dim=control_dim)

pilco = PILCO((X, Y), controller=controller, horizon=T, m_init=m_init)
pilco.controller.max_action = e

# For numerical stability, fix the likelihood variance of the GP models.
for model in pilco.mgpr.models:
    model.likelihood.variance.assign(0.001)
    set_trainable(model.likelihood.variance, False)
    # model.likelihood.fixed = True

return_lst = []
task_length = []
for rollouts in range(100):
    print("**** ITERATION no.", rollouts, " ****")
    try:
        pilco.optimize_models(maxiter=maxiter)
        pilco.optimize_policy(maxiter=maxiter)
X = []
Y = []
for rollouts in range(5):
    env.reset()
    x, _, _, _ = env.step(0)
    for i in range(50):
        env.render()
        u = env.action_space.sample() / 5
        x_, _, _, _ = env.step(u)
        X.append(np.hstack((x, u)))
        Y.append(x_ - x)
        x = x_

pilco = PILCO(np.stack(X), np.stack(Y))
pilco.controller.t.trainable = False
pilco.controller.b.trainable = False

pilco.optimize()
import pdb
pdb.set_trace()

env.reset()
x, _, _, _ = env.step(0)
for i in range(50):
    env.render()
    u = pilco.compute_action(x[None, :])
    x_, _, _, _ = env.step(u)
    X.append(np.hstack((x, u[0, :])))
    Y.append(x_ - x)