controller = RbfController(state_dim=state_dim,
                           control_dim=control_dim, num_basis_functions=5)
# controller = LinearController(state_dim=state_dim, control_dim=control_dim)

pilco = PILCO(X, Y, controller=controller, horizon=40)

# Example of a user-provided reward function, setting a custom target state
# R = ExponentialReward(state_dim=state_dim, t=np.array([0.1, 0, 0, 0]))
# pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)

# Example of fixing a parameter; optional, for a linear controller only
# pilco.controller.b = np.array([[0.0]])
# pilco.controller.b.trainable = False

for rollouts in range(3):
    pilco.optimize_models()
    pilco.optimize_policy()
    X_new, Y_new, _ = rollout(env=env, pilco=pilco, timesteps=100)
    # Update dataset
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_XY(X, Y)

input("Done training! Press enter to evaluate the policy.")

trained_rewards = []
for i in range(1, 100):
    X_, Y_, rewards = rollout(env=env, pilco=pilco, timesteps=100)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))
    trained_rewards.append(sum(rewards))
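# The loops above rely on a `rollout` helper from the examples' utils module,
# which is not shown here. Below is a minimal sketch of its contract, assuming
# a Gym-style env and the (state, action) -> state-delta convention that
# PILCO's GP models expect; the real helper also supports SUBS sub-sampling,
# rendering and richer return values, which is why its signature varies
# between the snippets in this file.
def rollout_sketch(env, pilco, timesteps, random=False):
    """Collect (state, action) inputs X and state-difference targets Y."""
    X, Y, rewards = [], [], []
    x = env.reset()
    for _ in range(timesteps):
        if random or pilco is None:
            u = env.action_space.sample()               # exploratory action
        else:
            u = pilco.compute_action(x[None, :])[0, :]  # policy action
        x_new, r, done, _ = env.step(u)
        X.append(np.hstack((x, u)))   # GP input: state and action
        Y.append(x_new - x)           # GP target: state difference
        rewards.append(r)
        x = x_new
        if done:
            break
    return np.stack(X), np.stack(Y), rewards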
pilco = PILCO((X, Y), controller=controller, horizon=T, m_init=m_init)
pilco.controller.max_action = e

# For numerical stability, fix the observation noise variance
for model in pilco.mgpr.models:
    model.likelihood.variance.assign(0.001)
    set_trainable(model.likelihood.variance, False)
    # model.likelihood.fixed = True

return_lst = []
task_length = []
for rollouts in range(100):
    print("**** ITERATION no.", rollouts, " ****")
    try:
        pilco.optimize_models(maxiter=maxiter)
        pilco.optimize_policy(maxiter=maxiter)
    except Exception:
        # for i in range(len(return_lst)):
        #     return_lst[i] = str(return_lst[i])
        # df = pd.DataFrame(return_lst, columns=['Return per epoch'])
        # df.to_csv('/home/lab/Github/PILCO/log/Return/{}.csv'.format(time.time()))
        print('Optimization failed at this iteration!')
    X_new, Y_new, _, ep_return_lst, ep_length = rollout(env=env, pilco=pilco,
                                                        timesteps=1000,
                                                        render=False,
                                                        verbose=verbose)
    return_lst.append(ep_return_lst)
    task_length.append(ep_length)
    # Update dataset
def safe_swimmer_run(seed=0, logging=False):
    env = SwimmerWrapper()
    state_dim = 9
    control_dim = 2
    SUBS = 2
    maxiter = 60
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
    S_init = 0.05 * np.eye(state_dim)
    J = 1
    N = 12
    T = 25
    bf = 30
    T_sim = 100

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[0] = 0.5
    max_ang = (100 / 180 * np.pi) * 0.95
    R1 = LinearReward(state_dim, weights_l)

    C1 = SingleConstraint(1, low=-max_ang, high=max_ang, inside=False)
    C2 = SingleConstraint(2, low=-max_ang, high=max_ang, inside=False)
    C3 = SingleConstraint(3, low=-max_ang, high=max_ang, inside=False)
    R = CombinedRewards(state_dim, [R1, C1, C2, C3],
                        coefs=[1.0, -10.0, -10.0, -10.0])
    th = 0.2

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)

    pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R,
                  m_init=m_init, S_init=S_init)

    # For numerical stability, fix the observation noise variance
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    new_data = True
    eval_runs = T_sim
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=1, restarts=2)

        # Propagate the state distribution over the horizon and evaluate the
        # probability of violating each constraint at every step
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risk1 = np.zeros(T)
        predicted_risk2 = np.zeros(T)
        predicted_risk3 = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risk1[h], _ = C1.compute_reward(m_h, S_h)
            predicted_risk2[h], _ = C2.compute_reward(m_h, S_h)
            predicted_risk3[h], _ = C3.compute_reward(m_h, S_h)
        # Probability of at least one violation over the horizon, per constraint
        estimate_risk1 = 1 - np.prod(1.0 - predicted_risk1)
        estimate_risk2 = 1 - np.prod(1.0 - predicted_risk2)
        estimate_risk3 = 1 - np.prod(1.0 - predicted_risk3)
        overall_risk = (1 - (1 - estimate_risk1) * (1 - estimate_risk2)
                        * (1 - estimate_risk3))

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS)
            new_data = True
            # Update dataset
            X = np.vstack((X, X_new[:T, :]))
            Y = np.vstack((Y, Y_new[:T, :]))
            pilco.mgpr.set_data((X, Y))
            # If a constraint is comfortably satisfied, relax its penalty
            if estimate_risk1 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 0.75, 1.0, 1.0])
            if estimate_risk2 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 0.75, 1.0])
            if estimate_risk3 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 0.75])
        else:
            print("*********CHANGING***********")
            # Tighten the penalty on any constraint that looks too risky
            if estimate_risk1 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.5, 1.0, 1.0])
            if estimate_risk2 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.5, 1.0])
            if estimate_risk3 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 1.5])
            _, _, r = pilco.predict(m_init, S_init, T)
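# Sanity check of the risk aggregation used above: if the per-step violation
# probabilities were independent, the probability of at least one violation
# over the horizon would be 1 - prod(1 - p_t). A tiny worked example:
p = np.array([0.01, 0.02, 0.05])
risk = 1 - np.prod(1.0 - p)  # ~0.078, larger than any single-step risk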
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
pilco = PILCO(X, Y, controller=controller, horizon=T, reward=R,
              m_init=m_init, S_init=S_init)

# For numerical stability
for model in pilco.mgpr.models:
    # model.kern.lengthscales.prior = gpflow.priors.Gamma(1, 10)  # priors have to be included
    # model.kern.variance.prior = gpflow.priors.Gamma(1.5, 2)     # before the model gets compiled
    model.likelihood.variance = 0.001
    model.likelihood.variance.trainable = False

for rollouts in range(N):
    print("**** ITERATION no", rollouts, " ****")
    pilco.optimize_models(maxiter=maxiter, restarts=2)
    pilco.optimize_policy(maxiter=maxiter, restarts=2)

    X_new, Y_new = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS)

    # Since we had to decide on the various parameters of the reward function,
    # we might want to verify that it behaves as expected by inspection:
    # cur_rew = 0
    # for t in range(0, len(X_new)):
    #     cur_rew += reward_wrapper(R, X_new[t, 0:state_dim, None].transpose(),
    #                               0.0001 * np.eye(state_dim))[0]
    # print('On this episode reward was ', cur_rew)

    # Update dataset
    X = np.vstack((X, X_new[:T, :]))
    Y = np.vstack((Y, Y_new[:T, :]))
    pilco.mgpr.set_XY(X, Y)
    lens.append(len(X_new))
control_dim = X.shape[1] - state_dim
controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=10, max_action=max_action)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
pilco = PILCO(X, Y, controller=controller, horizon=T, reward=R,
              m_init=m_init, S_init=S_init)

pilco.optimize_models(maxiter=100)
pilco.optimize_policy(maxiter=20)

# Rollout using the PILCO controller
X_new, Y_new = rollout(env, pilco, timesteps=T, SUBS=SUBS, render=False)

# Plot the one-step-ahead model predictions against the observed targets,
# with a 2-sigma band around the predicted mean
for i, m in enumerate(pilco.mgpr.models):
    y_pred_test, var_pred_test = m.predict_y(X_new)
    plt.plot(range(len(y_pred_test)), y_pred_test[:, 0], label='predicted')
    plt.plot(range(len(Y_new)), Y_new[:, i], label='observed')
    plt.fill_between(range(len(y_pred_test)),
                     y_pred_test[:, 0] - 2 * np.sqrt(var_pred_test[:, 0]),
                     y_pred_test[:, 0] + 2 * np.sqrt(var_pred_test[:, 0]),
                     alpha=0.3)
    plt.legend()
    plt.show()
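# A scalar complement to the plots above: per-dimension RMSE and standardized
# one-step errors. A minimal sketch, assuming predict_y returns eager tensors
# convertible to NumPy; standardized errors mostly within +/- 2 suggest the
# GP's error bars are plausible:
for i, m in enumerate(pilco.mgpr.models):
    y_pred, var_pred = m.predict_y(X_new)
    y_pred, var_pred = np.asarray(y_pred), np.asarray(var_pred)
    z = (Y_new[:, i:i + 1] - y_pred) / np.sqrt(var_pred)  # standardized error
    print('dim', i,
          'RMSE:', float(np.sqrt(np.mean((Y_new[:, i:i + 1] - y_pred) ** 2))),
          'mean |z|:', float(np.mean(np.abs(z))))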
def swimmer_run(name, seed):
    env = gym.make('Swimmer-v2').env
    # env = SwimmerWrapper()
    state_dim = 8
    control_dim = 2
    SUBS = 5
    maxiter = 120
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
    S_init = 0.005 * np.eye(state_dim)
    J = 10
    N = 15
    T = 15
    bf = 40
    T_sim = 50

    # Linear reward on forward velocity, plus penalties that discourage the
    # joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[3] = 1.0
    max_ang = 95 / 180 * np.pi
    t1 = np.zeros(state_dim)
    t1[2] = max_ang
    w1 = 1e-6 * np.eye(state_dim)
    w1[2, 2] = 10
    t2 = np.zeros(state_dim)
    t2[1] = max_ang
    w2 = 1e-6 * np.eye(state_dim)
    w2[1, 1] = 10
    t5 = np.zeros(state_dim)
    # t5[0] = max_ang
    # w3 = 1e-6 * np.eye(state_dim)
    # w3[0, 0] = 5
    t3 = np.zeros(state_dim)
    t3[2] = -max_ang
    t4 = np.zeros(state_dim)
    t4[1] = -max_ang
    # t6 = np.zeros(state_dim); t6[0] = -max_ang
    R2 = LinearReward(state_dim, weights_l)
    R3 = ExponentialReward(state_dim, W=w1, t=t1)
    R4 = ExponentialReward(state_dim, W=w2, t=t2)
    R5 = ExponentialReward(state_dim, W=w1, t=t3)
    R6 = ExponentialReward(state_dim, W=w2, t=t4)
    # R7 = ExponentialReward(state_dim, W=w3, t=t5)
    # R8 = ExponentialReward(state_dim, W=w3, t=t6)
    Rew = CombinedRewards(state_dim, [R2, R3, R4, R5, R6],
                          coefs=[1.0, -1.0, -1.0, -1.0, -1.0])
    # Rew = R2

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                         verbose=True, render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                               verbose=True, render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim,
    #                               max_action=max_action)

    pilco = PILCO((X, Y), controller=controller, horizon=T, reward=Rew,
                  m_init=m_init, S_init=S_init)
    # for model in pilco.mgpr.models:
    #     model.likelihood.variance = 0.0001
    #     model.likelihood.variance.trainable = False

    logging = True  # set to True to save results in .csv files
    eval_runs = 10
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    eval_max_timesteps = 1000 // SUBS
    X_eval = False
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True,
                                     SUBS=SUBS, render=False)

        # Evaluate the learned reward along the executed trajectory
        cur_rew = 0
        for t in range(0, len(X_new)):
            cur_rew += Rew.compute_reward(X_new[t, 0:state_dim, None].transpose(),
                                          0.0001 * np.eye(state_dim))[0]
            if t == T:
                print('On this episode, on the planning horizon, PILCO reward was: ', cur_rew)
        print('On this episode PILCO reward was ', cur_rew)

        gym_steps = 1000
        T_eval = gym_steps // SUBS

        # Update dataset
        X = np.vstack((X, X_new[:T, :]))
        Y = np.vstack((Y, Y_new[:T, :]))
        pilco.mgpr.set_data((X, Y))

        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = T_sim
            for k in range(0, eval_runs):
                [X_eval_, _,
                 evaluation_returns_sampled[rollouts, k],
                 evaluation_returns_full[rollouts, k]] = rollout(env, pilco,
                                                                 timesteps=eval_max_timesteps,
                                                                 verbose=False,
                                                                 SUBS=SUBS,
                                                                 render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists(name):
                os.makedirs(name)
            np.savetxt(name + "X_" + str(seed) + ".csv", X, delimiter=',')
            np.savetxt(name + "X_eval_" + str(seed) + ".csv", X_eval, delimiter=',')
            np.savetxt(name + "evaluation_returns_sampled_" + str(seed) + ".csv",
                       evaluation_returns_sampled, delimiter=',')
            np.savetxt(name + "evaluation_returns_full_" + str(seed) + ".csv",
                       evaluation_returns_full, delimiter=',')
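# Hypothetical invocation of swimmer_run; `name` is used as a raw filename
# prefix above, so a trailing separator keeps the outputs inside a directory:
# swimmer_run('results/swimmer/', seed=0)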
def pilco_run(env, N, J, safe=False, name='', seed=0, cont=None, rew=None,
              SUBS=1, sim_timesteps=50, plan_timesteps=30, restarts=1,
              maxiter=100, m_init=None, S_init=None, fixed_noise=None,
              logging=False, eval_runs=5, eval_max_timesteps=None,
              variable_episode_length=False):
    np.random.seed(seed)

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env=env, pilco=None, timesteps=sim_timesteps,
                         random=True, SUBS=SUBS)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env=env, pilco=None, timesteps=sim_timesteps,
                               random=True, SUBS=SUBS)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    if cont is None:
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=5)
    elif cont['type'] == 'rbf':
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=cont['basis_functions'],
                                   max_action=cont.get('max_action', 1.0))
    elif cont['type'] == 'linear':
        controller = LinearController(state_dim=state_dim, control_dim=control_dim,
                                      max_action=cont.get('max_action', 1.0))
    else:
        raise ValueError('Invalid controller type')

    if rew is None:
        reward = None
    elif rew['type'] == 'exp':
        reward = ExponentialReward(state_dim=state_dim, t=rew['t'], W=rew['W'])
    else:
        raise ValueError('This function only handles exponential rewards for now')

    pilco = PILCO((X, Y), controller=controller, reward=reward,
                  horizon=plan_timesteps, m_init=m_init, S_init=S_init)

    # Optionally fix the observation noise variance for numerical stability
    if fixed_noise is not None:
        for model in pilco.mgpr.models:
            model.likelihood.variance.assign(fixed_noise)
            set_trainable(model.likelihood.variance, False)

    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    if name == '':
        from datetime import datetime
        current_time = datetime.now()
        name = current_time.strftime("%d_%m_%Y_%H_%M_%S")

    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models()
        pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

        X_new, Y_new, _, _ = rollout(env, pilco, timesteps=sim_timesteps,
                                     SUBS=SUBS, verbose=True)

        # Update dataset
        X = np.vstack((X, X_new[:plan_timesteps, :]))
        Y = np.vstack((Y, Y_new[:plan_timesteps, :]))
        pilco.mgpr.set_data((X, Y))

        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [X_eval_, _,
                 evaluation_returns_sampled[rollouts, k],
                 evaluation_returns_full[rollouts, k]] = rollout(env, pilco,
                                                                 timesteps=eval_max_timesteps,
                                                                 verbose=False,
                                                                 SUBS=SUBS,
                                                                 render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists("results/" + name):
                os.makedirs("results/" + name)
            np.savetxt("results/" + name + "/X_" + str(seed) + ".csv",
                       X, delimiter=',')
            np.savetxt("results/" + name + "/X_eval_" + str(seed) + ".csv",
                       X_eval, delimiter=',')
            np.savetxt("results/" + name + "/evaluation_returns_sampled_" + str(seed) + ".csv",
                       evaluation_returns_sampled, delimiter=',')
            np.savetxt("results/" + name + "/evaluation_returns_full_" + str(seed) + ".csv",
                       evaluation_returns_full, delimiter=',')
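# Hypothetical usage of pilco_run; the environment, target t and weight
# matrix W below are illustrative, not taken from the original experiments:
# env = gym.make('Pendulum-v0').env
# cont = {'type': 'rbf', 'basis_functions': 30, 'max_action': 2.0}
# rew = {'type': 'exp', 't': np.zeros((1, 3)), 'W': np.eye(3)}
# pilco_run(env, N=8, J=3, cont=cont, rew=rew, SUBS=3,
#           sim_timesteps=40, plan_timesteps=40, logging=True, seed=0)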
    all_Rs[i, 0] = R.compute_reward(X[i, None, :-1], 0.001 * np.eye(state_dim))[0]  # reward for each step

# Total reward per episode
total_rewards = np.zeros((len(X) // T, 1))
ep_rewards = np.zeros((len(X) // T, 1))
for i in range(len(total_rewards)):
    total_rewards[i] = sum(all_Rs[i * T: i * T + T])
    ep_rewards[i] = sum(all_Rs[i * T: i * T + T])

for rollouts in range(1, 31):
    # Optimization
    print("**** ITERATION no", rollouts, " ****")
    pilco.optimize_models(maxiter=2000, restarts=3)  # restarts help avoid local minima
    pilco.optimize_policy(maxiter=2000, restarts=3)

    # Initialize the robot state
    robot_state.set_pose(x=offsetPath[0][0], y=offsetPath[1][0],
                         yaw=offsetPath[2][0], yawRate=0.0)
    robot_state.set_steerAngle(steerAngle=0.0)
    pathHandler.reset__index_nn()

    # Testing
    X_new, Y_new, j = pilcotrac.rollout(pilco, max_yerror, data_mean=data_mean,
                                        data_std=data_std, lookahead=1.22,
                                        timesteps=T_sim, random=False, verbose=False)
    r_new = np.zeros((len(X_new), 1))
    var_r = np.zeros((len(X_new), 1))
    r_tar = np.zeros((len(X_new), 1))
    for i in range(len(X_new)):
        r_new[i, 0], var_r[i, 0] = R.compute_reward(X_new[i, None, :-1], S_init)
        r_tar[i, 0] = R.compute_reward(target[None, :], S_init)[0]
X = X.values
Y = Y.values
print('startk = ', startk)
print('X.shape = ', X.shape)
print('Y.shape =', Y.shape)
# X = X.T; Y = Y.T

T = int(30 * 2)
m_init = np.reshape([0.0, 0.0, 0.0, 0.0], (1, 4))
S_init = np.diag([0.01, 0.01, 0.01, 0.01])
controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=10)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
pilco_model = PILCO((X, Y), controller=controller, horizon=T, reward=R,
                    m_init=m_init, S_init=S_init)

pilco_model.optimize_models(maxiter=100)
pilco_model.optimize_policy(maxiter=20)

# Freeze the trained model and serialize it to disk
with open(out_fname, 'wb') as wf:
    frozen_model = gpflow.utilities.freeze(pilco_model)
    dill.dump(frozen_model, wf)
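# The frozen model can later be restored with dill. A minimal sketch, assuming
# the PILCO class definitions are importable at load time and that the frozen
# module still exposes compute_action:
# with open(out_fname, 'rb') as rf:
#     restored = dill.load(rf)
# u = restored.compute_action(m_init)  # query the trained policy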