def test_rbf():
    np.random.seed(0)
    d = 3    # Input dimension
    k = 2    # Number of outputs
    b = 100  # Basis functions

    # Training dataset
    X0 = np.random.rand(100, d)
    A = np.random.rand(d, k)
    Y0 = np.sin(X0).dot(A) + 1e-3 * (np.random.rand(100, k) - 0.5)  # Just something smooth
    rbf = RbfController(3, 2, b)
    rbf.set_XY(X0, Y0)

    # Generate input
    m = np.random.rand(1, d)  # But MATLAB defines it as m'
    s = np.random.rand(d, d)
    s = s.dot(s.T)  # Make s positive semidefinite

    M, S, V = rbf.compute_action(m, s, squash=False)
    print("M\n", M)
    print("S\n", S)
    print("V\n", V)

    # Convert data to the struct expected by the MATLAB implementation
    lengthscales = rbf.model.covar_module.lengthscale.cpu().detach().numpy().squeeze()
    variance = 1 * np.ones(k)
    noise = rbf.model.likelihood.noise.cpu().detach().numpy().squeeze()
    hyp = np.log(
        np.hstack((lengthscales, np.sqrt(variance[:, None]), np.sqrt(noise[:, None])))).T

    gpmodel = oct2py.io.Struct()
    gpmodel.hyp = hyp
    gpmodel.inputs = X0
    gpmodel.targets = Y0

    # Call gp2 in Octave
    M_mat, S_mat, V_mat = octave.gp2(gpmodel, m.T, s, nout=3)

    assert M.shape == M_mat.T.shape
    assert S.shape == S_mat.shape
    assert V.shape == V_mat.shape
    np.testing.assert_allclose(M, M_mat.T, rtol=1e-4)
    np.testing.assert_allclose(S, S_mat, rtol=1e-4)
    np.testing.assert_allclose(V, V_mat, rtol=1e-4)


def load_and_run_model(env, name):
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    print('Running {:s}'.format(name))
    rollout(env, pilco, timesteps=T_sim, verbose=False, SUBS=SUBS)


def loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
    # observation_space = env.observation_space.shape[0]
    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            # TODO: run pi-adjust
            action = utils.policy(env, pilco, state, False)
            # TODO: when running pi-adjust, comment out the next line
            state_next, reward, terminal, info = env.step(action)
            # reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
    env.env.close()


def plot_pilco_source_learning_curve():
    env = gym.make('continuous-cartpole-v0')
    env.seed(73)
    pilcos = ['initial'] + [str(i) for i in range(6)]
    rewards = []
    for i, p in enumerate(pilcos):
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=bf, max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(p),
                           controller=controller, reward=R, sparse=False)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        xs = []
        angles = []
        while True:
            xs.append(state[0])
            angles.append(state[2])
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        rewards.append(step)
        plt.plot(xs, angles)
        plt.savefig('pilco-{:d}_states_plot'.format(i), bbox_inches="tight")
        plt.close()
    env.close()
    plt.plot([i for i, _ in enumerate(pilcos)], rewards)
    plt.savefig('pilco_rewards_plot', bbox_inches="tight")
    plt.close()
    return rewards, xs, angles


def piadjust(NT, name):
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)

    env_S = gym.make('continuous-cartpole-v0')
    env_S.seed(73)
    env_T = gym.make('continuous-cartpole-v99')
    env_T.seed(73)

    D_S = sampler(pilco, env_S, samples_n=30, trials=50)
    # print('D_S sampling done')
    D_T = None
    pi_adj = pilco
    for i in range(NT):
        print('{:d}/{:d}'.format(i + 1, NT))
        D_adj = []
        if i == 0:
            D_i_T = sampler(pilco, env_T, samples_n=30)
        else:
            D_i_T = sampler_adj(pi_adj, pilco, env_T, 30)
        if D_T is not None:
            D_T = np.concatenate((D_i_T, D_T))
        else:
            D_T = D_i_T
        # print('Going for inverse dyn')
        gpr = inverse_dyn(D_T)
        # print('inverse dyn done')
        for samp in D_S:
            x_s = list(samp[0])
            x_s1 = list(samp[2])
            u_t_S = samp[1]
            a = np.array(x_s + x_s1).reshape(1, 8)
            u_t_T = gpr.predict(a, return_std=False)
            D_adj.append((x_s, u_t_S, u_t_T - u_t_S))
        # print('Going for L3')
        pi_adj = L3(D_adj)
        # print('L3 Done')
        # i = i + 1
        # if (i % 1 == 0):
        save_object(pi_adj, 'transfer-save/pilco-{:s}-transfer-{:d}.pkl'.format(name, i))
    env_S.env.close()
    env_T.env.close()
    return pi_adj


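# `save_object` is not defined in this snippet; elsewhere in this repository the
# saved .pkl files are read back with pickle.load, so a minimal sketch consistent
# with that usage (an assumption, not the repository's actual helper) would be:
import pickle


def save_object_sketch(obj, filename):
    # Serialise `obj` (e.g. the adjusted policy model) so it can later be
    # reloaded with pickle.load, as done in the loader functions below.
    with open(filename, 'wb') as outp:
        pickle.dump(obj, outp)

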
def test_rbf():
    np.random.seed(0)
    d = 3    # Input dimension
    k = 2    # Number of outputs
    b = 100  # Basis functions

    # Training dataset
    X0 = np.random.rand(100, d)
    A = np.random.rand(d, k)
    Y0 = np.sin(X0).dot(A) + 1e-3 * (np.random.rand(100, k) - 0.5)  # Just something smooth
    rbf = RbfController(3, 2, b)
    rbf.set_XY(X0, Y0)

    # Generate input
    m = np.random.rand(1, d)  # But MATLAB defines it as m'
    s = np.random.rand(d, d)
    s = s.dot(s.T)  # Make s positive semidefinite

    M, S, V = compute_action_wrapper(rbf, m, s)

    # Convert data to the struct expected by the MATLAB implementation
    lengthscales = np.stack([model.kern.lengthscales.value for model in rbf.models])
    variance = np.stack([model.kern.variance.value for model in rbf.models])
    noise = np.stack([model.likelihood.variance.value for model in rbf.models])
    hyp = np.log(
        np.hstack((lengthscales, np.sqrt(variance[:, None]), np.sqrt(noise[:, None])))).T

    gpmodel = oct2py.io.Struct()
    gpmodel.hyp = hyp
    gpmodel.inputs = X0
    gpmodel.targets = Y0

    # Call gp2 in Octave
    M_mat, S_mat, V_mat = octave.gp2(gpmodel, m.T, s, nout=3)

    assert M.shape == M_mat.T.shape
    assert S.shape == S_mat.shape
    assert V.shape == V_mat.shape
    np.testing.assert_allclose(M, M_mat.T, rtol=1e-4)
    np.testing.assert_allclose(S, S_mat, rtol=1e-4)
    np.testing.assert_allclose(V, V_mat, rtol=1e-4)


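# The test above relies on compute_action_wrapper, which is not shown here. Below
# is a minimal sketch of what such a helper could look like, assuming it simply
# forwards to the controller's compute_action with squashing disabled, mirroring
# the call rbf.compute_action(m, s, squash=False) used in the other test_rbf
# variant above. This is an illustrative assumption, not the actual definition.
def compute_action_wrapper_sketch(controller, m, s):
    # Returns the predicted mean M, covariance S and input-output covariance V
    # of the (unsquashed) control distribution for input mean m and covariance s.
    return controller.compute_action(m, s, squash=False)

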
def see_progression(pilco_name='saved/pilco-continuous-cartpole-5',
                    transfer_name='{:d}true_dyn_pi_adj.pkl',
                    adjust=True):
    env = gym.make('continuous-cartpole-v99')
    env.seed(1)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco(pilco_name, controller=controller, reward=R, sparse=False)
    rewards = []
    for i in range(10):
        print('Running {:s}'.format(transfer_name.format(i)))
        if adjust:
            with open(transfer_name.format(i), 'rb') as inp2:
                pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            if adjust:
                pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
            else:
                pi_adjust_action = 0  # Run without the adjustment
            state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        rewards.append(step)
    env.close()
    return rewards


def run_transfer_model_and_plot_pos(env_name, pilco_name, fig_file_name, fig_title,
                                    transfer_name=None, save=True):
    env = gym.make(env_name)
    env.seed(1)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(pilco_name),
                       controller=controller, reward=R, sparse=False)
    if transfer_name is not None:
        with open(transfer_name, 'rb') as inp2:
            pi_adjust = pickle.load(inp2)

    xs = []
    angles = []
    state = env.reset()
    for _ in range(1000):
        xs.append(state[0])
        angles.append(state[2])
        env.render()
        u_action = policy(env, pilco, state, False)
        state_copy = state
        a = np.ndarray.tolist(state_copy)
        a.extend(np.ndarray.tolist(u_action))
        if transfer_name is not None:
            pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
        else:
            pi_adjust_action = 0
        state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
        state = state_next
        if terminal:
            break
    env.close()

    xs = np.array(xs)
    angles = np.array(angles)
    plt.plot(xs, angles)
    plt.quiver(xs[:-1], angles[:-1], xs[1:] - xs[:-1], angles[1:] - angles[:-1],
               scale_units='xy', angles='xy', scale=1, color='blue', width=1e-2)
    plt.xlabel('position')
    plt.ylabel('angle')
    plt.title(fig_title)
    plt.xlim(-0.2, 0.2)
    plt.ylim(-0.2, 0.2)
    if save:
        plt.savefig(fig_file_name, bbox_inches="tight")
        plt.close()


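# Hedged usage sketch (an assumption, not part of the original file): compare the
# source policy on the target environment with and without a learned adjustment.
# The environment id, model name '5' and the '9true_dyn_pi_adj.pkl' file follow
# the naming used elsewhere in this repository; the figure names are illustrative.
def _example_plot_transfer_runs():
    run_transfer_model_and_plot_pos('continuous-cartpole-v99', '5',
                                    'no_adjust_plot', 'PILCO without adjustment',
                                    transfer_name=None)
    run_transfer_model_and_plot_pos('continuous-cartpole-v99', '5',
                                    'with_adjust_plot', 'PILCO with pi-adjust',
                                    transfer_name='9true_dyn_pi_adj.pkl')

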
def souce_loader(name):
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    env = gym.make('continuous-cartpole-v99')
    pi_adjust = None
    score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
    run = 0
    avg_reward = 0
    while run != 101:
        run += 1
        if run % 20 == 0:
            print('run: ', run)
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            # env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                # print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                avg_reward = avg_reward + step
                break
    avg_reward = avg_reward / run
    env.env.close()
    return avg_reward


def true_loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    with open('9true_dyn_pi_adj.pkl', 'rb') as inp2:
        pi_adjust = pickle.load(inp2)
    # with open('10_pi_adj.pkl', 'rb') as inp2:
    #     good_pi = pickle.load(inp2)
    score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
            state_next, reward, terminal, info = env.step(action + u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
    env.env.close()


def get_RewardFun_and_Controller(self, state_dim, control_dim, data_mean, data_std,
                                 max_yerror, W):
    # t1 = np.divide([max_yerror, 0.0, 0.0, 0.0] - data_mean, data_std)
    # t2 = np.divide([-max_yerror, 0.0, 0.0, 0.0] - data_mean, data_std)
    # R1 = ExponentialReward(state_dim, W, t=t1)
    # R2 = ExponentialReward(state_dim, W, t=t2)
    target = np.divide([0.0, 0.0, 0.0, 0.0] - data_mean, data_std)
    Re = ExponentialReward(state_dim=state_dim, t=target, W=W)
    # R = CombinedRewards(state_dim, [Re, R1, R2], coefs=[2.0, -1.0, -1.0])

    # num_basis_functions: number of RBF basis functions. A larger number is more
    # flexible but increases the computational cost.
    # If v == 4: num_basis_functions = 50
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=50)
    return Re, controller


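# Hedged usage sketch (an assumption, not part of the original file): `cfg` stands
# for an instance of the class defining get_RewardFun_and_Controller above, and
# X, Y for a dataset whose state columns were normalised with data_mean/data_std.
# max_yerror=0.1 and the unit weight matrix are illustrative values only.
def _example_reward_and_controller(cfg, X, Y, max_yerror=0.1):
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    data_mean = X[:, :state_dim].mean(0)
    data_std = X[:, :state_dim].std(0)
    W = np.diag(np.ones(state_dim))
    Re, controller = cfg.get_RewardFun_and_Controller(
        state_dim, control_dim, data_mean, data_std, max_yerror, W)
    return Re, controller  # ready to be passed to a PILCO instance

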
if __name__ == '__main__':
    bf = 10
    max_action = 1.0
    target = np.array([0., 0., 0., 0.])
    weights = np.diag([0.5, 0.1, 0.5, 0.25])
    T = 400
    state_dim = 4
    control_dim = 1

    with tf.Session() as sess:
        env = gym.make('continuous-cartpole-v0')
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=bf, max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-5',
                           controller=controller, reward=R, sparse=False)
        pilco.act = lambda x: pilco.compute_action(x[None, :])[0, :]

        env_s = gym.make('continuous-cartpole-v0')
        env_t = gym.make('continuous-cartpole-v1')
        padjust(env_s, env_t, pilco)


# Initial random rollouts to generate a dataset
X, Y, _ = rollout(env=env, pilco=None, random=True, timesteps=40)
random_rewards = []
for i in range(1, 3):
    X_, Y_, rewards = rollout(env=env, pilco=None, random=True, timesteps=40)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))
    random_rewards.append(sum(rewards))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim

controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=5)
# controller = LinearController(state_dim=state_dim, control_dim=control_dim)

pilco = PILCO(X, Y, controller=controller, horizon=40)
# Example of a user-provided reward function, setting a custom target state
# R = ExponentialReward(state_dim=state_dim, t=np.array([0.1, 0, 0, 0]))
# pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)

# Example of fixing a parameter, optional, for a linear controller only
# pilco.controller.b = np.array([[0.0]])
# pilco.controller.b.trainable = False

for rollouts in range(3):
    pilco.optimize_models()
    pilco.optimize_policy()


def safe_swimmer_run(seed=0, logging=False):
    env = SwimmerWrapper()
    state_dim = 9
    control_dim = 2
    SUBS = 2
    maxiter = 60
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
    S_init = 0.05 * np.eye(state_dim)
    J = 1
    N = 12
    T = 25
    bf = 30
    T_sim = 100

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[0] = 0.5
    max_ang = (100 / 180 * np.pi) * 0.95
    R1 = LinearReward(state_dim, weights_l)

    C1 = SingleConstraint(1, low=-max_ang, high=max_ang, inside=False)
    C2 = SingleConstraint(2, low=-max_ang, high=max_ang, inside=False)
    C3 = SingleConstraint(3, low=-max_ang, high=max_ang, inside=False)
    R = CombinedRewards(state_dim, [R1, C1, C2, C3],
                        coefs=[1.0, -10.0, -10.0, -10.0])
    th = 0.2

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R,
                  m_init=m_init, S_init=S_init)

    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    new_data = True
    eval_runs = T_sim
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=1, restarts=2)

        # Predict the trajectory and the per-step constraint-violation risk over the horizon
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risk1 = np.zeros(T)
        predicted_risk2 = np.zeros(T)
        predicted_risk3 = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risk1[h], _ = C1.compute_reward(m_h, S_h)
            predicted_risk2[h], _ = C2.compute_reward(m_h, S_h)
            predicted_risk3[h], _ = C3.compute_reward(m_h, S_h)
        # Aggregate per-step probabilities assuming independence: 1 - prod(1 - p_h)
        estimate_risk1 = 1 - np.prod(1.0 - predicted_risk1)
        estimate_risk2 = 1 - np.prod(1.0 - predicted_risk2)
        estimate_risk3 = 1 - np.prod(1.0 - predicted_risk3)
        overall_risk = 1 - (1 - estimate_risk1) * (1 - estimate_risk2) * (1 - estimate_risk3)

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS)
            new_data = True
            # Update dataset
            X = np.vstack((X, X_new[:T, :]))
            Y = np.vstack((Y, Y_new[:T, :]))
            pilco.mgpr.set_data((X, Y))
            if estimate_risk1 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 0.75, 1.0, 1.0])
            if estimate_risk2 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 0.75, 1.0])
            if estimate_risk3 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 0.75])
        else:
            print("*********CHANGING***********")
            if estimate_risk1 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.5, 1.0, 1.0])
            if estimate_risk2 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.5, 1.0])
            if estimate_risk3 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 1.5])
        _, _, r = pilco.predict(m_init, S_init, T)


def safe_cars(name='', seed=0, logging=False):
    T = 25
    th = 0.05
    np.random.seed(seed)
    name = name
    J = 5
    N = 8
    eval_runs = 5
    env = LinearCars()

    # Initial random rollouts to generate a dataset
    X1, Y1, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
    for i in range(1, 5):
        X1_, Y1_, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
        X1 = np.vstack((X1, X1_))
        Y1 = np.vstack((Y1, Y1_))

    env = Normalised_Env(env, np.mean(X1[:, :4], 0), np.std(X1[:, :4], 0), gym_env=False)
    X, Y, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    m_init = np.transpose(X[0, :-1, None])
    S_init = 0.1 * np.eye(state_dim)

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=40, max_action=0.2)

    # w1 = np.diag([1.5, 0.001, 0.001, 0.001])
    # t1 = np.divide(np.array([3.0, 1.0, 3.0, 1.0]) - env.m, env.std)
    # R1 = ExponentialReward(state_dim=state_dim, t=t1, W=w1)
    # R1 = LinearReward(state_dim=state_dim, W=np.array([0.1, 0.0, 0.0, 0.0]))
    R1 = LinearReward(state_dim=state_dim, W=np.array([1.0 * env.std[0], 0., 0., 0.]))

    # Transforming the bounds [-1, 1] to the new space after normalisation
    bound_x1 = 1 / env.std[0]
    bound_x2 = 1 / env.std[2]
    B = RiskOfCollision(2,
                        [-bound_x1 - env.m[0] / env.std[0], -bound_x2 - env.m[2] / env.std[2]],
                        [bound_x1 - env.m[0] / env.std[0], bound_x2 - env.m[2] / env.std[2]])

    pilco = SafePILCO((X, Y), controller=controller, mu=-300.0, reward_add=R1,
                      reward_mult=B, horizon=T, m_init=m_init, S_init=S_init)

    # define tolerance
    new_data = True
    # init = tf.global_variables_initializer()
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("***ITERATION**** ", rollouts)
        if new_data:
            pilco.optimize_models(maxiter=100, restarts=2)
            new_data = False
        pilco.optimize_policy(maxiter=20, restarts=5)

        # check safety
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risks = np.zeros(T)
        predicted_rewards = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risks[h], _ = B.compute_reward(m_h, S_h)
            predicted_rewards[h], _ = R1.compute_reward(m_h, S_h)
        overall_risk = 1 - np.prod(1.0 - predicted_risks)

        print("Predicted episode's return: ", sum(predicted_rewards))
        print("Overall risk ", overall_risk)
        print("Mu is ", pilco.mu.numpy())
        print("bound1 ", bound_x1, " bound2 ", bound_x2)
        # print("No of ops:", len(tf.get_default_graph().get_operations()))

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env, pilco=pilco, timesteps=T, verbose=True, render=False)
            new_data = True
            X = np.vstack((X, X_new))
            Y = np.vstack((Y, Y_new))
            pilco.mgpr.set_data((X, Y))
            if overall_risk < (th / 4):
                pilco.mu.assign(0.75 * pilco.mu.numpy())
            if logging:
                for k in range(0, eval_runs):
                    [X_eval_, _,
                     evaluation_returns_sampled[rollouts, k],
                     evaluation_returns_full[rollouts, k]] = rollout(env, pilco, timesteps=T,
                                                                     verbose=False, SUBS=1,
                                                                     render=False)
                    if len(X_eval) == 0:
                        X_eval = X_eval_.copy()
                    else:
                        X_eval = np.vstack((X_eval, X_eval_))
                np.savetxt("results/" + "X_" + str(seed) + ".csv", X, delimiter=',')
                np.savetxt("results/" + "X_eval_" + str(seed) + ".csv", X_eval, delimiter=',')
                np.savetxt("results/" + "evaluation_returns_sampled_" + str(seed) + ".csv",
                           evaluation_returns_sampled, delimiter=',')
                np.savetxt("results/" + "evaluation_returns_full_" + str(seed) + ".csv",
                           evaluation_returns_full, delimiter=',')
        else:
            X_new, Y_new, _, _ = rollout(env, pilco=pilco, timesteps=T, verbose=True, render=False)
            print(m_p[:, 0] - X_new[:, 0])
            print(m_p[:, 2] - X_new[:, 2])
            print("*********CHANGING***********")
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
            # To verify this actually changes, run the reward wrapper before and after
            # on the same trajectory
            pilco.mu.assign(1.5 * pilco.mu.numpy())
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)


def loader(name):
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    for pick in range(1, 11):
        env = gym.make('continuous-cartpole-v99')
        with open(str(pick) + '_pi_adj.pkl', 'rb') as inp2:
            pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
        run = 0
        avg_reward = 0
        while run != 101:
            run += 1
            if run % 20 == 0:
                print('run: ', run)
            state = env.reset()
            # print(state)
            # input()
            step = 0
            while True:
                step += 1
                # env.render()
                u_action = utils.policy(env, pilco, state, False)
                state_copy = state
                a = np.ndarray.tolist(state_copy)
                a.extend(np.ndarray.tolist(u_action))
                action = pi_adjust.predict(np.array(a).reshape(1, -1))
                action = action[0]
                # Clip the adjusted action to the valid range [-1, 1]
                if action[0] > 1:
                    action[0] = 1
                elif action[0] < -1:
                    action[0] = -1
                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state = state_next
                if terminal:
                    # print("Run: " + str(run) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    avg_reward = avg_reward + step
                    break
        avg_reward = avg_reward / run
        env.env.close()
        Rs[0][pick - 1] = avg_reward
    return Rs


def safe_cars(seed=0):
    T = 25
    th = 0.10
    np.random.seed(seed)
    J = 5
    N = 5
    eval_runs = 5
    env = LinearCars()

    # Initial random rollouts to generate a dataset
    X1, Y1, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
    for i in range(1, 5):
        X1_, Y1_, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
        X1 = np.vstack((X1, X1_))
        Y1 = np.vstack((Y1, Y1_))

    env = Normalised_Env(np.mean(X1[:, :4], 0), np.std(X1[:, :4], 0))
    X, Y, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    m_init = np.transpose(X[0, :-1, None])
    S_init = 0.1 * np.eye(state_dim)

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=40, max_action=0.2)

    # w1 = np.diag([1.5, 0.001, 0.001, 0.001])
    # t1 = np.divide(np.array([3.0, 1.0, 3.0, 1.0]) - env.m, env.std)
    # R1 = ExponentialReward(state_dim=state_dim, t=t1, W=w1)
    # R1 = LinearReward(state_dim=state_dim, W=np.array([0.1, 0.0, 0.0, 0.0]))
    R1 = LinearReward(state_dim=state_dim, W=np.array([1.0 * env.std[0], 0., 0., 0.]))

    # Transforming the bounds [-1, 1] to the new space after normalisation
    bound_x1 = 1 / env.std[0]
    bound_x2 = 1 / env.std[2]
    B = RiskOfCollision(2,
                        [-bound_x1 - env.m[0] / env.std[0], -bound_x2 - env.m[2] / env.std[2]],
                        [bound_x1 - env.m[0] / env.std[0], bound_x2 - env.m[2] / env.std[2]])

    pilco = SafePILCO((X, Y), controller=controller, mu=-300.0, reward_add=R1,
                      reward_mult=B, horizon=T, m_init=m_init, S_init=S_init)

    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    # define tolerance
    new_data = True
    # init = tf.global_variables_initializer()
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("***ITERATION**** ", rollouts)
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=20, restarts=2)

        # check safety
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risks = np.zeros(T)
        predicted_rewards = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risks[h], _ = B.compute_reward(m_h, S_h)
            predicted_rewards[h], _ = R1.compute_reward(m_h, S_h)
        overall_risk = 1 - np.prod(1.0 - predicted_risks)

        print("Predicted episode's return: ", sum(predicted_rewards))
        print("Overall risk ", overall_risk)
        print("Mu is ", pilco.mu.numpy())
        print("bound1 ", bound_x1, " bound2 ", bound_x2)

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env, pilco=pilco, timesteps=T, verbose=True, render=False)
            new_data = True
            X = np.vstack((X, X_new))
            Y = np.vstack((Y, Y_new))
            pilco.mgpr.set_data((X, Y))
            if overall_risk < (th / 4):
                pilco.mu.assign(0.75 * pilco.mu.numpy())
        else:
            X_new, Y_new, _, _ = rollout(env, pilco=pilco, timesteps=T, verbose=True, render=False)
            print(m_p[:, 0] - X_new[:, 0])
            print(m_p[:, 2] - X_new[:, 2])
            print("*********CHANGING***********")
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
            # To verify this actually changes, run the reward wrapper before and after
            # on the same trajectory
            pilco.mu.assign(1.5 * pilco.mu.numpy())
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)


def swimmer_run(name, seed):
    env = gym.make('Swimmer-v2').env
    # env = SwimmerWrapper()
    state_dim = 8
    control_dim = 2
    SUBS = 5
    maxiter = 120
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
    S_init = 0.005 * np.eye(state_dim)
    J = 10
    N = 15
    T = 15
    bf = 40
    T_sim = 50

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[3] = 1.0
    max_ang = 95 / 180 * np.pi

    t1 = np.zeros(state_dim)
    t1[2] = max_ang
    w1 = 1e-6 * np.eye(state_dim)
    w1[2, 2] = 10

    t2 = np.zeros(state_dim)
    t2[1] = max_ang
    w2 = 1e-6 * np.eye(state_dim)
    w2[1, 1] = 10

    t5 = np.zeros(state_dim)
    # t5[0] = max_ang
    # w3 = 1e-6 * np.eye(state_dim)
    # w3[0, 0] = 5

    t3 = np.zeros(state_dim)
    t3[2] = -max_ang
    t4 = np.zeros(state_dim)
    t4[1] = -max_ang
    # t6 = np.zeros(state_dim); t6[0] = -max_ang

    R2 = LinearReward(state_dim, weights_l)
    R3 = ExponentialReward(state_dim, W=w1, t=t1)
    R4 = ExponentialReward(state_dim, W=w2, t=t2)
    R5 = ExponentialReward(state_dim, W=w1, t=t3)
    R6 = ExponentialReward(state_dim, W=w2, t=t4)
    # R7 = ExponentialReward(state_dim, W=w3, t=t5)
    # R8 = ExponentialReward(state_dim, W=w3, t=t6)
    Rew = CombinedRewards(state_dim, [R2, R3, R4, R5, R6],
                          coefs=[1.0, -1.0, -1.0, -1.0, -1.0])
    # Rew = R2

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                         verbose=True, render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS,
                               verbose=True, render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim,
    #                               max_action=max_action)

    pilco = PILCO((X, Y), controller=controller, horizon=T, reward=Rew,
                  m_init=m_init, S_init=S_init)
    # for model in pilco.mgpr.models:
    #     model.likelihood.variance = 0.0001
    #     model.likelihood.variance.trainable = False

    logging = True  # To save results in .csv files, turn this flag to True
    eval_runs = 10
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    eval_max_timesteps = 1000 // SUBS
    X_eval = False

    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True,
                                     SUBS=SUBS, render=False)

        cur_rew = 0
        for t in range(0, len(X_new)):
            cur_rew += Rew.compute_reward(X_new[t, 0:state_dim, None].transpose(),
                                          0.0001 * np.eye(state_dim))[0]
            if t == T:
                print('On this episode, on the planning horizon, PILCO reward was: ', cur_rew)
        print('On this episode PILCO reward was ', cur_rew)

        gym_steps = 1000
        T_eval = gym_steps // SUBS

        # Update dataset
        X = np.vstack((X, X_new[:T, :]))
        Y = np.vstack((Y, Y_new[:T, :]))
        pilco.mgpr.set_data((X, Y))

        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [X_eval_, _,
                 evaluation_returns_sampled[rollouts, k],
                 evaluation_returns_full[rollouts, k]] = rollout(env, pilco,
                                                                 timesteps=eval_max_timesteps,
                                                                 verbose=False, SUBS=SUBS,
                                                                 render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists(name):
                os.makedirs(name)
            np.savetxt(name + "X_" + str(seed) + ".csv", X, delimiter=',')
            np.savetxt(name + "X_eval_" + str(seed) + ".csv", X_eval, delimiter=',')
            np.savetxt(name + "evaluation_returns_sampled_" + str(seed) + ".csv",
                       evaluation_returns_sampled, delimiter=',')
            np.savetxt(name + "evaluation_returns_full_" + str(seed) + ".csv",
                       evaluation_returns_full, delimiter=',')


X_, Y_ = rollout(env=env, pilco=None, random=True, timesteps=80, SUBS=subs, render=False)
X = np.vstack((X, X_)).astype(np.float64)
Y = np.vstack((Y, Y_)).astype(np.float64)

# Gaussian/Normal distribution action sampling
for i in range(1, 24):
    X_, Y_ = rollout(env=env, pilco=None, random="Normal", timesteps=80, SUBS=subs, render=False)
    X = np.vstack((X, X_)).astype(np.float64)
    Y = np.vstack((Y, Y_)).astype(np.float64)

# No action sampling; u := 0
for i in range(1, 4):
    X_, Y_ = rollout(env=env, pilco=None, random=None, timesteps=80, SUBS=subs, render=False)
    X = np.vstack((X, X_)).astype(np.float64)
    Y = np.vstack((Y, Y_)).astype(np.float64)

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim

controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                           num_basis_functions=18, max_action=env.action_space.high)
# controller = LinearController(state_dim=state_dim, control_dim=control_dim)
# print(X)
# pilco = PILCO(X, Y, controller=controller, horizon=40, num_induced_points=72)

# Example of a user-provided reward function, setting a custom target state
R = ExponentialReward(state_dim=state_dim, t=target)
pilco = PILCO(X, Y, controller=controller, horizon=20, reward=R,
              num_induced_points=72, m_init=m_init)

# Example of fixing a parameter, optional, for a linear controller only
# pilco.controller.b = np.array([[0.0]])
# pilco.controller.b.trainable = False

plot_model = False  # Plot model set to False until iteration 5 is reached

for rollouts in range(20):
    print("Optimizing models")


def pilco_run(env, N, J, safe=False, name='', seed=0, cont=None, rew=None, SUBS=1,
              sim_timesteps=50, plan_timesteps=30, restarts=1, maxiter=100,
              m_init=None, S_init=None, fixed_noise=None, logging=False,
              eval_runs=5, eval_max_timesteps=None, variable_episode_length=False):
    np.random.seed(seed)

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env=env, pilco=None, timesteps=sim_timesteps, random=True, SUBS=SUBS)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env=env, pilco=None, timesteps=sim_timesteps, random=True, SUBS=SUBS)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    if cont is None:
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=5)
    elif cont['type'] == 'rbf':
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=cont['basis_functions'],
                                   max_action=cont.get('max_action', 1.0))
    elif cont['type'] == 'linear':
        controller = LinearController(state_dim=state_dim, control_dim=control_dim,
                                      max_action=cont.get('max_action', 1.0))
    else:
        raise ValueError('Invalid Controller')

    if rew is None:
        reward = None
    elif rew['type'] == 'exp':
        reward = ExponentialReward(state_dim=state_dim, t=rew['t'], W=rew['W'])
    else:
        raise ValueError('This function only handles Exponential rewards for now')

    pilco = PILCO((X, Y), controller=controller, reward=reward, horizon=plan_timesteps,
                  m_init=m_init, S_init=S_init)

    if fixed_noise is not None:
        for model in pilco.mgpr.models:
            model.likelihood.variance.assign(fixed_noise)
            set_trainable(model.likelihood.variance, False)

    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    if name == '':
        from datetime import datetime
        current_time = datetime.now()
        name = current_time.strftime("%d_%m_%Y_%H_%M_%S")

    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models()
        pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

        X_new, Y_new, _, _ = rollout(env, pilco, timesteps=sim_timesteps, SUBS=SUBS, verbose=True)

        cur_rew = 0
        # Update dataset
        X = np.vstack((X, X_new[:plan_timesteps, :]))
        Y = np.vstack((Y, Y_new[:plan_timesteps, :]))
        pilco.mgpr.set_data((X, Y))

        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [X_eval_, _,
                 evaluation_returns_sampled[rollouts, k],
                 evaluation_returns_full[rollouts, k]] = rollout(env, pilco,
                                                                 timesteps=eval_max_timesteps,
                                                                 verbose=False, SUBS=SUBS,
                                                                 render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists("results/" + name):
                os.makedirs("results/" + name)
            np.savetxt("results/" + name + "X_" + str(seed) + ".csv", X, delimiter=',')
            np.savetxt("results/" + name + "X_eval_" + str(seed) + ".csv", X_eval, delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_sampled_" + str(seed) + ".csv",
                       evaluation_returns_sampled, delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_full_" + str(seed) + ".csv",
                       evaluation_returns_full, delimiter=',')


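# Hedged usage sketch (an assumption, not part of the original file): a minimal
# call to pilco_run with an RBF controller and an exponential reward. The gym id
# and the target/weight values mirror those used elsewhere in this repository;
# the dict keys ('type', 'basis_functions', 'max_action', 't', 'W') match those
# read by pilco_run above.
def _example_pilco_run():
    env = gym.make('continuous-cartpole-v0')
    cont = {'type': 'rbf', 'basis_functions': 30, 'max_action': 1.0}
    rew = {'type': 'exp',
           't': np.array([0., 0., 0., 0.]),
           'W': np.diag([0.5, 0.1, 0.5, 0.25])}
    pilco_run(env, N=8, J=3, cont=cont, rew=rew, SUBS=1,
              sim_timesteps=50, plan_timesteps=30, logging=True)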