def plot_pilco_source_learning_curve():
    """Evaluate each saved PILCO checkpoint on the source cart-pole env and plot results.

    For every checkpoint ('initial', '0'..'5') the policy is rolled out once
    until the episode terminates; a per-model state trajectory plot and a final
    learning-curve plot (episode length vs. checkpoint index) are saved to disk.

    Returns:
        (rewards, xs, angles): list of episode lengths per checkpoint, plus the
        state[0]/state[2] traces of the LAST rollout only (xs/angles are
        overwritten each iteration).

    Relies on module-level globals: state_dim, control_dim, bf, max_action,
    target, weights.
    """
    env = gym.make('continuous-cartpole-v0')
    env.seed(73)
    # 'initial' is the untrained model; '0'..'5' are successive training iterations.
    pilcos = ['initial'] + [str(i) for i in range(6)]
    rewards = []
    for i, p in enumerate(pilcos):
        # A fresh controller/reward pair must be built so load_pilco can
        # restore the saved parameters into them.
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=bf, max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(p),
                           controller=controller, reward=R, sparse=False)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        xs = []
        angles = []
        while True:
            # state[0]/state[2] are plotted; presumably cart position and pole
            # angle — TODO confirm against the env's observation layout.
            xs.append(state[0])
            angles.append(state[2])
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            # Build state+action vector; computed but unused in this function.
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        # Episode length is used as the score for this checkpoint.
        rewards.append(step)
        plt.plot(xs, angles)
        plt.savefig('pilco-{:d}_states_plot'.format(i), bbox_inches="tight")
        plt.close()
    env.close()
    plt.plot([i for i, _ in enumerate(pilcos)], rewards)
    plt.savefig('pilco_rewards_plot', bbox_inches="tight")
    plt.close()
    return rewards, xs, angles
def loader(name):
    """Load a saved PILCO cart-pole policy and run it on the target env forever.

    Renders each step and logs the episode length on termination. NOTE(review):
    the outer `while True` never breaks, so this loops until interrupted and
    the trailing `env.env.close()` is unreachable.

    Relies on module-level globals: state_dim, control_dim, bf, max_action,
    target, weights.
    """
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    # Fresh controller/reward so load_pilco can restore saved parameters.
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
    # observation_space = env.observation_space.shape[0]
    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            #TODO RUN PI ADJUST
            action = utils.policy(env, pilco, state, False)
            # TODO RUN PI ADJUST COMMENT THE NEXT LINE
            state_next, reward, terminal, info = env.step(action)
            # reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
    # Unreachable: the outer loop above never terminates.
    env.env.close()
def get_RewardFun_and_Controller(self, state_dim, control_dim, data_mean,
                                 data_std, max_yerror, W):
    """Build the combined reward and the RBF controller for a 4-D state task.

    The reward is a weighted mix of one attractive exponential reward at the
    (normalised) origin and two repulsive ones at +/- max_yerror on the first
    state dimension, combined with coefficients [2, -1, -1].

    Returns:
        (reward, controller): CombinedRewards instance and RbfController.
    """
    def to_normalised(raw):
        # Map a raw-space target into the normalised state space used by PILCO.
        return np.divide(raw - data_mean, data_std)

    # Penalty targets at the positive/negative error bound on dimension 0.
    upper_penalty = ExponentialReward(
        state_dim, W, t=to_normalised(np.array([max_yerror, 0.0, 0.0, 0.0])))
    lower_penalty = ExponentialReward(
        state_dim, W, t=to_normalised(np.array([-max_yerror, 0.0, 0.0, 0.0])))
    # Attractive reward centred on the normalised origin.
    goal_reward = ExponentialReward(
        state_dim=state_dim,
        t=to_normalised(np.array([0.0, 0.0, 0.0, 0.0])), W=W)
    combined = CombinedRewards(state_dim, [goal_reward, upper_penalty, lower_penalty],
                               coefs=[2, -1, -1])
    # num_basis_functions: number of RBFunctions; a large number of functions
    # is more flexible but increases computational demands.
    # If v==4: num_basis_functions=50
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=30)
    return combined, controller
def load_and_run_model(env, name):
    """Restore the saved PILCO model `name` and perform a single rollout on env.

    Relies on module-level globals: state_dim, control_dim, bf, max_action,
    target, weights, T_sim, SUBS.
    """
    # Fresh controller/reward pair: load_pilco restores the saved parameters
    # into these objects.
    rbf_policy = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    exp_reward = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    restored = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                          controller=rbf_policy,
                          reward=exp_reward,
                          sparse=False)
    print('Running {:s}'.format(name))
    rollout(env, restored, timesteps=T_sim, verbose=False, SUBS=SUBS)
def piadjust(NT, name):
    """Learn an additive policy-adjustment for transferring a PILCO policy.

    Iteratively (NT rounds): samples trajectories in the target env, fits an
    inverse-dynamics model on the accumulated target data, uses it to predict
    the target-equivalent action for each source sample, and fits an adjustment
    policy pi_adj on the (state, source_action, action_delta) tuples via L3.
    Each round's pi_adj is pickled to transfer-save/.

    Args:
        NT: number of adjustment iterations.
        name: suffix of the saved PILCO model to transfer.

    Returns:
        The final adjustment policy pi_adj.

    Relies on module-level globals: state_dim, control_dim, bf, max_action,
    target, weights.
    """
    # Fresh controller/reward so load_pilco can restore saved parameters.
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    # Source env (where the policy was trained) and target env (to adapt to).
    env_S = gym.make('continuous-cartpole-v0')
    env_S.seed(73)
    env_T = gym.make('continuous-cartpole-v99')
    env_T.seed(73)
    # Source dataset is collected once and reused every iteration.
    D_S = sampler(pilco, env_S, samples_n=30, trials=50)
    # print('D_S sampling done')
    D_T = None
    pi_adj = pilco  # first iteration samples with the unadjusted policy
    for i in range(NT):
        print('{:d}/{:d}'.format(i + 1, NT))
        D_adj = []
        if i == 0:
            D_i_T = sampler(pilco, env_T, samples_n=30)
        elif i != 0:
            # Later rounds sample with the current adjustment applied.
            D_i_T = sampler_adj(pi_adj, pilco, env_T, 30)
        # Accumulate target-domain data across iterations (newest first).
        if D_T is not None:
            D_T = np.concatenate((D_i_T, D_T))
        elif D_T is None:
            D_T = D_i_T
        # print('Going for inverse dyn')
        gpr = inverse_dyn(D_T)
        # print('inverse dyn done')
        for samp in D_S:
            # samp is (state, action, next_state); presumably 4-D states since
            # state+next_state are reshaped to 8 features — TODO confirm.
            x_s = list(samp[0])
            x_s1 = list(samp[2])
            u_t_S = samp[1]
            a = np.array(x_s + x_s1).reshape(1, 8)
            # Action the inverse-dynamics model says the TARGET env needs for
            # the same transition; train on the delta vs. the source action.
            u_t_T = gpr.predict(a, return_std=False)
            D_adj.append((x_s, u_t_S, u_t_T - u_t_S))
        # print('Going for L3')
        pi_adj = L3(D_adj)
        # print('L3 Done')
        # i = i + 1
        # if (i % 1 == 0):
        save_object(pi_adj,
                    'transfer-save/pilco-{:s}-transfer-{:d}.pkl'.format(name, i))
    env_S.env.close()
    env_T.env.close()
    return pi_adj
def see_progression(pilco_name='saved/pilco-continuous-cartpole-5',
                    transfer_name='{:d}true_dyn_pi_adj.pkl', adjust=True):
    """Run the PILCO policy (+ optional per-iteration adjustment) on the target env.

    For i in 0..9, loads the i-th pickled adjustment model (when adjust=True),
    rolls out one episode with action = u_pilco + pi_adjust(state, u_pilco),
    and records the episode length.

    Args:
        pilco_name: path of the saved PILCO model.
        transfer_name: format string for the per-iteration adjustment pickles.
        adjust: if False, runs the raw PILCO policy (adjustment term = 0).

    Returns:
        List of episode lengths, one per adjustment iteration.

    Relies on module-level globals: state_dim, control_dim, bf, max_action,
    target, weights.
    """
    env = gym.make('continuous-cartpole-v99')
    env.seed(1)
    # Fresh controller/reward so load_pilco can restore saved parameters.
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco(pilco_name, controller=controller, reward=R, sparse=False)
    rewards = []
    for i in range(10):
        print('Running {:s}'.format(transfer_name.format(i)))
        if adjust:
            with open(transfer_name.format(i), 'rb') as inp2:
                pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            # Adjustment model input is the concatenated [state, action] vector.
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            if adjust:
                pi_adjust_action = pi_adjust.predict(
                    np.array(a).reshape(1, -1))[0]
            else:
                pi_adjust_action = 0  # ENABLE THIS TO SEE IT RUN WITHOUT THE ADJUSTMENT
            state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        # Episode length serves as the score for this iteration.
        rewards.append(step)
    env.close()
    return rewards
def test_reward():
    """Cross-check ExponentialReward.compute_reward against the Octave reward.m."""
    state_dim = 2
    # Random Gaussian state: mean vector plus an SPD covariance built as A A^T.
    mean = np.random.rand(1, state_dim)
    factor = np.random.rand(state_dim, state_dim)
    cov = factor.dot(factor.T)
    reward = ExponentialReward(state_dim)
    # Pull the reward's own weight matrix and target so both implementations
    # are evaluated with identical parameters.
    weight = reward.W.numpy()
    target_vec = reward.t.numpy()
    mean_py, var_py = reward.compute_reward(mean, cov)
    mean_oct, _, _, var_oct = octave.reward(mean.T, cov, target_vec.T, weight,
                                            nout=4)
    np.testing.assert_allclose(mean_py, mean_oct)
    np.testing.assert_allclose(var_py, var_oct)
def run_transfer_model_and_plot_pos(env_name, pilco_name, fig_file_name,
                                    fig_title, transfer_name=None, save=True):
    """Roll out a saved PILCO policy (optionally with an adjustment model) and plot
    the position/angle trajectory as a quiver diagram.

    Args:
        env_name: gym env id to run.
        pilco_name: suffix of the saved PILCO model.
        fig_file_name: output path for the figure (used when save=True).
        fig_title: title drawn on the plot.
        transfer_name: optional pickle path of a pi-adjust model whose output
            is added to the PILCO action each step.
        save: if True, the figure is written to disk; it is closed either way.

    Relies on module-level globals: state_dim, control_dim, bf, max_action,
    target, weights.
    """
    env = gym.make(env_name)
    env.seed(1)
    # Fresh controller/reward so load_pilco can restore saved parameters.
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(pilco_name),
                       controller=controller, reward=R, sparse=False)
    if transfer_name is not None:
        with open(transfer_name, 'rb') as inp2:
            pi_adjust = pickle.load(inp2)
    xs = []
    angles = []
    state = env.reset()
    # Cap the rollout at 1000 steps even if the episode never terminates.
    for _ in range(1000):
        # state[0] is plotted as 'position', state[2] as 'angle' (see axis
        # labels below).
        xs.append(state[0])
        angles.append(state[2])
        env.render()
        u_action = policy(env, pilco, state, False)
        # Adjustment model input is the concatenated [state, action] vector.
        state_copy = state
        a = np.ndarray.tolist(state_copy)
        a.extend(np.ndarray.tolist(u_action))
        if transfer_name is not None:
            pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
        else:
            pi_adjust_action = 0
        state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
        state = state_next
        if terminal:
            break
    env.close()
    xs = np.array(xs)
    angles = np.array(angles)
    plt.plot(xs, angles)
    # Arrows show the step-to-step motion through (position, angle) space.
    plt.quiver(xs[:-1], angles[:-1], xs[1:] - xs[:-1], angles[1:] - angles[:-1],
               scale_units='xy', angles='xy', scale=1, color='blue', width=1e-2)
    plt.xlabel('position')
    plt.ylabel('angle')
    plt.title(fig_title)
    plt.xlim(-0.2, 0.2)
    plt.ylim(-0.2, 0.2)
    if save:
        plt.savefig(fig_file_name, bbox_inches="tight")
    plt.close()
def test_reward():
    """Cross-check the torch ExponentialReward against the Octave reference reward.m.

    Draws a random mean and an SPD covariance, evaluates the reward moments on
    the GPU, and compares them to octave.reward (the MATLAB/Octave PILCO
    implementation) with allclose tolerance.
    """
    k = 2  # state dim
    m = np.random.rand(1, k)
    s = np.random.randn(k, k)
    s = s.dot(s.T)  # symmetrize so s is a valid (PSD) covariance
    reward = ExponentialReward(k)
    # Extract the reward's own parameters so both implementations use
    # identical W and t.
    W = reward.W.data.cpu().numpy()
    t = reward.t.data.cpu().numpy()
    M, S = reward.compute_reward(
        torch.tensor(m).float().cuda(),
        torch.tensor(s).float().cuda())
    M_mat, _, _, S_mat = octave.reward(m.T, s, t.T, W, nout=4)
    # Fix: removed a leftover debugging breakpoint (`import pdb;
    # pdb.set_trace()`) that halted every run before the assertions executed.
    np.testing.assert_allclose(M.cpu().numpy(), M_mat)
    np.testing.assert_allclose(S.cpu().numpy(), S_mat)
def souce_loader(name):
    """Evaluate a saved PILCO policy on the target env over 101 episodes.

    Runs the unadjusted source policy (no pi-adjust term) and returns the mean
    episode length. Note: the function name keeps its original (misspelled)
    form for caller compatibility; `Rs` is allocated but never used.

    Relies on module-level globals: state_dim, control_dim, bf, max_action,
    target, weights.
    """
    Rs = np.empty(10).reshape(1, 10)  # unused
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    # Fresh controller/reward so load_pilco can restore saved parameters.
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    # NOTE(review): env is re-created here, discarding the seeded instance above.
    env = gym.make('continuous-cartpole-v99')
    pi_adjust = None
    score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
    run = 0
    avg_reward = 0
    while run != 101:
        run += 1
        if (run % 20 == 0):
            print('run: ', run)
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            # env.render()
            # TODO RUN PI ADJUST
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            # TODO RUN PI ADJUST COMMENT THE NEXT LINE
            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                # print("Run: " + ", score: " + str(step))
                score_logger.add_score(step, run)
                # Accumulate episode length; averaged after the outer loop.
                avg_reward = avg_reward + step
                break
    # run == 101 here, so this is the mean over all 101 episodes.
    avg_reward = avg_reward / run
    env.env.close()
    return (avg_reward)
def true_loader(name):
    """Run a saved PILCO policy plus the '9true_dyn' adjustment model forever.

    Each step applies action = pi_adjust(state, u_pilco) + u_pilco and renders.
    NOTE(review): the outer `while True` never breaks, so this loops until
    interrupted and the trailing `env.env.close()` is unreachable.

    Relies on module-level globals: state_dim, control_dim, bf, max_action,
    target, weights.
    """
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    # Fresh controller/reward so load_pilco can restore saved parameters.
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    # Adjustment model learned against the true target dynamics (iteration 9).
    with open('9true_dyn_pi_adj.pkl', 'rb') as inp2:
        pi_adjust = pickle.load(inp2)
    # with open('10_pi_adj.pkl', 'rb') as inp2:
    #     good_pi = pickle.load(inp2)
    score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            # Adjustment model input is the concatenated [state, action] vector.
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
            state_next, reward, terminal, info = env.step(action + u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
    # Unreachable: the outer loop above never terminates.
    env.env.close()
timesteps=T, random=True, SUBS=SUBS, verbose=True, render=True) X = np.vstack((X, X_)) Y = np.vstack((Y, Y_)) state_dim = Y.shape[1] control_dim = X.shape[1] - state_dim controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action) R = ExponentialReward(state_dim=state_dim, t=target, W=weights) pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R, m_init=m_init, S_init=S_init) # for numerical stability, we can set the likelihood variance parameters of the GP models for model in pilco.mgpr.models: model.likelihood.variance.assign(0.001) set_trainable(model.likelihood.variance, False) r_new = np.zeros((T, 1)) for rollouts in range(N):
def swimmer_run(name, seed): env = gym.make('Swimmer-v2').env #env = SwimmerWrapper() state_dim = 8 control_dim = 2 SUBS = 5 maxiter = 120 max_action = 1.0 m_init = np.reshape(np.zeros(state_dim), (1, state_dim)) # initial state mean S_init = 0.005 * np.eye(state_dim) J = 10 N = 15 T = 15 bf = 40 T_sim = 50 # Reward function that dicourages the joints from hitting their max angles weights_l = np.zeros(state_dim) weights_l[3] = 1.0 max_ang = 95 / 180 * np.pi t1 = np.zeros(state_dim) t1[2] = max_ang w1 = 1e-6 * np.eye(state_dim) w1[2, 2] = 10 t2 = np.zeros(state_dim) t2[1] = max_ang w2 = 1e-6 * np.eye(state_dim) w2[1, 1] = 10 t5 = np.zeros(state_dim) #t5[0] = max_ang #w3 = 1e-6 * np.eye(state_dim) #w3[0,0] = 5 t3 = np.zeros(state_dim) t3[2] = -max_ang t4 = np.zeros(state_dim) t4[1] = -max_ang #t6 = np.zeros(state_dim); t6[0] = -max_ang R2 = LinearReward(state_dim, weights_l) R3 = ExponentialReward(state_dim, W=w1, t=t1) R4 = ExponentialReward(state_dim, W=w2, t=t2) R5 = ExponentialReward(state_dim, W=w1, t=t3) R6 = ExponentialReward(state_dim, W=w2, t=t4) #R7 = ExponentialReward(state_dim, W=w3, t=t5) #R8 = ExponentialReward(state_dim, W=w3, t=t6) Rew = CombinedRewards(state_dim, [R2, R3, R4, R5, R6], coefs=[1.0, -1.0, -1.0, -1.0, -1.0]) # Rew = R2 # Initial random rollouts to generate a dataset X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True, render=False) for i in range(1, J): X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True, render=False) X = np.vstack((X, X_)) Y = np.vstack((Y, Y_)) state_dim = Y.shape[1] control_dim = X.shape[1] - state_dim controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action) # controller = LinearController(state_dim=state_dim, control_dim=control_dim, max_action=max_action) pilco = PILCO((X, Y), controller=controller, horizon=T, reward=Rew, m_init=m_init, S_init=S_init) #for model in pilco.mgpr.models: # 
model.likelihood.variance = 0.0001 # model.likelihood.variance.trainable = False logging = True # To save results in .csv files turn this flag to True eval_runs = 10 evaluation_returns_full = np.zeros((N, eval_runs)) evaluation_returns_sampled = np.zeros((N, eval_runs)) eval_max_timesteps = 1000 // SUBS X_eval = False for rollouts in range(N): print("**** ITERATION no", rollouts, " ****") pilco.optimize_models(restarts=2) pilco.optimize_policy(maxiter=maxiter, restarts=2) X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS, render=False) cur_rew = 0 for t in range(0, len(X_new)): cur_rew += Rew.compute_reward( X_new[t, 0:state_dim, None].transpose(), 0.0001 * np.eye(state_dim))[0] if t == T: print( 'On this episode, on the planning horizon, PILCO reward was: ', cur_rew) print('On this episode PILCO reward was ', cur_rew) gym_steps = 1000 T_eval = gym_steps // SUBS # Update dataset X = np.vstack((X, X_new[:T, :])) Y = np.vstack((Y, Y_new[:T, :])) pilco.mgpr.set_data((X, Y)) if logging: if eval_max_timesteps is None: eval_max_timesteps = sim_timesteps for k in range(0, eval_runs): [ X_eval_, _, evaluation_returns_sampled[rollouts, k], evaluation_returns_full[rollouts, k] ] = rollout(env, pilco, timesteps=eval_max_timesteps, verbose=False, SUBS=SUBS, render=False) if rollouts == 0: X_eval = X_eval_.copy() else: X_eval = np.vstack((X_eval, X_eval_)) if not os.path.exists(name): os.makedirs(name) np.savetxt(name + "X_" + seed + ".csv", X, delimiter=',') np.savetxt(name + "X_eval_" + seed + ".csv", X_eval, delimiter=',') np.savetxt(name + "evaluation_returns_sampled_" + seed + ".csv", evaluation_returns_sampled, delimiter=',') np.savetxt(name + "evaluation_returns_full_" + seed + ".csv", evaluation_returns_full, delimiter=',')
X = np.vstack((X, X_)) Y = np.vstack((Y, Y_)) print('experience collected. ') state_dim = Y.shape[1] control_dim = X.shape[1] - state_dim controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=5) #controller = LinearController(state_dim=state_dim, control_dim=control_dim) # pilco = PILCO(X, Y, controller=controller, horizon=40) # Example of user provided reward function, setting a custom target state R = ExponentialReward(state_dim=state_dim, t=np.array([0.1, 0, 0, 0])) pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R) # Example of fixing a parameter, optional, for a linear controller only #pilco.controller.b = np.array([[0.0]]) #pilco.controller.b.trainable = False for rollouts in range(1): print('optimizing models') pilco.optimize_models() print('optimizing policy') pilco.optimize_policy() # import pdb; pdb.set_trace() X_new, Y_new = rollout(env=env, pilco=pilco, timesteps=100,
max_action = 1.0 m_init = np.reshape(np.zeros(state_dim), (1, state_dim)) # initial state mean S_init = 0.005 * np.eye(state_dim) J = 10 N = 15 T = 15 bf = 40 T_sim = 50 # Reward function that dicourages the joints from hitting their max angles max_ang = 95 / 180 * np.pi rewards = [] rewards.append(LinearReward(state_dim, [0, 0, 0, 1.0, 0, 0, 0, 0])) rewards.append( ExponentialReward(state_dim, W=np.diag(np.array([0, 0, 10, 0, 0, 0, 0, 0]) + 1e-6), t=[0, 0, max_ang, 0, 0, 0, 0, 0])) rewards.append( ExponentialReward(state_dim, W=np.diag(np.array([0, 0, 10, 0, 0, 0, 0, 0]) + 1e-6), t=[0, 0, -max_ang, 0, 0, 0, 0, 0])) rewards.append( ExponentialReward(state_dim, W=np.diag(np.array([0, 10, 0, 0, 0, 0, 0, 0]) + 1e-6), t=[0, max_ang, 0, 0, 0, 0, 0, 0])) rewards.append( ExponentialReward(state_dim, W=np.diag(np.array([0, 10, 0, 0, 0, 0, 0, 0]) + 1e-6), t=[0, -max_ang, 0, 0, 0, 0, 0, 0])) combined_reward = CombinedRewards(state_dim, rewards,
np.std(X1[:, :2], 0)) X = np.zeros(X1.shape) X[:, :2] = np.divide(X1[:, :2] - np.mean(X1[:, :2], 0), np.std(X1[:, :2], 0)) X[:, 2] = X1[:, -1] # control inputs are not normalised Y = np.divide(Y1, np.std(X1[:, :2], 0)) state_dim = Y.shape[1] control_dim = X.shape[1] - state_dim m_init = np.transpose(X[0, :-1, None]) S_init = 0.5 * np.eye(state_dim) controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=25) R = ExponentialReward(state_dim=state_dim, t=np.divide([0.5, 0.0] - env.m, env.std), W=np.diag([0.5, 0.1])) pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R, m_init=m_init, S_init=S_init) best_r = 0 all_Rs = np.zeros((X.shape[0], 1)) for i in range(len(all_Rs)): all_Rs[i, 0] = R.compute_reward(X[i, None, :-1], 0.001 * np.eye(state_dim))[0] ep_rewards = np.zeros((len(X) // T, 1))
X = np.vstack((X, X_)) Y = np.vstack((Y, Y_)) state_dim = Y.shape[1] control_dim = X.shape[1] - state_dim controller = LinearController(state_dim=state_dim, control_dim=control_dim, max_action=1.0) if safe: if experim_num == 1: C1 = SingleConstraint(0, high=max_temp, inside=False) else: C1 = SingleConstraint(1, high=max_temp, inside=False) R1 = ExponentialReward(state_dim=state_dim, t=targets) R = CombinedRewards(state_dim, [R1, C1], coefs=[1.0, -15.0]) else: R1 = ExponentialReward(state_dim=state_dim, t=targets, W=weights) t2 = targets.copy() t2[1] = 20.5 w2 = weights.copy() w2[1, 1] = 4.0 R2 = ExponentialReward(state_dim=state_dim, t=t2, W=w2) R = CombinedRewards(state_dim, [R1, R2], coefs=[1.0, -2.0]) pilco = SafePILCO((X, Y), controller=controller, horizon=T, reward_add=R1, reward_mult=C1,
def loader(name):
    """Evaluate each of the 10 pickled pi-adjust models over 101 episodes apiece.

    For pick in 1..10, loads `<pick>_pi_adj.pkl`, runs 101 episodes in which
    the adjustment model's (clipped) output replaces the PILCO action, and
    stores the mean episode length in Rs[0][pick-1].

    Returns:
        Rs: 1x10 array of mean episode lengths per adjustment model.

    Relies on module-level globals: state_dim, control_dim, bf, max_action,
    target, weights.
    """
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    # Fresh controller/reward so load_pilco can restore saved parameters.
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    for pick in range(1, 11):
        # NOTE(review): env is re-created unseeded here, discarding the seeded
        # instance above — runs are therefore not reproducible per model.
        env = gym.make('continuous-cartpole-v99')
        with open(str(pick) + '_pi_adj.pkl', 'rb') as inp2:
            pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
        run = 0
        avg_reward = 0
        while run != 101:
            run += 1
            if (run % 20 == 0):
                print('run: ', run)
            state = env.reset()
            # print(state)
            # input()
            step = 0
            while True:
                step += 1
                #env.render()
                #TODO RUN PI ADJUST
                u_action = utils.policy(env, pilco, state, False)
                # Adjustment input is the concatenated [state, action] vector.
                state_copy = state
                a = np.ndarray.tolist(state_copy)
                a.extend(np.ndarray.tolist(u_action))
                action = pi_adjust.predict(np.array(a).reshape(1, -1))
                action = action[0]
                # Clip the adjusted action to the [-1, 1] actuation range.
                if action[0] > 1:
                    action[0] = 1
                elif action[0] < -1:
                    action[0] = -1
                # TODO RUN PI ADJUST COMMENT THE NEXT LINE
                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state = state_next
                if terminal:
                    # print("Run: " + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    avg_reward = avg_reward + step
                    break
        # run == 101 here: mean episode length over all 101 episodes.
        avg_reward = avg_reward / run
        env.env.close()
        Rs[0][pick - 1] = avg_reward
    return (Rs)
def pilco_run(env, N, J, safe=False, name='', seed=0, cont=None, rew=None,
              SUBS=1, sim_timesteps=50, plan_timesteps=30, restarts=1,
              maxiter=100, m_init=None, S_init=None, fixed_noise=None,
              logging=False, eval_runs=5, eval_max_timesteps=None,
              variable_episode_length=False):
    """Generic PILCO training loop with optional CSV logging.

    Collects J random rollouts, builds a controller/reward from the `cont`
    and `rew` config dicts, then runs N iterations of model optimisation,
    policy optimisation, and an on-policy rollout, optionally evaluating and
    saving results under results/<name>.

    Args:
        env: gym-style environment.
        N: number of PILCO iterations.
        J: number of initial random rollouts.
        name: results subdirectory; a timestamp is used when empty.
        seed: numpy RNG seed, also embedded in the CSV file names.
        cont: controller config, {'type': 'rbf'|'linear', ...}; RBF with 5
            basis functions when None.
        rew: reward config, {'type': 'exp', 't': ..., 'W': ...}; None lets
            PILCO use its default reward.
        fixed_noise: if given, the GP likelihood variances are fixed to it.
        logging / eval_runs / eval_max_timesteps: evaluation-logging controls.

    Raises:
        ValueError: for an unrecognised controller or reward type.
    """
    np.random.seed(seed)
    # Initial random rollouts to seed the dynamics dataset.
    X, Y, _, _ = rollout(env=env, pilco=None, timesteps=sim_timesteps,
                         random=True, SUBS=SUBS)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env=env, pilco=None, timesteps=sim_timesteps,
                               random=True, SUBS=SUBS)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    if cont is None:
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=5)
    elif cont['type'] == 'rbf':
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=cont['basis_functions'],
                                   max_action=cont.get('max_action', 1.0))
    elif cont['type'] == 'linear':
        controller = LinearController(state_dim=state_dim,
                                      control_dim=control_dim,
                                      max_action=cont.get('max_action', 1.0))
    else:
        # Fix: the exception was previously constructed but never raised,
        # letting execution continue with `controller` undefined.
        raise ValueError('Invalid Controller')
    if rew is None:
        reward = None
    elif rew['type'] == 'exp':
        reward = ExponentialReward(state_dim=state_dim, t=rew['t'], W=rew['W'])
    else:
        # Fix: same missing-`raise` defect as the controller branch above.
        raise ValueError('This function only handles Exponential rewards for now')
    pilco = PILCO((X, Y), controller=controller, reward=reward,
                  horizon=plan_timesteps, m_init=m_init, S_init=S_init)
    if fixed_noise is not None:
        # For numerical stability: pin the GP likelihood variances.
        for model in pilco.mgpr.models:
            model.likelihood.variance.assign(fixed_noise)
            set_trainable(model.likelihood.variance, False)
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    if name == '':
        # Default to a timestamped results directory.
        from datetime import datetime
        current_time = datetime.now()
        name = current_time.strftime("%d_%m_%Y_%H_%M_%S")
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models()
        pilco.optimize_policy(maxiter=maxiter, restarts=restarts)
        X_new, Y_new, _, _ = rollout(env, pilco, timesteps=sim_timesteps,
                                     SUBS=SUBS, verbose=True)
        cur_rew = 0
        # Keep only the planning-horizon prefix of the new rollout.
        X = np.vstack((X, X_new[:plan_timesteps, :]))
        Y = np.vstack((Y, Y_new[:plan_timesteps, :]))
        pilco.mgpr.set_data((X, Y))
        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [
                    X_eval_, _, evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]
                ] = rollout(env, pilco, timesteps=eval_max_timesteps,
                            verbose=False, SUBS=SUBS, render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists("results/" + name):
                os.makedirs("results/" + name)
            # CSVs are rewritten each iteration so partial results survive an
            # interrupted run.
            np.savetxt("results/" + name + "X_" + str(seed) + ".csv", X,
                       delimiter=',')
            np.savetxt("results/" + name + "X_eval_" + str(seed) + ".csv",
                       X_eval, delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_sampled_"
                       + str(seed) + ".csv",
                       evaluation_returns_sampled, delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_full_"
                       + str(seed) + ".csv",
                       evaluation_returns_full, delimiter=',')
Y = np.vstack((Y, Y_)).astype(np.float64) for i in range(1,4): #No action sampling; u := 0 X_, Y_ = rollout(env=env, pilco=None, random=None, timesteps=80, SUBS=subs, render=False) X = np.vstack((X, X_)).astype(np.float64) Y = np.vstack((Y, Y_)).astype(np.float64) state_dim = Y.shape[1] control_dim = X.shape[1] - state_dim controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=18, max_action=env.action_space.high) #controller = LinearController(state_dim=state_dim, control_dim=control_dim) #print(X) #pilco = PILCO(X, Y, controller=controller, horizon=40, num_induced_points=72) # Example of user provided reward function, setting a custom target state R = ExponentialReward(state_dim=state_dim, t=target) pilco = PILCO(X, Y, controller=controller, horizon=20, reward=R, num_induced_points=72, m_init=m_init) # Example of fixing a parameter, optional, for a linear controller only #pilco.controller.b = np.array([[0.0]]) #pilco.controller.b.trainable = False plot_model = False # Plot model set to false until iteration 5 is reached for rollouts in range(20): print("Optimizing models") pilco.optimize_models() print("Optimizing controller") pilco.optimize_policy(maxiter=20) #import pdb; pdb.set_trace() X_new, Y_new = rollout(env=env, pilco=pilco, timesteps=120, SUBS=subs, render=False) print("No of ops:", len(tf.get_default_graph().get_operations()))
if __name__ == '__main__':
    # Experiment hyper-parameters for the continuous cart-pole transfer setup.
    bf = 10                                   # RBF basis functions
    max_action = 1.0
    target = np.array([0., 0., 0., 0.])       # goal state (origin)
    weights = np.diag([0.5, 0.1, 0.5, 0.25])  # per-dimension reward weights
    T = 400
    state_dim = 4
    control_dim = 1
    # TF1-style session scope required by the underlying GP/PILCO graph ops.
    with tf.Session() as sess:
        env = gym.make('continuous-cartpole-v0')
        # Fresh controller/reward so load_pilco can restore saved parameters.
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=bf, max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-5',
                           controller=controller, reward=R, sparse=False)
        # Adapt the batch-oriented compute_action to a single-state interface.
        pilco.act = lambda x: pilco.compute_action(x[None, :])[0, :]
        # Source env (training) and target env (transfer) for policy adjustment.
        env_s = gym.make('continuous-cartpole-v0')
        env_t = gym.make('continuous-cartpole-v1')
        padjust(env_s, env_t, pilco)
weights_l[0] = 0.1 max_ang = 100 / 180 * np.pi t1 = np.zeros(state_dim) t1[2] = max_ang w1 = 1e-6 * np.eye(state_dim) w1[2, 2] = 10 t2 = np.zeros(state_dim) t2[3] = max_ang w2 = 1e-6 * np.eye(state_dim) w2[3, 3] = 10 t3 = np.zeros(state_dim) t3[2] = -max_ang t4 = np.zeros(state_dim) t4[3] = -max_ang R2 = LinearReward(state_dim, weights_l) R3 = ExponentialReward(state_dim, W=w1, t=t1) R4 = ExponentialReward(state_dim, W=w2, t=t2) R5 = ExponentialReward(state_dim, W=w1, t=t3) R6 = ExponentialReward(state_dim, W=w2, t=t4) R = CombinedRewards(state_dim, [R2, R3, R4, R5, R6], coefs=[1.0, -1.0, -1.0, -1.0, -1.0]) # Initial random rollouts to generate a dataset X, Y = rollout(env, None, timesteps=T, random=True, SUBS=SUBS) for i in range(1, J): X_, Y_ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)