for model in pilco.mgpr.models:
    model.likelihood.variance.assign(0.001)
    set_trainable(model.likelihood.variance, False)

for rollouts in range(N):
    print("**** ITERATION no", rollouts, " ****")
    pilco.optimize_models(maxiter=maxiter, restarts=2)
    pilco.optimize_policy(maxiter=maxiter, restarts=2)

    X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS, render=True)

    # Since we had to decide on the various parameters of the reward function,
    # we might want to verify by inspection that it behaves as expected
    r_new = np.zeros((len(X_new), 1))
    for i in range(len(X_new)):
        r_new[i, 0] = R.compute_reward(X_new[i, None, :-1], 0.001 * np.eye(state_dim))[0]
    total_r = sum(r_new)
    _, _, r = pilco.predict(X_new[0, None, :-1], 0.001 * S_init, T)
    print("Total ", total_r, " Predicted: ", r)

    # Update dataset
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_data((X, Y))
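# The loops in these examples all depend on a `rollout` helper from the examples'
# utilities. Below is a minimal sketch of what that helper is assumed to do,
# inferred only from how it is called here (Gym-style env, four return values,
# control held for SUBS simulator steps, GP targets recorded as state
# differences); it is an illustration, not the exact implementation.
import numpy as np

def rollout(env, pilco, timesteps, verbose=True, random=False, SUBS=1, render=False):
    X, Y = [], []
    x = env.reset()
    ep_return_full = 0.0     # return accumulated at every simulator step
    ep_return_sampled = 0.0  # return accumulated only at sampled (SUBS-th) steps
    for _ in range(timesteps):
        if render:
            env.render()
        if random:
            u = env.action_space.sample()
        else:
            # the controller expects a (1, state_dim) row vector
            u = pilco.compute_action(x[None, :])[0, :]
        for _ in range(SUBS):  # hold the control for SUBS simulator steps
            x_new, r, done, _ = env.step(u)
            ep_return_full += r
            if done:
                break
        if verbose:
            print("Action: ", u, " Return so far: ", ep_return_full)
        X.append(np.hstack((x, u)))  # GP inputs: state and control
        Y.append(x_new - x)          # GP targets: state differences
        ep_return_sampled += r
        x = x_new
        if done:
            break
    return np.stack(X), np.stack(Y), ep_return_sampled, ep_return_full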
def safe_swimmer_run(seed=0, logging=False):
    env = SwimmerWrapper()
    state_dim = 9
    control_dim = 2
    SUBS = 2
    maxiter = 60
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
    S_init = 0.05 * np.eye(state_dim)
    J = 1
    N = 12
    T = 25
    bf = 30
    T_sim = 100

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[0] = 0.5
    max_ang = (100 / 180 * np.pi) * 0.95  # 95% of the 100-degree joint limit
    R1 = LinearReward(state_dim, weights_l)
    C1 = SingleConstraint(1, low=-max_ang, high=max_ang, inside=False)
    C2 = SingleConstraint(2, low=-max_ang, high=max_ang, inside=False)
    C3 = SingleConstraint(3, low=-max_ang, high=max_ang, inside=False)
    R = CombinedRewards(state_dim, [R1, C1, C2, C3], coefs=[1.0, -10.0, -10.0, -10.0])
    th = 0.2  # acceptable overall risk threshold

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R,
                  m_init=m_init, S_init=S_init)

    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    new_data = True
    eval_runs = T_sim
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=1, restarts=2)

        # Store the predicted state distribution and per-constraint violation
        # risk at every step of the horizon
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risk1 = np.zeros(T)
        predicted_risk2 = np.zeros(T)
        predicted_risk3 = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risk1[h], _ = C1.compute_reward(m_h, S_h)
            predicted_risk2[h], _ = C2.compute_reward(m_h, S_h)
            predicted_risk3[h], _ = C3.compute_reward(m_h, S_h)
        estimate_risk1 = 1 - np.prod(1.0 - predicted_risk1)
        estimate_risk2 = 1 - np.prod(1.0 - predicted_risk2)
        estimate_risk3 = 1 - np.prod(1.0 - predicted_risk3)
        overall_risk = 1 - (1 - estimate_risk1) * (1 - estimate_risk2) * (1 - estimate_risk3)

        if overall_risk < th:
            # Policy is predicted to be safe enough: run it on the system
            X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS)
            new_data = True
            # Update dataset
            X = np.vstack((X, X_new[:T, :]))
            Y = np.vstack((Y, Y_new[:T, :]))
            pilco.mgpr.set_data((X, Y))
            # Relax a constraint's penalty when its predicted risk is very low
            if estimate_risk1 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 0.75, 1.0, 1.0])
            if estimate_risk2 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 0.75, 1.0])
            if estimate_risk3 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 0.75])
        else:
            # Too risky: skip the rollout and tighten the penalty on any
            # constraint whose predicted risk is high
            print("*********CHANGING***********")
            if estimate_risk1 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.5, 1.0, 1.0])
            if estimate_risk2 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.5, 1.0])
            if estimate_risk3 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 1.5])
        _, _, r = pilco.predict(m_init, S_init, T)
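# safe_swimmer_run aggregates per-step constraint-violation probabilities under
# an independence assumption: if p_h is the predicted probability of violating a
# constraint at horizon step h, the chance of at least one violation over the
# horizon is 1 - prod_h (1 - p_h), and the per-constraint risks combine into the
# overall risk the same way. A self-contained numerical illustration (the
# probabilities below are made up for the example):
import numpy as np

per_step_risk = np.array([0.01, 0.02, 0.005, 0.03])  # hypothetical p_h values
constraint_risk = 1 - np.prod(1.0 - per_step_risk)   # P(violated at some step) ~= 0.064
print(constraint_risk)

per_constraint = [0.06, 0.04, 0.05]                  # hypothetical risks for C1, C2, C3
overall_risk = 1 - np.prod([1.0 - p for p in per_constraint])
print(overall_risk)                                  # this is what gets compared against th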
    ep_rewards[i] = sum(all_Rs[i * T:i * T + T])

for model in pilco.mgpr.models:
    model.likelihood.variance.assign(0.05)
    set_trainable(model.likelihood.variance, False)

for rollouts in range(5):
    pilco.optimize_models()
    pilco.optimize_policy(maxiter=100, restarts=3)

    X_new, Y_new, _, _ = rollout(env=env, pilco=pilco, timesteps=T, SUBS=SUBS, render=True)

    # Score the recorded rollout under the reward model and compare it with
    # the return predicted from the initial state distribution
    r_new = np.zeros((len(X_new), 1))
    for i in range(len(X_new)):
        r_new[i, 0] = R.compute_reward(X_new[i, None, :-1], 0.001 * np.eye(state_dim))[0]
    total_r = sum(r_new)
    _, _, r = pilco.predict(m_init, S_init, T)
    print("Total ", total_r, " Predicted: ", r)

    # Update dataset and logged rewards
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    all_Rs = np.vstack((all_Rs, r_new))
    ep_rewards = np.vstack((ep_rewards, np.reshape(total_r, (1, 1))))
    pilco.mgpr.set_data((X, Y))
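# Both training loops above score a recorded rollout by evaluating the reward
# model on each visited state. The reward objects expect a state *distribution*,
# so a tight Gaussian (covariance 0.001 * I) stands in for an observed point.
# A hypothetical helper (the name `score_rollout` is ours, not the library's)
# capturing that repeated pattern:
def score_rollout(R, X_new, state_dim):
    tight_cov = 0.001 * np.eye(state_dim)  # near-delta distribution around each state
    r = np.zeros((len(X_new), 1))
    for i in range(len(X_new)):
        # drop the control column; compute_reward returns (mean, variance)
        r[i, 0] = R.compute_reward(X_new[i, None, :-1], tight_cov)[0]
    return r.sum(), r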