Example #1
                               control_dim=control_dim,
                               num_basis_functions=5)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim)

    pilco = PILCO(X, Y, controller=controller, horizon=40)
    # Example of a user-provided reward function, setting a custom target state
    # R = ExponentialReward(state_dim=state_dim, t=np.array([0.1,0,0,0]))
    # pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)

    # Example of fixing a parameter (optional; for a linear controller only)
    # pilco.controller.b = np.array([[0.0]])
    # pilco.controller.b.trainable = False

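    # Main training loop: refit the GP dynamics models on the data collected so
    # far, optimise the policy against those models, then execute the policy in
    # the environment and append the new transitions to the dataset.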
    for rollouts in range(3):
        pilco.optimize_models()
        pilco.optimize_policy()
        X_new, Y_new, _ = rollout(env=env, pilco=pilco, timesteps=100)

        # Update dataset
        X = np.vstack((X, X_new))
        Y = np.vstack((Y, Y_new))
        pilco.mgpr.set_XY(X, Y)

    input("done training! Press enter to evaluate the policy")
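    # Evaluate the trained policy: run repeated rollouts, keep the data, and
    # record each episode's total reward.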
    trained_rewards = []
    for i in range(1, 100):
        X_, Y_, rewards = rollout(env=env, pilco=pilco, timesteps=100)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
        trained_rewards.append(sum(rewards))
Example #2
    pilco = PILCO((X, Y), controller=controller, horizon=T, m_init=m_init)
    pilco.controller.max_action = e

    # Fix the GP observation-noise variance at a small value for numerical stability
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)
        # model.likelihood.fixed=True

    return_lst = []
    task_length = []
    for rollouts in range(100):
        print("**** ITERATION no.", rollouts, " ****")
        try:
            pilco.optimize_models(maxiter=maxiter)
            pilco.optimize_policy(maxiter=maxiter)
        except Exception:
            # for i in range(len(return_lst)):
            #     return_lst[i] = str(return_lst[i])
            # df = pd.DataFrame(return_lst, columns=['Return per epoch'])
            # df.to_csv(('/home/lab/Github/PILCO/log/Return/{}.csv'.format(time.time())))
            print('Start Error!!!!!!!!!!!!!!!!!!')

        X_new, Y_new, _, ep_return_lst, ep_length = rollout(env=env,
                                                            pilco=pilco,
                                                            timesteps=1000,
                                                            render=False,
                                                            verbose=verbose)
        return_lst.append(ep_return_lst)
        task_length.append(ep_length)
        # Update dataset
Example #3
def safe_swimmer_run(seed=0, logging=False):
    env = SwimmerWrapper()
    state_dim = 9
    control_dim = 2
    SUBS = 2
    maxiter = 60
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim),
                        (1, state_dim))  # initial state mean
    S_init = 0.05 * np.eye(state_dim)
    J = 1  # number of initial random rollouts
    N = 12  # number of training iterations
    T = 25  # planning horizon (also the length of the initial random rollouts)
    bf = 30  # number of RBF basis functions in the controller
    T_sim = 100  # timesteps per on-policy rollout

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[0] = 0.5
    max_ang = (100 / 180 * np.pi) * 0.95
    R1 = LinearReward(state_dim, weights_l)

    C1 = SingleConstraint(1, low=-max_ang, high=max_ang, inside=False)
    C2 = SingleConstraint(2, low=-max_ang, high=max_ang, inside=False)
    C3 = SingleConstraint(3, low=-max_ang, high=max_ang, inside=False)
    R = CombinedRewards(state_dim, [R1, C1, C2, C3],
                        coefs=[1.0, -10.0, -10.0, -10.0])
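    # The constraint terms enter the combined reward with negative coefficients,
    # so predicted violations are penalised; these coefficients are re-weighted
    # adaptively in the training loop below.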

    th = 0.2  # risk threshold: maximum acceptable predicted probability of constraint violation
    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               None,
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=R,
                  m_init=m_init,
                  S_init=S_init)
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    new_data = True
    eval_runs = T_sim
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=1, restarts=2)

        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risk1 = np.zeros(T)
        predicted_risk2 = np.zeros(T)
        predicted_risk3 = np.zeros(T)
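        # For each horizon length h, propagate the initial state distribution
        # through the learned dynamics and record the predicted probability of
        # violating each joint-angle constraint.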
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risk1[h], _ = C1.compute_reward(m_h, S_h)
            predicted_risk2[h], _ = C2.compute_reward(m_h, S_h)
            predicted_risk3[h], _ = C3.compute_reward(m_h, S_h)
        estimate_risk1 = 1 - np.prod(1.0 - predicted_risk1)
        estimate_risk2 = 1 - np.prod(1.0 - predicted_risk2)
        estimate_risk3 = 1 - np.prod(1.0 - predicted_risk3)
        overall_risk = 1 - (1 - estimate_risk1) * (1 - estimate_risk2) * (
            1 - estimate_risk3)
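        # Execute the policy on the real system only if the aggregated
        # violation probability stays below the risk threshold; otherwise
        # increase the constraint penalties and re-plan.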
        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env,
                                         pilco,
                                         timesteps=T_sim,
                                         verbose=True,
                                         SUBS=SUBS)
            new_data = True
            # Update dataset
            X = np.vstack((X, X_new[:T, :]))
            Y = np.vstack((Y, Y_new[:T, :]))
            pilco.mgpr.set_data((X, Y))
            if estimate_risk1 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 0.75, 1.0, 1.0])
            if estimate_risk2 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 0.75, 1.0])
            if estimate_risk3 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 0.75])
        else:
            print("*********CHANGING***********")
            if estimate_risk1 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.5, 1.0, 1.0])
            if estimate_risk2 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.5, 1.0])
            if estimate_risk3 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 1.5])
            _, _, r = pilco.predict(m_init, S_init, T)
Example #4
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

    pilco = PILCO(X, Y, controller=controller, horizon=T, reward=R, m_init=m_init, S_init=S_init)

    # Fix the GP observation-noise variance at a small value for numerical stability
    for model in pilco.mgpr.models:
        # model.kern.lengthscales.prior = gpflow.priors.Gamma(1,10)  # priors have to be included
        # model.kern.variance.prior = gpflow.priors.Gamma(1.5,2)     # before the model gets compiled
        model.likelihood.variance = 0.001
        model.likelihood.variance.trainable = False

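    # Training loop: alternate model fitting, policy optimisation and data
    # collection; only the first T steps of each rollout (the planning horizon)
    # are appended to the training set.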
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(maxiter=maxiter, restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS)

        # Since we had to decide on the various parameters of the reward function
        # we might want to verify that it behaves as expected by inspection
        # cur_rew = 0
        # for t in range(0,len(X_new)):
        #     cur_rew += reward_wrapper(R, X_new[t, 0:state_dim, None].transpose(), 0.0001 * np.eye(state_dim))[0]
        # print('On this episode reward was ', cur_rew)

        # Update dataset
        X = np.vstack((X, X_new[:T, :]))
        Y = np.vstack((Y, Y_new[:T, :]))
        pilco.mgpr.set_XY(X, Y)

        lens.append(len(X_new))
Example #5
control_dim = X.shape[1] - state_dim
controller = RbfController(state_dim=state_dim,
                           control_dim=control_dim,
                           num_basis_functions=10,
                           max_action=max_action)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
pilco = PILCO(X,
              Y,
              controller=controller,
              horizon=T,
              reward=R,
              m_init=m_init,
              S_init=S_init)

pilco.optimize_models(maxiter=100)
pilco.optimize_policy(maxiter=20)

# Rollout using the pilco controller
X_new, Y_new = rollout(env, pilco, timesteps=T, SUBS=SUBS, render=False)

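# Sanity-check the learned dynamics: for each output dimension, plot the GP's
# one-step predictions on the recorded state-action pairs (with a +/-2 sigma
# band) against the observed targets.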
for i, m in enumerate(pilco.mgpr.models):
    y_pred_test, var_pred_test = m.predict_y(X_new)
    plt.plot(range(len(y_pred_test)), y_pred_test)  # predicted mean
    plt.plot(range(len(Y_new)), Y_new[:, i])        # observed targets
    plt.fill_between(range(len(y_pred_test)),
                     y_pred_test[:, 0] - 2 * np.sqrt(var_pred_test[:, 0]),
                     y_pred_test[:, 0] + 2 * np.sqrt(var_pred_test[:, 0]),
                     alpha=0.3)
    plt.show()

np.shape(var_pred_test)
Example #6
def swimmer_run(name, seed):
    env = gym.make('Swimmer-v2').env
    #env = SwimmerWrapper()
    state_dim = 8
    control_dim = 2
    SUBS = 5
    maxiter = 120
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim),
                        (1, state_dim))  # initial state mean
    S_init = 0.005 * np.eye(state_dim)
    J = 10  # number of initial random rollouts
    N = 15  # number of training iterations
    T = 15  # planning horizon (also the length of the initial random rollouts)
    bf = 40  # number of RBF basis functions in the controller
    T_sim = 50  # timesteps per on-policy rollout

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[3] = 1.0
    max_ang = 95 / 180 * np.pi
    t1 = np.zeros(state_dim)
    t1[2] = max_ang
    w1 = 1e-6 * np.eye(state_dim)
    w1[2, 2] = 10
    t2 = np.zeros(state_dim)
    t2[1] = max_ang
    w2 = 1e-6 * np.eye(state_dim)
    w2[1, 1] = 10
    t5 = np.zeros(state_dim)
    #t5[0] = max_ang
    #w3 = 1e-6 * np.eye(state_dim)
    #w3[0,0] = 5
    t3 = np.zeros(state_dim)
    t3[2] = -max_ang
    t4 = np.zeros(state_dim)
    t4[1] = -max_ang
    #t6 = np.zeros(state_dim); t6[0] = -max_ang
    R2 = LinearReward(state_dim, weights_l)
    R3 = ExponentialReward(state_dim, W=w1, t=t1)
    R4 = ExponentialReward(state_dim, W=w2, t=t2)
    R5 = ExponentialReward(state_dim, W=w1, t=t3)
    R6 = ExponentialReward(state_dim, W=w2, t=t4)
    #R7 = ExponentialReward(state_dim, W=w3, t=t5)
    #R8 = ExponentialReward(state_dim, W=w3, t=t6)
    Rew = CombinedRewards(state_dim, [R2, R3, R4, R5, R6],
                          coefs=[1.0, -1.0, -1.0, -1.0, -1.0])
    # Rew = R2
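    # The exponential terms peak when a joint angle approaches +/-max_ang, so
    # their negative coefficients in CombinedRewards turn them into penalties on
    # large joint angles, while the linear term rewards state dimension 3
    # (presumably the swimmer's forward velocity).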
    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=True,
                         render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               None,
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True,
                               render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim, max_action=max_action)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=Rew,
                  m_init=m_init,
                  S_init=S_init)
    #for model in pilco.mgpr.models:
    #    model.likelihood.variance = 0.0001
    #    model.likelihood.variance.trainable = False

    logging = True  # set this flag to True to save results in .csv files
    eval_runs = 10
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    eval_max_timesteps = 1000 // SUBS
    X_eval = False
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new, _, _ = rollout(env,
                                     pilco,
                                     timesteps=T_sim,
                                     verbose=True,
                                     SUBS=SUBS,
                                     render=False)

        cur_rew = 0
        for t in range(0, len(X_new)):
            cur_rew += Rew.compute_reward(
                X_new[t, 0:state_dim, None].transpose(),
                0.0001 * np.eye(state_dim))[0]
            if t == T:
                print(
                    'On this episode, on the planning horizon, PILCO reward was: ',
                    cur_rew)
        print('On this episode PILCO reward was ', cur_rew)

        gym_steps = 1000
        T_eval = gym_steps // SUBS
        # Update dataset
        X = np.vstack((X, X_new[:T, :]))
        Y = np.vstack((Y, Y_new[:T, :]))
        pilco.mgpr.set_data((X, Y))
        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = T_sim
            for k in range(0, eval_runs):
                [
                    X_eval_, _, evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]
                ] = rollout(env,
                            pilco,
                            timesteps=eval_max_timesteps,
                            verbose=False,
                            SUBS=SUBS,
                            render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists(name):
                os.makedirs(name)
            np.savetxt(name + "X_" + seed + ".csv", X, delimiter=',')
            np.savetxt(name + "X_eval_" + seed + ".csv", X_eval, delimiter=',')
            np.savetxt(name + "evaluation_returns_sampled_" + seed + ".csv",
                       evaluation_returns_sampled,
                       delimiter=',')
            np.savetxt(name + "evaluation_returns_full_" + seed + ".csv",
                       evaluation_returns_full,
                       delimiter=',')
def pilco_run(env,
              N,
              J,
              safe=False,
              name='',
              seed=0,
              cont=None,
              rew=None,
              SUBS=1,
              sim_timesteps=50,
              plan_timesteps=30,
              restarts=1,
              maxiter=100,
              m_init=None,
              S_init=None,
              fixed_noise=None,
              logging=False,
              eval_runs=5,
              eval_max_timesteps=None,
              variable_episode_length=False):
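    """Generic PILCO training loop.

    A sketch of the main parameters, inferred from how they are used below:
      N              -- number of training iterations (model fit, policy update, rollout)
      J              -- number of initial random rollouts used to seed the dataset
      cont, rew      -- dicts selecting the controller ('rbf' or 'linear') and reward ('exp')
      SUBS           -- action repeat / subsampling factor passed to rollout()
      sim_timesteps  -- timesteps per data-collection rollout
      plan_timesteps -- planning horizon; also how many steps of each rollout are kept
      fixed_noise    -- if given, the GP likelihood variance is fixed to this value
      logging, eval_runs, eval_max_timesteps -- periodic evaluation and CSV logging
    """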
    np.random.seed(seed)
    X, Y, _, _ = rollout(env=env,
                         pilco=None,
                         timesteps=sim_timesteps,
                         random=True,
                         SUBS=SUBS)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env=env,
                               pilco=None,
                               timesteps=sim_timesteps,
                               random=True,
                               SUBS=SUBS)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    if cont is None:
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=5)
    elif cont['type'] == 'rbf':
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=cont['basis_functions'],
                                   max_action=cont.get('max_action', 1.0))
    elif cont['type'] == 'linear':
        controller = LinearController(state_dim=state_dim,
                                      control_dim=control_dim,
                                      max_action=cont.get('max_action', 1.0))
    else:
        raise ValueError('Invalid Controller')

    if rew is None:
        reward = None
    elif rew['type'] == 'exp':
        reward = ExponentialReward(state_dim=state_dim, t=rew['t'], W=rew['W'])
    else:
        raise ValueError('This function only handles Exponential rewards for now')

    pilco = PILCO((X, Y),
                  controller=controller,
                  reward=reward,
                  horizon=plan_timesteps,
                  m_init=m_init,
                  S_init=S_init)

    if fixed_noise is not None:
        for model in pilco.mgpr.models:
            model.likelihood.variance.assign(fixed_noise)
            set_trainable(model.likelihood.variance, False)

    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    if name == '':
        from datetime import datetime
        current_time = datetime.now()
        name = current_time.strftime("%d_%m_%Y_%H_%M_%S")
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models()
        pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

        X_new, Y_new, _, _ = rollout(env,
                                     pilco,
                                     timesteps=sim_timesteps,
                                     SUBS=SUBS,
                                     verbose=True)

        cur_rew = 0
        X = np.vstack((X, X_new[:plan_timesteps, :]))
        Y = np.vstack((Y, Y_new[:plan_timesteps, :]))
        pilco.mgpr.set_data((X, Y))
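        # Periodic evaluation: run eval_runs rollouts with the current policy,
        # record the sampled and full returns, and dump everything to CSV files.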
        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [
                    X_eval_, _, evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]
                ] = rollout(env,
                            pilco,
                            timesteps=eval_max_timesteps,
                            verbose=False,
                            SUBS=SUBS,
                            render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists("results/" + name):
                os.makedirs("results/" + name)
            np.savetxt("results/" + name + "X_" + str(seed) + ".csv",
                       X,
                       delimiter=',')
            np.savetxt("results/" + name + "X_eval_" + str(seed) + ".csv",
                       X_eval,
                       delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_sampled_" +
                       str(seed) + ".csv",
                       evaluation_returns_sampled,
                       delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_full_" +
                       str(seed) + ".csv",
                       evaluation_returns_full,
                       delimiter=',')
        all_Rs[i, 0] = R.compute_reward(X[i, None, :-1],
                                        0.001 * np.eye(state_dim))[0]  # reward for each step

    total_rewards = np.zeros((len(X) // T, 1))
    ep_rewards = np.zeros((len(X) // T, 1))

    for i in range(len(total_rewards)):
        total_rewards[i] = sum(all_Rs[i * T:i * T + T])
        ep_rewards[i] = sum(all_Rs[i * T:i * T + T])
    
    
    for rollouts in range(1, 31):
        # Optimization

        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(maxiter=2000, restarts=3)  # restarts help avoid local minima
        pilco.optimize_policy(maxiter=2000, restarts=3)
        
        # Initialize
        robot_state.set_pose(x=offsetPath[0][0], y=offsetPath[1][0],
                             yaw=offsetPath[2][0], yawRate=0.0)
        robot_state.set_steerAngle(steerAngle=0.0)
        pathHandler.reset__index_nn()
        # Testing
        X_new, Y_new, j = pilcotrac.rollout(pilco, max_yerror,
                                            data_mean=data_mean, data_std=data_std,
                                            lookahead=1.22, timesteps=T_sim,
                                            random=False, verbose=False)

        r_new = np.zeros((len(X_new), 1))
        var_r = np.zeros((len(X_new), 1))
        r_tar = np.zeros((len(X_new), 1))

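        # Compare the reward actually collected along the test rollout with the
        # reward of the target state under the same state covariance S_init.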
        for i in range(len(X_new)):
            r_new[i, 0], var_r[i, 0] = R.compute_reward(X_new[i, None, :-1], S_init)
            r_tar[i, 0] = R.compute_reward(target[None, :], S_init)[0]
Example #9
X = X.values
Y = Y.values

print('startk = ', startk)
print('X.shape = ', X.shape)
print('Y.shape =', Y.shape)

# X = X.T; Y = Y.T
T = int(30 * 2)  # planning horizon

m_init = np.reshape([0.0, 0.0, 0.0, 0.0], (1, 4))  # initial state mean
S_init = np.diag([0.01, 0.01, 0.01, 0.01])  # initial state covariance

controller = RbfController(state_dim=state_dim,
                           control_dim=control_dim,
                           num_basis_functions=10)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
pilco_model = PILCO((X, Y),
                    controller=controller,
                    horizon=T,
                    reward=R,
                    m_init=m_init,
                    S_init=S_init)

pilco_model.optimize_models(maxiter=100)
pilco_model.optimize_policy(maxiter=20)

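# Freeze the trained model (replacing trainable variables with constants) so
# that it can be serialised with dill.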
with open(out_fname, 'wb') as wf:
    frozen_model = gpflow.utilities.freeze(pilco_model)
    dill.dump(frozen_model, wf)