def plot_pilco_source_learning_curve():
    env = gym.make('continuous-cartpole-v0')
    env.seed(73)

    pilcos = ['initial'] + [str(i) for i in range(6)]

    rewards = []
    for i, p in enumerate(pilcos):
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=bf,
                                   max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(p),
                           controller=controller,
                           reward=R,
                           sparse=False)

        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0

        xs = []
        angles = []

        while True:
            xs.append(state[0])
            angles.append(state[2])
            step += 1

            env.render()

            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))

            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break

        rewards.append(step)

        plt.plot(xs, angles)
        plt.savefig('pilco-{:d}_states_plot'.format(i), bbox_inches="tight")
        plt.close()

    env.close()

    plt.plot([i for i, _ in enumerate(pilcos)], rewards)
    plt.savefig('pilco_rewards_plot', bbox_inches="tight")
    plt.close()

    return rewards, xs, angles
def loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)

    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
    # observation_space = env.observation_space.shape[0]

    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()

            #TODO RUN PI ADJUST
            action = utils.policy(env, pilco, state, False)
            # TODO RUN PI ADJUST COMMENT THE NEXT LINE

            state_next, reward, terminal, info = env.step(action)
            # reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break

    env.env.close()
Esempio n. 3
0
    def get_RewardFun_and_Controller(self, state_dim, control_dim, data_mean,
                                     data_std, max_yerror, W):
        t1 = np.divide([max_yerror, 0.0, 0.0, 0.0] - data_mean, data_std)
        t2 = np.divide([-max_yerror, 0.0, 0.0, 0.0] - data_mean, data_std)
        R1 = ExponentialReward(state_dim, W, t=t1)
        R2 = ExponentialReward(state_dim, W, t=t2)
        target = np.divide([0.0, 0.0, 0.0, 0.0] - data_mean, data_std)
        Re = ExponentialReward(state_dim=state_dim, t=target, W=W)
        R = CombinedRewards(state_dim, [Re, R1, R2], coefs=[2, -1, -1])

        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=30)
        # num_basis_functions: number of RBFuntions, a large number of functions is flexber but increases computational demands.
        # If v==4: num_basis_functions=50
        return R, controller
def load_and_run_model(env, name):
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    print('Running {:s}'.format(name))
    rollout(env, pilco, timesteps=T_sim, verbose=False, SUBS=SUBS)
def piadjust(NT, name):
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    env_S = gym.make('continuous-cartpole-v0')
    env_S.seed(73)

    env_T = gym.make('continuous-cartpole-v99')
    env_T.seed(73)

    D_S = sampler(pilco, env_S, samples_n=30, trials=50)
    # print('D_S sampling done')

    D_T = None
    pi_adj = pilco

    for i in range(NT):
        print('{:d}/{:d}'.format(i + 1, NT))

        D_adj = []

        if i == 0:
            D_i_T = sampler(pilco, env_T, samples_n=30)
        elif i != 0:
            D_i_T = sampler_adj(pi_adj, pilco, env_T, 30)

        if D_T is not None:
            D_T = np.concatenate((D_i_T, D_T))
        elif D_T is None:
            D_T = D_i_T

        # print('Going for inverse dyn')
        gpr = inverse_dyn(D_T)
        # print('inverse dyn done')

        for samp in D_S:
            x_s = list(samp[0])
            x_s1 = list(samp[2])
            u_t_S = samp[1]

            a = np.array(x_s + x_s1).reshape(1, 8)
            u_t_T = gpr.predict(a, return_std=False)

            D_adj.append((x_s, u_t_S, u_t_T - u_t_S))

        # print('Going for L3')
        pi_adj = L3(D_adj)
        # print('L3 Done')

        # i = i + 1
        # if (i % 1 == 0):
        save_object(pi_adj, 'transfer-save/pilco-{:s}-transfer-{:d}.pkl'.format(name, i))

    env_S.env.close()
    env_T.env.close()

    return pi_adj
def see_progression(pilco_name='saved/pilco-continuous-cartpole-5',
                    transfer_name='{:d}true_dyn_pi_adj.pkl',
                    adjust=True):
    env = gym.make('continuous-cartpole-v99')
    env.seed(1)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco(pilco_name,
                       controller=controller,
                       reward=R,
                       sparse=False)

    rewards = []

    for i in range(10):
        print('Running {:s}'.format(transfer_name.format(i)))
        if adjust:
            with open(transfer_name.format(i), 'rb') as inp2:
                pi_adjust = pickle.load(inp2)

        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        while True:
            step += 1

            env.render()

            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))

            if adjust:
                pi_adjust_action = pi_adjust.predict(
                    np.array(a).reshape(1, -1))[0]
            else:
                pi_adjust_action = 0  # ENABLE THIS TO SEE IT RUN WITHOUT THE ADJUSTMENT

            state_next, reward, terminal, info = env.step(u_action +
                                                          pi_adjust_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break

        rewards.append(step)

    env.close()
    return rewards
def test_reward():
    '''
    Test reward function by comparing to reward.m
    '''
    k = 2  # state dim
    m = np.random.rand(1, k)
    s = np.random.rand(k, k)
    s = s.dot(s.T)

    reward = ExponentialReward(k)
    W = reward.W.numpy()
    t = reward.t.numpy()

    M, S = reward.compute_reward(m, s)

    M_mat, _, _, S_mat = octave.reward(m.T, s, t.T, W, nout=4)

    np.testing.assert_allclose(M, M_mat)
    np.testing.assert_allclose(S, S_mat)
Esempio n. 8
0
def run_transfer_model_and_plot_pos(env_name, pilco_name, fig_file_name, fig_title, transfer_name=None, save=True):
    env = gym.make(env_name)
    env.seed(1)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(pilco_name), controller=controller, reward=R, sparse=False)

    if transfer_name is not None:
        with open(transfer_name, 'rb') as inp2:
            pi_adjust = pickle.load(inp2)

    xs = []
    angles = []

    state = env.reset()
    for _ in range(1000):
        xs.append(state[0])
        angles.append(state[2])
        env.render()

        u_action = policy(env, pilco, state, False)
        state_copy = state

        a = np.ndarray.tolist(state_copy)
        a.extend(np.ndarray.tolist(u_action))

        if transfer_name is not None:
            pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
        else:
            pi_adjust_action = 0

        state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
        state = state_next

        if terminal:
            break

    env.close()

    xs = np.array(xs)
    angles = np.array(angles)

    plt.plot(xs, angles)
    plt.quiver(xs[:-1], angles[:-1], xs[1:] - xs[:-1], angles[1:] - angles[:-1], scale_units='xy', angles='xy', scale=1, color='blue', width=1e-2)
    plt.xlabel('position')
    plt.ylabel('angle')
    plt.title(fig_title)
    plt.xlim(-0.2, 0.2)
    plt.ylim(-0.2, 0.2)

    if save:
        plt.savefig(fig_file_name, bbox_inches="tight")
        plt.close()
Esempio n. 9
0
def test_reward():
    '''
    Test reward function by comparing to reward.m
    '''
    k = 2  # state dim
    m = np.random.rand(1, k)
    s = np.random.randn(k, k)
    s = s.dot(s.T)

    reward = ExponentialReward(k)
    W = reward.W.data.cpu().numpy()
    t = reward.t.data.cpu().numpy()

    M, S = reward.compute_reward(
        torch.tensor(m).float().cuda(),
        torch.tensor(s).float().cuda())

    M_mat, _, _, S_mat = octave.reward(m.T, s, t.T, W, nout=4)
    import pdb
    pdb.set_trace()

    np.testing.assert_allclose(M.cpu().numpy(), M_mat)
    np.testing.assert_allclose(S.cpu().numpy(), S_mat)
Esempio n. 10
0
def souce_loader(name):
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    env = gym.make('continuous-cartpole-v99')

    pi_adjust = None

    score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
    run = 0
    avg_reward = 0
    while run != 101:
        run += 1
        if (run % 20 == 0):
            print('run:  ', run)
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            # env.render()

            # TODO RUN PI ADJUST
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            # TODO RUN PI ADJUST COMMENT THE NEXT LINE

            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                # print("Run: "  + ", score: " + str(step))
                score_logger.add_score(step, run)
                avg_reward = avg_reward + step
                break
    avg_reward = avg_reward / run
    env.env.close()
    return (avg_reward)
Esempio n. 11
0
def true_loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    with open('9true_dyn_pi_adj.pkl', 'rb') as inp2:
        pi_adjust = pickle.load(inp2)

    # with open('10_pi_adj.pkl', 'rb') as inp2:
    #     good_pi = pickle.load(inp2)

    score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            env.render()

            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]

            state_next, reward, terminal, info = env.step(action + u_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print("Run: " + ", score: " + str(step))
                score_logger.add_score(step, run)
                break

    env.env.close()
Esempio n. 12
0
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True,
                               render=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=R,
                  m_init=m_init,
                  S_init=S_init)

    # for numerical stability, we can set the likelihood variance parameters of the GP models
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    r_new = np.zeros((T, 1))
    for rollouts in range(N):
Esempio n. 13
0
def swimmer_run(name, seed):
    env = gym.make('Swimmer-v2').env
    #env = SwimmerWrapper()
    state_dim = 8
    control_dim = 2
    SUBS = 5
    maxiter = 120
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim),
                        (1, state_dim))  # initial state mean
    S_init = 0.005 * np.eye(state_dim)
    J = 10
    N = 15
    T = 15
    bf = 40
    T_sim = 50

    # Reward function that dicourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[3] = 1.0
    max_ang = 95 / 180 * np.pi
    t1 = np.zeros(state_dim)
    t1[2] = max_ang
    w1 = 1e-6 * np.eye(state_dim)
    w1[2, 2] = 10
    t2 = np.zeros(state_dim)
    t2[1] = max_ang
    w2 = 1e-6 * np.eye(state_dim)
    w2[1, 1] = 10
    t5 = np.zeros(state_dim)
    #t5[0] = max_ang
    #w3 = 1e-6 * np.eye(state_dim)
    #w3[0,0] = 5
    t3 = np.zeros(state_dim)
    t3[2] = -max_ang
    t4 = np.zeros(state_dim)
    t4[1] = -max_ang
    #t6 = np.zeros(state_dim); t6[0] = -max_ang
    R2 = LinearReward(state_dim, weights_l)
    R3 = ExponentialReward(state_dim, W=w1, t=t1)
    R4 = ExponentialReward(state_dim, W=w2, t=t2)
    R5 = ExponentialReward(state_dim, W=w1, t=t3)
    R6 = ExponentialReward(state_dim, W=w2, t=t4)
    #R7 = ExponentialReward(state_dim, W=w3, t=t5)
    #R8 = ExponentialReward(state_dim, W=w3, t=t6)
    Rew = CombinedRewards(state_dim, [R2, R3, R4, R5, R6],
                          coefs=[1.0, -1.0, -1.0, -1.0, -1.0])
    # Rew = R2
    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=True,
                         render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               None,
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True,
                               render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim, max_action=max_action)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=Rew,
                  m_init=m_init,
                  S_init=S_init)
    #for model in pilco.mgpr.models:
    #    model.likelihood.variance = 0.0001
    #    model.likelihood.variance.trainable = False

    logging = True  # To save results in .csv files turn this flag to True
    eval_runs = 10
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    eval_max_timesteps = 1000 // SUBS
    X_eval = False
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new, _, _ = rollout(env,
                                     pilco,
                                     timesteps=T_sim,
                                     verbose=True,
                                     SUBS=SUBS,
                                     render=False)

        cur_rew = 0
        for t in range(0, len(X_new)):
            cur_rew += Rew.compute_reward(
                X_new[t, 0:state_dim, None].transpose(),
                0.0001 * np.eye(state_dim))[0]
            if t == T:
                print(
                    'On this episode, on the planning horizon, PILCO reward was: ',
                    cur_rew)
        print('On this episode PILCO reward was ', cur_rew)

        gym_steps = 1000
        T_eval = gym_steps // SUBS
        # Update dataset
        X = np.vstack((X, X_new[:T, :]))
        Y = np.vstack((Y, Y_new[:T, :]))
        pilco.mgpr.set_data((X, Y))
        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [
                    X_eval_, _, evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]
                ] = rollout(env,
                            pilco,
                            timesteps=eval_max_timesteps,
                            verbose=False,
                            SUBS=SUBS,
                            render=False)
                if rollouts == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists(name):
                os.makedirs(name)
            np.savetxt(name + "X_" + seed + ".csv", X, delimiter=',')
            np.savetxt(name + "X_eval_" + seed + ".csv", X_eval, delimiter=',')
            np.savetxt(name + "evaluation_returns_sampled_" + seed + ".csv",
                       evaluation_returns_sampled,
                       delimiter=',')
            np.savetxt(name + "evaluation_returns_full_" + seed + ".csv",
                       evaluation_returns_full,
                       delimiter=',')
Esempio n. 14
0
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    print('experience collected. ')

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=5)
    #controller = LinearController(state_dim=state_dim, control_dim=control_dim)

    # pilco = PILCO(X, Y, controller=controller, horizon=40)
    # Example of user provided reward function, setting a custom target state
    R = ExponentialReward(state_dim=state_dim, t=np.array([0.1, 0, 0, 0]))
    pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)

    # Example of fixing a parameter, optional, for a linear controller only
    #pilco.controller.b = np.array([[0.0]])
    #pilco.controller.b.trainable = False

    for rollouts in range(1):
        print('optimizing models')
        pilco.optimize_models()
        print('optimizing policy')
        pilco.optimize_policy()
        # import pdb; pdb.set_trace()
        X_new, Y_new = rollout(env=env,
                               pilco=pilco,
                               timesteps=100,
Esempio n. 15
0
max_action = 1.0
m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
S_init = 0.005 * np.eye(state_dim)
J = 10
N = 15
T = 15
bf = 40
T_sim = 50

# Reward function that dicourages the joints from hitting their max angles
max_ang = 95 / 180 * np.pi
rewards = []
rewards.append(LinearReward(state_dim, [0, 0, 0, 1.0, 0, 0, 0, 0]))
rewards.append(
    ExponentialReward(state_dim,
                      W=np.diag(np.array([0, 0, 10, 0, 0, 0, 0, 0]) + 1e-6),
                      t=[0, 0, max_ang, 0, 0, 0, 0, 0]))
rewards.append(
    ExponentialReward(state_dim,
                      W=np.diag(np.array([0, 0, 10, 0, 0, 0, 0, 0]) + 1e-6),
                      t=[0, 0, -max_ang, 0, 0, 0, 0, 0]))
rewards.append(
    ExponentialReward(state_dim,
                      W=np.diag(np.array([0, 10, 0, 0, 0, 0, 0, 0]) + 1e-6),
                      t=[0, max_ang, 0, 0, 0, 0, 0, 0]))
rewards.append(
    ExponentialReward(state_dim,
                      W=np.diag(np.array([0, 10, 0, 0, 0, 0, 0, 0]) + 1e-6),
                      t=[0, -max_ang, 0, 0, 0, 0, 0, 0]))
combined_reward = CombinedRewards(state_dim,
                                  rewards,
                     np.std(X1[:, :2], 0))
X = np.zeros(X1.shape)
X[:, :2] = np.divide(X1[:, :2] - np.mean(X1[:, :2], 0), np.std(X1[:, :2], 0))
X[:, 2] = X1[:, -1]  # control inputs are not normalised
Y = np.divide(Y1, np.std(X1[:, :2], 0))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
m_init = np.transpose(X[0, :-1, None])
S_init = 0.5 * np.eye(state_dim)
controller = RbfController(state_dim=state_dim,
                           control_dim=control_dim,
                           num_basis_functions=25)

R = ExponentialReward(state_dim=state_dim,
                      t=np.divide([0.5, 0.0] - env.m, env.std),
                      W=np.diag([0.5, 0.1]))
pilco = PILCO((X, Y),
              controller=controller,
              horizon=T,
              reward=R,
              m_init=m_init,
              S_init=S_init)

best_r = 0
all_Rs = np.zeros((X.shape[0], 1))
for i in range(len(all_Rs)):
    all_Rs[i, 0] = R.compute_reward(X[i, None, :-1],
                                    0.001 * np.eye(state_dim))[0]

ep_rewards = np.zeros((len(X) // T, 1))
Esempio n. 17
0
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim

controller = LinearController(state_dim=state_dim,
                              control_dim=control_dim,
                              max_action=1.0)

if safe:
    if experim_num == 1:
        C1 = SingleConstraint(0, high=max_temp, inside=False)
    else:
        C1 = SingleConstraint(1, high=max_temp, inside=False)
    R1 = ExponentialReward(state_dim=state_dim, t=targets)
    R = CombinedRewards(state_dim, [R1, C1], coefs=[1.0, -15.0])
else:
    R1 = ExponentialReward(state_dim=state_dim, t=targets, W=weights)
    t2 = targets.copy()
    t2[1] = 20.5
    w2 = weights.copy()
    w2[1, 1] = 4.0
    R2 = ExponentialReward(state_dim=state_dim, t=t2, W=w2)
    R = CombinedRewards(state_dim, [R1, R2], coefs=[1.0, -2.0])

pilco = SafePILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward_add=R1,
                  reward_mult=C1,
Esempio n. 18
0
def loader(name):
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    for pick in range(1, 11):
        env = gym.make('continuous-cartpole-v99')

        with open(str(pick) + '_pi_adj.pkl', 'rb') as inp2:
            pi_adjust = pickle.load(inp2)

        score_logger = ScoreLogger('PI ADJUST ANALYSISSSSSSS')
        run = 0
        avg_reward = 0
        while run != 101:
            run += 1
            if (run % 20 == 0):
                print('run:  ', run)
            state = env.reset()
            # print(state)
            # input()
            step = 0
            while True:
                step += 1
                #env.render()

                #TODO RUN PI ADJUST
                u_action = utils.policy(env, pilco, state, False)
                state_copy = state

                a = np.ndarray.tolist(state_copy)
                a.extend(np.ndarray.tolist(u_action))
                action = pi_adjust.predict(np.array(a).reshape(1, -1))
                action = action[0]
                if action[0] > 1:
                    action[0] = 1
                elif action[0] < -1:
                    action[0] = -1
                # TODO RUN PI ADJUST COMMENT THE NEXT LINE

                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state = state_next
                if terminal:
                    # print("Run: "  + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    avg_reward = avg_reward + step
                    break
        avg_reward = avg_reward / run
        env.env.close()
        Rs[0][pick - 1] = avg_reward
    return (Rs)
def pilco_run(env,
              N,
              J,
              safe=False,
              name='',
              seed=0,
              cont=None,
              rew=None,
              SUBS=1,
              sim_timesteps=50,
              plan_timesteps=30,
              restarts=1,
              maxiter=100,
              m_init=None,
              S_init=None,
              fixed_noise=None,
              logging=False,
              eval_runs=5,
              eval_max_timesteps=None,
              variable_episode_length=False):
    np.random.seed(seed)
    X, Y, _, _ = rollout(env=env,
                         pilco=None,
                         timesteps=sim_timesteps,
                         random=True,
                         SUBS=SUBS)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env=env,
                               pilco=None,
                               timesteps=sim_timesteps,
                               random=True,
                               SUBS=SUBS)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    if cont is None:
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=5)
    elif cont['type'] == 'rbf':
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=cont['basis_functions'],
                                   max_action=cont.get('max_action', 1.0))
    elif cont['type'] == 'linear':
        controller = LinearController(state_dim=state_dim,
                                      control_dim=control_dim,
                                      max_action=cont.get('max_action', 1.0))
    else:
        ValueError('Invalid Controller')

    if rew is None:
        reward = None
    elif rew['type'] == 'exp':
        reward = ExponentialReward(state_dim=state_dim, t=rew['t'], W=rew['W'])
    else:
        ValueError('This function only handles Exponential rewards for now')

    pilco = PILCO((X, Y),
                  controller=controller,
                  reward=reward,
                  horizon=plan_timesteps,
                  m_init=m_init,
                  S_init=S_init)

    if fixed_noise is not None:
        for model in pilco.mgpr.models:
            model.likelihood.variance.assign(fixed_noise)
            set_trainable(model.likelihood.variance, False)

    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    if name == '':
        from datetime import datetime
        current_time = datetime.now()
        name = current_time.strftime("%d_%m_%Y_%H_%M_%S")
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models()
        pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

        X_new, Y_new, _, _ = rollout(env,
                                     pilco,
                                     timesteps=sim_timesteps,
                                     SUBS=SUBS,
                                     verbose=True)

        cur_rew = 0
        X = np.vstack((X, X_new[:plan_timesteps, :]))
        Y = np.vstack((Y, Y_new[:plan_timesteps, :]))
        pilco.mgpr.set_data((X, Y))
        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [
                    X_eval_, _, evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]
                ] = rollout(env,
                            pilco,
                            timesteps=eval_max_timesteps,
                            verbose=False,
                            SUBS=SUBS,
                            render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists("results/" + name):
                os.makedirs("results/" + name)
            np.savetxt("results/" + name + "X_" + str(seed) + ".csv",
                       X,
                       delimiter=',')
            np.savetxt("results/" + name + "X_eval_" + str(seed) + ".csv",
                       X_eval,
                       delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_sampled_" +
                       str(seed) + ".csv",
                       evaluation_returns_sampled,
                       delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_full_" +
                       str(seed) + ".csv",
                       evaluation_returns_full,
                       delimiter=',')
Esempio n. 20
0
        Y = np.vstack((Y, Y_)).astype(np.float64)
    for i in range(1,4): #No action sampling; u := 0
        X_, Y_ = rollout(env=env, pilco=None, random=None,  timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)        


    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=18, max_action=env.action_space.high)
    #controller = LinearController(state_dim=state_dim, control_dim=control_dim)
    #print(X)
    #pilco = PILCO(X, Y, controller=controller, horizon=40, num_induced_points=72)
    
    # Example of user provided reward function, setting a custom target state
    R = ExponentialReward(state_dim=state_dim, t=target)
    pilco = PILCO(X, Y, controller=controller, horizon=20, reward=R, 
                        num_induced_points=72, m_init=m_init)

    # Example of fixing a parameter, optional, for a linear controller only
    #pilco.controller.b = np.array([[0.0]])
    #pilco.controller.b.trainable = False
    plot_model = False # Plot model set to false until iteration 5 is reached
    for rollouts in range(20):
        print("Optimizing models") 
        pilco.optimize_models()
        print("Optimizing controller")
        pilco.optimize_policy(maxiter=20)
        #import pdb; pdb.set_trace()
        X_new, Y_new = rollout(env=env, pilco=pilco, timesteps=120, SUBS=subs, render=False)
        print("No of ops:", len(tf.get_default_graph().get_operations()))
Esempio n. 21
0

if __name__ == '__main__':
    bf = 10
    max_action = 1.0
    target = np.array([0., 0., 0., 0.])
    weights = np.diag([0.5, 0.1, 0.5, 0.25])
    T = 400

    state_dim = 4
    control_dim = 1

    with tf.Session() as sess:
        env = gym.make('continuous-cartpole-v0')

        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=bf,
                                   max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-5',
                           controller=controller,
                           reward=R,
                           sparse=False)

        pilco.act = lambda x: pilco.compute_action(x[None, :])[0, :]

        env_s = gym.make('continuous-cartpole-v0')
        env_t = gym.make('continuous-cartpole-v1')
        padjust(env_s, env_t, pilco)
Esempio n. 22
0
    weights_l[0] = 0.1
    max_ang = 100 / 180 * np.pi
    t1 = np.zeros(state_dim)
    t1[2] = max_ang
    w1 = 1e-6 * np.eye(state_dim)
    w1[2, 2] = 10
    t2 = np.zeros(state_dim)
    t2[3] = max_ang
    w2 = 1e-6 * np.eye(state_dim)
    w2[3, 3] = 10
    t3 = np.zeros(state_dim)
    t3[2] = -max_ang
    t4 = np.zeros(state_dim)
    t4[3] = -max_ang
    R2 = LinearReward(state_dim, weights_l)
    R3 = ExponentialReward(state_dim, W=w1, t=t1)
    R4 = ExponentialReward(state_dim, W=w2, t=t2)
    R5 = ExponentialReward(state_dim, W=w1, t=t3)
    R6 = ExponentialReward(state_dim, W=w2, t=t4)
    R = CombinedRewards(state_dim, [R2, R3, R4, R5, R6],
                        coefs=[1.0, -1.0, -1.0, -1.0, -1.0])

    # Initial random rollouts to generate a dataset
    X, Y = rollout(env, None, timesteps=T, random=True, SUBS=SUBS)
    for i in range(1, J):
        X_, Y_ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=True)