def loader(name):
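    """Load a saved source-task PILCO policy and run it on the target
    cart-pole environment indefinitely, rendering each step and logging the
    episode length as the score."""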
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)

    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    # observation_space = env.observation_space.shape[0]

    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()

            # Unadjusted source policy only; no pi-adjust correction is applied here.
            action = utils.policy(env, pilco, state, False)

            state_next, reward, terminal, info = env.step(action)
            # reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break

    env.close()

def plot_pilco_source_learning_curve():
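    """Run each saved PILCO model ('initial', '0'..'5') for one rendered
    episode on the source environment, plot cart position against pole angle
    for every model, plot episode length against model index, and return the
    episode lengths together with the last model's trajectories."""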
    env = gym.make('continuous-cartpole-v0')
    env.seed(73)

    pilcos = ['initial'] + [str(i) for i in range(6)]

    rewards = []
    for i, p in enumerate(pilcos):
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=bf,
                                   max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(p),
                           controller=controller,
                           reward=R,
                           sparse=False)

        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0

        xs = []
        angles = []

        while True:
            xs.append(state[0])
            angles.append(state[2])
            step += 1

            env.render()

            u_action = utils.policy(env, pilco, state, False)

            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print('Model: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break

        rewards.append(step)

        plt.plot(xs, angles)
        plt.savefig('pilco-{:d}_states_plot'.format(i), bbox_inches="tight")
        plt.close()

    env.close()

    plt.plot([i for i, _ in enumerate(pilcos)], rewards)
    plt.savefig('pilco_rewards_plot', bbox_inches="tight")
    plt.close()

    return rewards, xs, angles

def load_and_run_model(env, name):
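    """Load the named saved PILCO model and perform a single rollout of
    T_sim timesteps in the given environment."""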
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    print('Running {:s}'.format(name))
    rollout(env, pilco, timesteps=T_sim, verbose=False, SUBS=SUBS)

def piadjust(NT, name):
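    """Iteratively learn a policy-adjustment model for transfer from the
    source to the target cart-pole environment.

    Collects source-environment rollouts with the loaded PILCO policy, then
    for NT iterations: collects target-environment rollouts (using the
    current adjusted policy after the first iteration), fits an inverse
    dynamics model on the accumulated target data, predicts for each source
    transition the target action that would reproduce it, and fits the
    adjustment policy pi_adj on the resulting action corrections. The
    adjustment model is saved after every iteration and the final one is
    returned."""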
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    env_S = gym.make('continuous-cartpole-v0')
    env_S.seed(73)

    env_T = gym.make('continuous-cartpole-v99')
    env_T.seed(73)

    D_S = sampler(pilco, env_S, samples_n=30, trials=50)
    # print('D_S sampling done')

    D_T = None
    pi_adj = pilco

    for i in range(NT):
        print('{:d}/{:d}'.format(i + 1, NT))

        D_adj = []

        if i == 0:
            D_i_T = sampler(pilco, env_T, samples_n=30)
        else:
            D_i_T = sampler_adj(pi_adj, pilco, env_T, 30)

        if D_T is not None:
            D_T = np.concatenate((D_i_T, D_T))
        else:
            D_T = D_i_T

        # print('Going for inverse dyn')
        gpr = inverse_dyn(D_T)
        # print('inverse dyn done')

        for samp in D_S:
            x_s = list(samp[0])
            x_s1 = list(samp[2])
            u_t_S = samp[1]

            # Inverse-dynamics input: source state followed by source next state.
            a = np.array(x_s + x_s1).reshape(1, 2 * state_dim)
            u_t_T = gpr.predict(a, return_std=False)

            D_adj.append((x_s, u_t_S, u_t_T - u_t_S))

        # print('Going for L3')
        pi_adj = L3(D_adj)
        # print('L3 Done')

        save_object(pi_adj, 'transfer-save/pilco-{:s}-transfer-{:d}.pkl'.format(name, i))

    env_S.env.close()
    env_T.env.close()

    return pi_adj
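
# A minimal usage sketch (hypothetical call values; assumes the globals set in
# the __main__ block below, e.g. bf, max_action, target and weights, are defined):
#
#     pi_adj = piadjust(NT=10, name='5')
#     scores = see_progression(
#         transfer_name='transfer-save/pilco-5-transfer-{:d}.pkl')
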
def see_progression(pilco_name='saved/pilco-continuous-cartpole-5',
                    transfer_name='{:d}true_dyn_pi_adj.pkl',
                    adjust=True):
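    """Evaluate the saved adjustment models on the target environment.

    For each of the ten saved adjustment models, runs one rendered episode
    in which the source PILCO action is corrected by the predicted
    adjustment (or left unchanged when adjust is False), logs the episode
    length, and returns the list of episode lengths."""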
    env = gym.make('continuous-cartpole-v99')
    env.seed(1)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco(pilco_name,
                       controller=controller,
                       reward=R,
                       sparse=False)

    rewards = []

    for i in range(10):
        print('Running {:s}'.format(transfer_name.format(i)))
        if adjust:
            with open(transfer_name.format(i), 'rb') as inp2:
                pi_adjust = pickle.load(inp2)

        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        while True:
            step += 1

            env.render()

            u_action = utils.policy(env, pilco, state, False)

            # Input to the adjustment model: current state followed by the source action.
            a = state.tolist() + u_action.tolist()

            if adjust:
                pi_adjust_action = pi_adjust.predict(
                    np.array(a).reshape(1, -1))[0]
            else:
                pi_adjust_action = 0  # run the unadjusted source policy

            state_next, reward, terminal, info = env.step(u_action +
                                                          pi_adjust_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print('Model: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break

        rewards.append(step)

    env.close()
    return rewards

# Example 6

def source_loader(name):
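    """Run the unadjusted source PILCO policy on the target environment for
    101 episodes (no rendering) and return the average episode length."""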
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    run = 0
    avg_reward = 0
    while run != 101:
        run += 1
        if (run % 20 == 0):
            print('run:  ', run)
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            # env.render()

            # Unadjusted source policy only; no pi-adjust correction is applied here.
            u_action = utils.policy(env, pilco, state, False)

            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                # print("Run: "  + ", score: " + str(step))
                score_logger.add_score(step, run)
                avg_reward = avg_reward + step
                break
    avg_reward = avg_reward / run
    env.close()
    return avg_reward

def true_loader(name):
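    """Run the source PILCO policy plus the final saved adjustment model
    ('9true_dyn_pi_adj.pkl') on the target environment indefinitely,
    rendering each step and logging episode lengths."""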
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    with open('9true_dyn_pi_adj.pkl', 'rb') as inp2:
        pi_adjust = pickle.load(inp2)

    # with open('10_pi_adj.pkl', 'rb') as inp2:
    #     good_pi = pickle.load(inp2)

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            env.render()

            u_action = utils.policy(env, pilco, state, False)

            # Input to the adjustment model: current state followed by the source action.
            a = state.tolist() + u_action.tolist()
            action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]

            state_next, reward, terminal, info = env.step(action + u_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print("Run: " + ", score: " + str(step))
                score_logger.add_score(step, run)
                break

    env.close()

# Example 8

if __name__ == '__main__':
    bf = 10
    max_action = 1.0
    target = np.array([0., 0., 0., 0.])
    weights = np.diag([0.5, 0.1, 0.5, 0.25])
    T = 400

    state_dim = 4
    control_dim = 1

    with tf.Session() as sess:
        env = gym.make('continuous-cartpole-v0')

        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=bf,
                                   max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-5',
                           controller=controller,
                           reward=R,
                           sparse=False)

        # Expose a single-state interface: compute_action expects a batched
        # input, so add a batch axis on the way in and strip it on the way out.
        pilco.act = lambda x: pilco.compute_action(x[None, :])[0, :]

        env_s = gym.make('continuous-cartpole-v0')
        env_t = gym.make('continuous-cartpole-v1')
        padjust(env_s, env_t, pilco)

# Example 9

def loader(name):
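    """Evaluate each of the ten saved adjustment models ('1_pi_adj.pkl' ..
    '10_pi_adj.pkl') on the target environment: for every model, run 101
    episodes executing the model's predicted action (clipped to the valid
    control range), and return the average episode length per model as a
    1x10 array."""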
    Rs = np.empty((1, 10))
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    for pick in range(1, 11):
        env = gym.make('continuous-cartpole-v99')
        env.seed(73)

        with open(str(pick) + '_pi_adj.pkl', 'rb') as inp2:
            pi_adjust = pickle.load(inp2)

        score_logger = ScoreLogger('PI ADJUST ANALYSIS')
        run = 0
        avg_reward = 0
        while run != 101:
            run += 1
            if (run % 20 == 0):
                print('run:  ', run)
            state = env.reset()
            # print(state)
            # input()
            step = 0
            while True:
                step += 1
                # env.render()

                u_action = utils.policy(env, pilco, state, False)

                # Input to the adjustment model: current state followed by the source action.
                a = state.tolist() + u_action.tolist()

                # Execute the predicted adjusted action directly, clipped to the valid control range.
                action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
                action = np.clip(action, -max_action, max_action)

                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state = state_next
                if terminal:
                    # print("Run: "  + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    avg_reward = avg_reward + step
                    break
        avg_reward = avg_reward / run
        env.close()
        Rs[0][pick - 1] = avg_reward

    return Rs