Example 1
def main(_):
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        config = get_config(FLAGS) or FLAGS

        # if config.env_type == 'simple':
        #   env = SimpleGymEnvironment(config)
        # else:
        #   env = GymEnvironment(config)

        env1 = ThreeDMountainCarEnv()
        env2 = MountainCarEnv()

        if not tf.test.is_gpu_available() and FLAGS.use_gpu:
            raise Exception("use_gpu flag is true when no GPUs are available")

        if not FLAGS.use_gpu:
            config.cnn_format = 'NHWC'

        agent1 = Agent(config, env2, sess)
        # agent2 = Agent(config, env2, sess)

        if FLAGS.is_train:
            agent1.train()
            # agent1.save_weight_to_pkl()
            # # test
            # with open('./weights/MountainCar-v0_h1_b.pkl', 'rb') as f:
            #   w = cPickle.load(f)
            #   print(w)
            agent1.play()
        else:
            agent1.play()
Example 2
def main():
    # env = gym.make("MountainCar-v0")
    env = ThreeDMountainCarEnv()
    # Enabling layer_norm here is important for parameter space noise!
    model = deepq.models.mlp([64], layer_norm=True)
    act = deepq.learn(env,
                      q_func=model,
                      lr=1e-3,
                      max_timesteps=100000,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.1,
                      print_freq=1,
                      param_noise=False)
    print("Saving model to mountaincar_model_working.pkl")
    act.save("mountaincar_model_working.pkl")
Example 3
def main():
    env = MountainCarEnv()
    env_transfer = ThreeDMountainCarEnv()
    # Enabling layer_norm here is important for parameter space noise!
    # model = deepq.models.mlp([64], layer_norm=True)
    model = deepq.models.prog_nn([64], layer_norm=False)

    act = deepq.learn(env,
                      env_transfer,
                      q_func=model,
                      lr=1e-3,
                      max_timesteps=100000,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.1,
                      print_freq=1,
                      param_noise=False)
    print("Saving model to mountaincar_model.pkl")
    act.save("mountaincar_model.pkl")
Example 4
def main():
    # env = gym.make("MountainCar-v0")
    env = ThreeDMountainCarEnv()
    act = deepq.load("mountaincar_model_working.pkl")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            # env.render()
            env.render_orthographic()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            print(act(obs[None])[0])
            print(obs)
            episode_rew += rew
        print("Episode reward", episode_rew)
Example 5
    def getRandom3DInstance(self, with_velocity=True):
        env = ThreeDMountainCarEnv()
        env.reset()

        # Sample x and y positions within their respective bounds.
        random_pos_x = np.random.uniform(low=env.min_position_x,
                                         high=env.max_position_x)
        random_pos_y = np.random.uniform(low=env.min_position_y,
                                         high=env.max_position_y)

        # TODO: calculate the maximum speed at this position
        random_velocity_x = np.random.uniform(
            low=-env.max_speed_x, high=env.max_speed_x) if with_velocity else 0
        random_velocity_y = np.random.uniform(
            low=-env.max_speed_y, high=env.max_speed_y) if with_velocity else 0

        state = [
            random_pos_x, random_pos_y, random_velocity_x, random_velocity_y
        ]
        env.set_state(state)
        action = np.random.randint(low=0, high=5)
        next_state, reward, done, info = env.step(action)
        return [state, action, next_state, reward, done]
Example 6
import gym
import numpy as np
from matplotlib import pyplot as plt
import itertools
from lib.env.threedmountain_car import ThreeDMountainCarEnv

env = ThreeDMountainCarEnv()

state = env.reset()

for t in itertools.count():
    # action = env.action_space.sample()
    next_state, reward, done, info = env.step(0)
    # env.render() # yellow
    # env.render_y() #cyan
    env.render_orthographic()

    if done:
        break

    state = next_state

    # if t == 100:
    # 	env.close_gui()
    # 	break
Example 7
with open('data/mse_action_mappings.pkl', 'rb') as file:
    mse_action_mappings = pickle.load(file)

with open('data/q_learning.pkl', 'rb') as file:
    qlearning_2d = pickle.load(file)
    historic_predictor = qlearning_2d.estimator.predict


pprint.pprint(mse_state_mappings)
pprint.pprint(mse_action_mappings)
pprint.pprint(historic_predictor)
print(np.shape(mse_state_mappings))
print(np.shape(mse_action_mappings))

env = ThreeDMountainCarEnv()

# Feature Preprocessing: Normalize to zero mean and unit variance
# We use a few samples from the observation space to do this
observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

# Used to convert a state to a featurized representation.
# We use RBF kernels with different variances to cover different parts of the space
featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
        ])
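
A brief illustrative sketch, not part of the original snippet: the fitted scaler and the RBF FeatureUnion above are typically combined into a small helper that turns a raw environment state into a feature vector. The explicit featurizer.fit call and the helper name featurize_state are assumptions added for illustration.

# Illustrative only: fit the RBF features on scaled samples, then expose a
# helper (hypothetical name) that featurizes a single state.
featurizer.fit(scaler.transform(observation_examples))

def featurize_state(state):
    # Normalize the state, then project it onto the 4 x 100 RBF components.
    scaled = scaler.transform([state])
    return featurizer.transform(scaled)[0]

# e.g. featurize_state(env.reset()).shape == (400,)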
Example 8
def main():
    # get envs
    mc2d_env = lib.env.mountain_car.MountainCarEnv()
    mc3d_env = ThreeDMountainCarEnv()

    # source task
    if os.path.isfile('./dsource_qlearn.npz'):
        f_read = np.load('./dsource_qlearn.npz')
        # print(f_read['dsource'].shape)
        dsource = f_read['dsource']
    else:
        qlearning_2d = ql.QLearning(mc2d_env)
        qlearning_2d.learn()
        dsource = np.array(qlearning_2d.play())
        # print(dsource.shape)
        np.savez('dsource_qlearn.npz', dsource=dsource)

    # target task
    if os.path.isfile('./dtarget_random.npz'):
        f_read = np.load('./dtarget_random.npz')
        # print(f_read['dtarget'].shape)
        dtarget = f_read['dtarget']
    else:
        random_action_3d = lib.RandomAction.RandomAction(mc3d_env)
        dtarget = np.array(random_action_3d.play())
        np.savez('./dtarget_random.npz', dtarget=dtarget)

    dtarget_x = np.array([np.append(x[0], x[1]) for x in dtarget])
    dtarget_y = np.array([x[2] for x in dtarget])

    dtarget_train_x = dtarget_x[:-100]
    dtarget_train_y = dtarget_y[:-100]
    dtarget_test_x = dtarget_x[-100:]
    dtarget_test_y = dtarget_y[-100:]

    # train one step transition model
    nn = tf.estimator.Estimator(model_fn=model_fn)

    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": dtarget_train_x},
        y=dtarget_train_y,
        num_epochs=None,
        shuffle=True)

    # print(dtarget_train_x.shape)
    # print(dtarget_train_y.shape)

    nn.train(input_fn=train_input_fn, steps=num_steps)

    test_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": dtarget_test_x},
                                                       y=dtarget_test_y,
                                                       num_epochs=1,
                                                       shuffle=False)

    ev = nn.evaluate(input_fn=test_input_fn)

    print("loss: %s" % ev["loss"])
    print("Root Mean Squared Error: %s" % ev["rmse"])

    # Find the mapping between source and target
    mc2d_states = mc2d_env.observation_space.shape[0]  # 2
    mc3d_states = mc3d_env.observation_space.shape[0]  # 4
    mc2d_actions = mc2d_env.action_space.n  # 3
    mc3d_actions = mc3d_env.action_space.n  # 5

    mse_state_mappings = np.zeros((2, ) * mc3d_states)  # 2 by 2 by 2 by 2
    mse_action_mappings = np.ndarray(
        shape=(mc3d_actions, mc2d_actions,
               mc3d_states * mc3d_states))  # 5 by 3 by 16
    mse_action_mappings.fill(-1)

    state_count = 0
    # s0..s3 pick which source state (0 = x, 1 = x_dot) maps to each of the
    # four target states (x, y, x_dot, y_dot respectively).
    for s0 in range(mc2d_states):
        for s1 in range(mc2d_states):
            for s2 in range(mc2d_states):
                for s3 in range(mc2d_states):

                    state_losses = []

                    for a_mc3d in range(mc3d_actions):
                        for a_mc2d in range(mc2d_actions):
                            states = np.array(
                                [x[0] for x in dsource if x[1] == a_mc2d])
                            actions = np.array(
                                [x[1] for x in dsource if x[1] == a_mc2d])
                            n_states = np.array(
                                [x[2] for x in dsource if x[1] == a_mc2d])

                            if states.size == 0 or actions.size == 0 or n_states.size == 0:
                                print('skipping: dsource has no samples for '
                                      'source action {}'.format(a_mc2d))  # TODO
                                # mse_action_mappings[a_mc3d, a_mc2d, state_count] = 0
                                continue

                            # transform to dsource_trans
                            actions_trans = np.ndarray(shape=(actions.size, ))
                            actions_trans.fill(a_mc3d)
                            input_trans = np.array([
                                states[:, s0], states[:, s1], states[:, s2],
                                states[:, s3], actions_trans
                            ]).T
                            # input_trans = [states_trans, actions]
                            n_states_trans = np.array([
                                n_states[:, s0], n_states[:, s1],
                                n_states[:, s2], n_states[:, s3]
                            ]).T

                            # calculate mapping error
                            test_input_fn = tf.estimator.inputs.numpy_input_fn(
                                x={"x": input_trans},
                                y=n_states_trans,
                                num_epochs=1,
                                shuffle=False)
                            ev = nn.evaluate(input_fn=test_input_fn)
                            # loss_mapping = sess.run(loss_op, feed_dict={X: input_trans, Y: n_states_trans})
                            # print('loss_mapping is {}'.format(loss_mapping))

                            state_losses.append(ev["loss"])
                            mse_action_mappings[a_mc3d, a_mc2d,
                                                state_count] = ev["loss"]

                    mse_state_mappings[s0, s1, s2, s3] = np.mean(state_losses)
                    state_count += 1

    # mse_action_mappings_result = [[np.mean(mse_action_mappings[a_mc3d, a_mc2d, :]) for a_mc2d in range(mc2d_actions)] for a_mc3d in range(mc3d_actions)]

    mse_action_mappings_result = np.zeros((mc3d_actions, mc2d_actions))
    for a_mc3d in range(mc3d_actions):
        for a_mc2d in range(mc2d_actions):
            losses_act = []
            for s in range(mc3d_states * mc3d_states):
                if mse_action_mappings[a_mc3d, a_mc2d, s] != -1:
                    # print(mse_action_mappings[a_mc3d, a_mc2d, s])
                    losses_act.append(mse_action_mappings[a_mc3d, a_mc2d, s])
            mse_action_mappings_result[a_mc3d, a_mc2d] = np.mean(losses_act)

    print('action mapping: {}'.format(mse_action_mappings_result))
    print('state mapping {}'.format(mse_state_mappings))

    print('x,x,x,x: {}'.format(mse_state_mappings[0][0][0][0]))
    print('x,x,x,x_dot: {}'.format(mse_state_mappings[0][0][0][1]))
    print('x,x,x_dot,x: {}'.format(mse_state_mappings[0][0][1][0]))
    print('x,x,x_dot,x_dot: {}'.format(mse_state_mappings[0][0][1][1]))
    print('x,x_dot,x,x: {}'.format(mse_state_mappings[0][1][0][0]))
    print('x,x_dot,x,x_dot: {}'.format(mse_state_mappings[0][1][0][1]))
    print('x,x_dot,x_dot,x: {}'.format(mse_state_mappings[0][1][1][0]))
    print('x,x_dot,x_dot,x_dot: {}'.format(mse_state_mappings[0][1][1][1]))
    print('x_dot,x,x,x: {}'.format(mse_state_mappings[1][0][0][0]))
    print('x_dot,x,x,x_dot: {}'.format(mse_state_mappings[1][0][0][1]))
    print('x_dot,x,x_dot,x: {}'.format(mse_state_mappings[1][0][1][0]))
    print('x_dot,x,x_dot,x_dot: {}'.format(mse_state_mappings[1][0][1][1]))
    print('x_dot,x_dot,x,x: {}'.format(mse_state_mappings[1][1][0][0]))
    print('x_dot,x_dot,x,x_dot: {}'.format(mse_state_mappings[1][1][0][1]))
    print('x_dot,x_dot,x_dot,x: {}'.format(mse_state_mappings[1][1][1][0]))
    print('x_dot,x_dot,x_dot,x_dot: {}'.format(mse_state_mappings[1][1][1][1]))
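    # Illustrative addition, not part of the original example: once the loss
    # tables above are filled in, the lowest-loss entries suggest candidate
    # state and action mappings between the 3D target and the 2D source task.
    best_state_mapping = np.unravel_index(np.argmin(mse_state_mappings),
                                          mse_state_mappings.shape)
    print('lowest-loss state mapping (0 = x, 1 = x_dot): {}'.format(
        best_state_mapping))
    for a_mc3d in range(mc3d_actions):
        best_a_mc2d = int(np.argmin(mse_action_mappings_result[a_mc3d]))
        print('3D action {} -> 2D action {}'.format(a_mc3d, best_a_mc2d))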
Example 9
from lib.env.threedmountain_car import ThreeDMountainCarEnv
from lib.env.cartpole import CartPoleEnv
from lib.env.threedcartpole import ThreeDCartPoleEnv
from lib.env.mountain_car import MountainCarEnv
from lib.env.acrobot import AcrobotEnv
from lib.env.atari_env import AtariEnv

ENVS_DICTIONARY = {
    '3DMountainCar': ThreeDMountainCarEnv,
    '2DMountainCar': MountainCarEnv,
    '2DCartPole': CartPoleEnv,
    '3DCartPole': ThreeDCartPoleEnv
}
#
ENVS_PATH_DICTIONARY = {
    '3DMountainCar': {
        'env': ThreeDMountainCarEnv(),
        'instances_path': '../data/3d_mountain_car/'
    },
    '2DMountainCar': {
        'env': MountainCarEnv(),
        'instances_path': '../data/2d_mountain_car/'
    },
    '2DCartPole': {
        'env': CartPoleEnv(),
        'instances_path': '../data/2d_cart_pole/'
    },
    '3DCartPole': {
        'env': ThreeDCartPoleEnv(),
        'instances_path': '../data/3d_cart_pole/'
    },
    'Acrobot': {
Example 10
def get_train_test_data(source_qlearn=True,
                        source_env=MountainCarEnv(),
                        target_env=ThreeDMountainCarEnv()):

    # source task
    # source_qlearn=True: collect source data with a learned policy;
    # source_qlearn=False: collect it with random actions.
    if source_qlearn:
        source_filename = './' + source_env.name + '_dsource_qlearn.npz'
        if os.path.isfile(source_filename):
            f_read = np.load(source_filename)
            dsource = f_read['dsource']

        else:
            model = deepq.models.mlp([64], layer_norm=True)
            act = deepq.learn(source_env,
                              q_func=model,
                              lr=1e-3,
                              max_timesteps=40000,
                              buffer_size=50000,
                              exploration_fraction=0.1,
                              exploration_final_eps=0.1,
                              print_freq=1,
                              param_noise=False)

            replay_memory = []  # reset
            for ep in range(100):  # 100 episodes
                obs, done = source_env.reset(), False
                while not done:
                    n_obs, rew, done, _ = source_env.step(act(obs[None])[0])
                    replay_memory.append(
                        [obs, act(obs[None])[0], n_obs, rew, done])
                    obs = n_obs

            dsource = np.array(replay_memory)
            np.savez(source_filename, dsource=dsource)
            # with open('./data/q_learning.pkl', 'wb') as file:
            #     pickle.dump(qlearning_2d, file)
    else:
        source_filename = './' + source_env.name + '_dsource_random.npz'
        if os.path.isfile(source_filename):
            f_read = np.load(source_filename)
            dsource = f_read['dsource']
        else:
            qlearning_2d = lib.RandomAction.RandomAction(source_env)
            dsource = np.array(qlearning_2d.play())
            np.savez(source_filename, dsource=dsource)

    # target task
    target_filename = './' + target_env.name + '_dtarget_random.npz'
    if os.path.isfile(target_filename):
        f_read = np.load(target_filename)
        # print(f_read['dtarget'].shape)
        dtarget = f_read['dtarget']
    else:
        random_action_3d = lib.RandomAction.RandomAction(target_env)
        dtarget = np.array(random_action_3d.play())
        np.savez(target_filename, dtarget=dtarget)

    # Define the input function for training
    dsa = np.array([np.append(x[0], x[1])
                    for x in dtarget])  # dsa = d states actions
    dns = np.array([x[2] for x in dtarget])  # dns = d next states

    dsa_train = dsa[:-100]
    dns_train = dns[:-100]
    dsa_test = dsa[-100:]
    dns_test = dns[-100:]

    return dsa_train, dns_train, dsa_test, dns_test, dsource, dtarget
Example 11
def train_model(num_steps=10000,
                batch_size=100,
                display_step=100,
                source_env=MountainCarEnv(),
                target_env=ThreeDMountainCarEnv()):
    loss_op, train_op, X, Y = one_step_transition_model(
        num_input=target_env.observation_space.shape[0] + 1,
        num_output=target_env.observation_space.shape[0])
    dsa_train, dns_train, dsa_test, dns_test, dsource, dtarget = get_train_test_data(
        source_qlearn=False, source_env=source_env, target_env=target_env)

    batch_num = np.size(dsa_train, 0)

    init = tf.global_variables_initializer()
    loss = []

    saver = tf.train.Saver()

    # Start training
    with tf.Session() as sess:
        # Run the initializer
        sess.run(init)

        for step in range(num_steps):
            # Cycle through the training data one minibatch at a time.
            start = (step * batch_size) % batch_num
            batch_x = dsa_train[start:start + batch_size, :]
            batch_y = dns_train[start:start + batch_size, :]

            # Run optimization op (backprop)
            loss_train, _ = sess.run([loss_op, train_op],
                                     feed_dict={
                                         X: batch_x,
                                         Y: batch_y
                                     })
            if step % display_step == 0:
                print("Step " + str(step) + ", Minibatch Loss= " +
                      "{:.4f}".format(loss_train))
                loss.append(loss_train)

        print("Optimization Finished!")

        # test set
        loss_test = sess.run(loss_op, feed_dict={X: dsa_test, Y: dns_test})
        print("test loss is {}".format(loss_test))

        save_path = saver.save(sess, "./data/tmp/model.ckpt")
        print("Model saved in file: %s" % save_path)

        # Find the mapping between source and target
        source_states = source_env.observation_space.shape[0]  # 2
        target_states = target_env.observation_space.shape[0]  # 4
        source_actions = source_env.action_space.n  # 3
        target_actions = target_env.action_space.n  # 5

        mse_state_mappings = np.zeros(
            (source_states, ) * target_states)  # 2 by 2 by 2 by 2
        mse_action_mappings = np.ndarray(
            shape=(target_actions, source_actions,
                   pow(source_states, target_states)))  # 5 by 3 by 16
        mse_action_mappings.fill(-1)

        state_count = 0

        for target_states_list in itertools.product(range(source_states),
                                                    repeat=target_states):
            state_losses = []
            for t_action in range(target_actions):
                for s_action in range(source_actions):
                    states = np.array(
                        [x[0] for x in dsource if x[1] == s_action])
                    actions = np.array(
                        [x[1] for x in dsource if x[1] == s_action])
                    n_states = np.array(
                        [x[2] for x in dsource if x[1] == s_action])

                    if states.size == 0 or actions.size == 0 or n_states.size == 0:
                        # dsource never takes this source action, so the mapping
                        # cannot be evaluated here; generate better samples,
                        # e.g. with a higher exploration epsilon.
                        print('skipping: dsource has no samples for '
                              'source action {}'.format(s_action))
                        # mse_action_mappings[t_action, s_action, state_count] = 0
                        continue

                    # transform to dsource_trans
                    actions_trans = np.ndarray(shape=(actions.size, ))
                    actions_trans.fill(t_action)
                    input_trans = np.concatenate(
                        (states[:, target_states_list], actions_trans[:,
                                                                      None]),
                        axis=1)
                    n_states_trans = np.squeeze(
                        np.array([n_states[:, target_states_list]]))

                    # calculate mapping error
                    loss_mapping = sess.run(loss_op,
                                            feed_dict={
                                                X: input_trans,
                                                Y: n_states_trans
                                            })
                    # print('loss_mapping is {}'.format(loss_mapping))

                    state_losses.append(loss_mapping)
                    # import pdb; pdb.set_trace()
                    mse_action_mappings[t_action, s_action,
                                        state_count] = loss_mapping

            # import pdb; pdb.set_trace()
            mse_state_mappings[target_states_list] = np.mean(state_losses)
            state_count += 1

        ## mse_action_mappings_result = [[np.mean(mse_action_mappings[t_action, s_action, :]) for s_action in range(source_actions)] for t_action in range(target_actions)]

        mse_action_mappings_result = np.zeros((target_actions, source_actions))
        for t_action in range(target_actions):
            for s_action in range(source_actions):
                losses_act = []
                for s in range(pow(source_states, target_states)):
                    if mse_action_mappings[t_action, s_action, s] != -1:
                        # print(mse_action_mappings[t_action, s_action, s])
                        losses_act.append(mse_action_mappings[t_action,
                                                              s_action, s])
                mse_action_mappings_result[t_action,
                                           s_action] = np.mean(losses_act)

        print('action mapping: {}'.format(mse_action_mappings_result))
        print('state mapping {}'.format(mse_state_mappings))

        count = 0
        for target_states_list in itertools.product(range(source_states),
                                                    repeat=target_states):
            print(str(count) + ': ')
            print(mse_state_mappings[target_states_list])
            count += 1

        with open('./data/mse_state_mappings_3d_2d.pkl', 'wb') as file:
            pickle.dump(mse_state_mappings, file)

        with open('./data/mse_action_mappings_3d_2d.pkl', 'wb') as file:
            pickle.dump(mse_action_mappings, file)

        print("Done exporting MSE file")
Example 12

if __name__ == '__main__':
    # train_model(num_steps=10000, batch_size=100, display_step=100, source_env=MountainCarEnv(),
    #             target_env=ThreeDMountainCarEnv())
    train_model(num_steps=10000,
                batch_size=100,
                display_step=100,
                source_env=ThreeDMountainCarEnv(),
                target_env=MountainCarEnv())