from gym.spaces import Discrete, Box

from ourgym import RelativeDiscreteActionMap
from ourgym.RobotArmInvPendulum import SingleMotorActionMap
from simulation.robot_arm_simulation import RobotArmEnvironment
from rl import DQNAgent, ACAgent
from time import sleep, time
import numpy as np

number_of_episodes = 10000
max_iterations_per_episode = 500

if __name__ == '__main__':

    agent = DQNAgent(6, 9, 10000, 1.0, 0.05, 9000, 0.99, 0.00001, 2, (10, 10),
                     1000)
    # agent.epsilon = 0.05
    # agent.load('backup/weights_1515613961.468759')

    with RobotArmEnvironment(sim_ticks_per_step=15) as env:
        # FOR ACCELERATION CONTROL
        # env.action_space = Discrete(9)
        # env.action_map = RelativeDiscreteActionMap(9, -100, 101, 100)
        # env.observation_space = Box(np.array([0, -1, 0, -1, 0, -1]), np.array([1, 1, 1, 1, 1, 1]))

        # FOR SINGLE MOTOR CONTROL
        env.action_space = Discrete(9)
        env.action_map = SingleMotorActionMap(9, 45, 135)
        env.observation_space = Box(np.array([0, -1, 0, -1, 0, -1]),
                                    np.array([1, 1, 1, 1, 1, 1]))
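The excerpt above stops once the action map and observation space are configured. For context, here is a minimal sketch of the episode loop that would typically sit inside the with-block, assuming the act/remember/replay interface used by the other DQNAgent snippets on this page; the batch size of 32 is an assumption.

# Hypothetical training loop (would live inside the with-block above).
# agent.act / agent.remember / agent.replay follow the later examples;
# the batch size is an assumption.
for episode_idx in range(number_of_episodes):
    state = env.reset()

    for step_idx in range(max_iterations_per_episode):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            break

    agent.replay(32)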

Example 2

if __name__ == '__main__':

    env = gym.make('Pendulum-v0')
    observation = env.reset()

    print(observation, type(observation), observation.shape)
    print(env.action_space)
    print(env.action_space.sample())

    dim_action = 40
    dim_state = 3

    am = ActionMap(dim_action)
    agent = DQNAgent(dim_state, dim_action, am)

    high = np.array([1., 1., 8.])
    low = -high

    #print("mean 100 episode reward before learning: {}".format(calculate_mean_reward(agent, env)))

    episodes = 1000
    for i in range(episodes):
        print(i)
        observation = env.reset()

        while True:
            env.render(mode="human")
            action = agent.act(observation)
            new_observation, reward, done, info = env.step(action)
            # carry the new observation forward and stop when the episode ends
            observation = new_observation

            if done:
                break
Example 3

def run_experiments(index):
    # changes reward and done function
    task_index = index
    random_run = True

    # common parameters
    num_episodes = 5000
    num_steps = 200
    memory_size = 10000
    batch_size = 64
    e_start = 1.0
    e_finish = 0.05
    e_decay_steps = 4500
    dr = 0.995
    lr = 0.0001
    layers = 2
    nodes = (20, 20)
    frequency_updates = 0

    # if index % 2 == 0:
    #     task_index = 1
    #     if index >= 4:
    #         num_episodes = 20000
    #         e_decay_steps = 18000
    # else:
    #     task_index = 2
    #     if index >= 4:
    #         num_episodes = 20000
    #         e_decay_steps = 18000

    while True:
        # create directory if it does not exist
        directory_path = "../experiments_{}_{}/{}_{}/".format(
            task_index, "random" if random_run else "",
            datetime.now().strftime("%d-%m-%Y_%H-%M-%S"), uuid.uuid4())
        if not os.path.exists(os.path.dirname(directory_path)):
            try:
                os.makedirs(os.path.dirname(directory_path))
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise

        try:
            nr_actions_per_motor = 9
            lower_bound = 45
            upper_bound = 135
            simulation_init_state = (0, 0, np.pi, 0, np.pi, 0)
            reset_with_noise = False
            if task_index == 1:
                nr_actions_per_motor = 9
                lower_bound = 70
                upper_bound = 110
                simulation_init_state = (np.pi, 0, np.pi, 0, np.pi, 0)
                reset_with_noise = True
            elif task_index == 2:
                nr_actions_per_motor = 5
                lower_bound = 45
                upper_bound = 135
                simulation_init_state = (0, 0, np.pi, 0, np.pi, 0)
                reset_with_noise = False

            env = RobotArmEnvironment(
                reward_function_index=task_index,
                done_function_index=task_index,
                simulation_init_state=simulation_init_state,
                reset_with_noise=reset_with_noise,
                sim_ticks_per_step=6)
            env.action_space = Discrete(nr_actions_per_motor**2)
            env.action_map = AbsoluteDiscreteActionMap(lower_bound,
                                                       upper_bound,
                                                       nr_actions_per_motor)

            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.n

            agent = DQNAgent(env, state_dim, action_dim, memory_size, e_start,
                             e_finish, e_decay_steps, dr, lr, layers, nodes,
                             frequency_updates)

            run(env, agent, num_episodes, num_steps, batch_size,
                directory_path, random_run)
        except KeyboardInterrupt as e:
            # for f in os.listdir(os.path.dirname(directory_path)):
            #     if re.search(file_path, f):
            #         os.remove(os.path.join(directory_path, f))
            break
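run_experiments loops until it is interrupted, and run() below logs os.getpid(), which suggests one experiment per worker process. A hedged sketch of such a launcher, using Python's standard multiprocessing module; the worker count and index range are assumptions.

# Hypothetical launcher: one worker process per experiment index.
# The number of workers (4) is an assumption.
from multiprocessing import Process

if __name__ == '__main__':
    workers = [Process(target=run_experiments, args=(i,)) for i in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()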
Example 4

def run(env: RobotArmEnvironment,
        agent: DQNAgent,
        num_episodes: int,
        max_num_steps: int,
        batch_size: int,
        directory_path: str,
        random_run: bool = False):

    reward_history_file_name = directory_path + "reward.csv"
    action_history_file_name = directory_path + "action.csv"
    max_q_history_file_name = directory_path + "max-q.csv"
    state_history_file_name = directory_path + "state.csv"

    # Parse these files with:
    # with open(file_name, "r") as f:
    #     reader = csv.reader(f, delimiter=" ")
    #     for row in reader:
    #         for col in row:
    #             col = ast.literal_eval(col) # (nan values have to be checked for)

    previous = time.time()
    for episode_idx in range(num_episodes):
        state = env.reset()

        for step_idx in range(max_num_steps):
            with open(state_history_file_name, "a") as f:
                f.write(
                    ("(" + ("{}," * 6) + ") ").format(*env.simulation.state))

            if episode_idx % 100 == 0:
                env.render()
                time.sleep(1 / 10)

            # take an action
            if random_run:
                action = env.action_space.sample()
            else:
                max_q, action, prediction = agent.act(state)

            if not random_run:
                with open(max_q_history_file_name, "a") as f:
                    f.write("{} ".format(max_q))

            with open(action_history_file_name, "a") as f:
                f.write("({},{}) ".format(
                    env.action_map.get(int(action))[0],
                    env.action_map.get(int(action))[1]))

            # observe effect of action and remember
            new_state, reward, done, info = env.step(action)

            if not random_run:
                agent.remember(state, action, reward, new_state, done)

            with open(reward_history_file_name, "a") as f:
                f.write("{} ".format(float(reward)))

            # store new state
            state = new_state

            if done:
                break

        if not random_run:
            agent.replay(batch_size)

        # new line in all data files
        with open(action_history_file_name, "a") as f:
            f.write("\n")
        with open(reward_history_file_name, "a") as f:
            f.write("\n")
        if not random_run:
            with open(max_q_history_file_name, "a") as f:
                f.write("\n")
        with open(state_history_file_name, "a") as f:
            f.write(("(" + ("{}," * 6) + ") \n").format(*env.simulation.state))

        if not random_run and episode_idx % 50 == 0:
            agent.save(directory_path + "weights-ep-{}".format(episode_idx))

        current = time.time()
        print("{}: episode {:3}/{:3} completed in {:4}s".format(
            os.getpid(), episode_idx, num_episodes, current - previous))
        previous = current
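The comment inside run() sketches how the space-separated history files can be read back. Expanded here into a self-contained helper; load_history is a hypothetical name, and the nan check is the one the comment mentions.

# Read back one of the history files written by run(): each row is an
# episode, each column a step. Entries are Python literals, so
# ast.literal_eval recovers tuples and floats; "nan" entries and the
# empty column left by the trailing space are skipped.
import ast
import csv

def load_history(file_name):
    episodes = []
    with open(file_name, "r") as f:
        reader = csv.reader(f, delimiter=" ")
        for row in reader:
            episodes.append([ast.literal_eval(col)
                             for col in row
                             if col and col != "nan"])
    return episodes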
Example 5

# ensure files are downloaded
if not os.path.isfile(CVAE_DATA_PATH):
    download_blob(CVAE_DATA_BLOB_NAME, CVAE_DATA_PATH)
if not os.path.isfile(CVAE_MODEL_PATH):
    download_blob(CVAE_MODEL_BLOB_NAME, CVAE_MODEL_PATH)
if not os.path.isfile(FULL_RL_MODEL_PATH):
    download_blob(FULL_RL_MODEL_BLOB_NAME, FULL_RL_MODEL_PATH)

# load files
with open(CVAE_DATA_PATH, 'rb') as f:
    CVAE_DATA = pickle.load(f)
CVAE_MODEL = CVAE(data_dim=EMBEDDING_DIM * 2,
                  label_dim=9,
                  model_path=CVAE_MODEL_PATH)
FULL_RL_MODEL = DQNAgent(action_size=3, load_model=True, no_op_steps=0)


def simple_sample(n_real, n_fake):
    '''
    Generates a mixed dataset of simulated and real embedded samples.
    Samples are "embedded" because we've used transfer learning.
    Sampling is "simple" because the GAN is not fit with each sample.
    '''
    ## sample real data
    real_data = []
    if n_real > 0:
        real_data = __sample_real_data(n_real)
    ## sample fake data
    fake_data = []
    if n_fake > 0:

Example 6

class ActionMap:
    # trivial identity action map
    def get(self, index):
        return index

    def getIndex(self, action):
        return action


if __name__ == '__main__':

    env = gym.make('BipedalWalker-v2')
    observation = env.reset()

    print(observation, type(observation), observation.shape)
    print(env.action_space)
    print(env.action_space.sample())

    agent = DQNAgent(24, 4, ActionMap())

    while True:
        env.render(mode="human", close=False)

        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)

        print(action, reward, done)

        if done:
            break

        time.sleep(1 / 60)
Example 7

    e_start = 1
    e_finish = 0.05
    e_decay = 400
    dr = 0.99
    lr = 0.00001
    layers = 2
    nodes = 20
    frequency_updates = 0

    agent = DQNAgent(
        env,
        state_dim,
        action_dim,
        memory_size,
        e_start,
        e_finish,
        e_decay,
        dr,
        lr,
        layers,
        (nodes, nodes),
        frequency_updates,
    )

    for episode in range(num_episodes):
        state = env.reset()
        tr = 0

        for step in range(num_steps):
            action = agent.act(state)[1]

            print(step, flush=True, end=" ")
Example 8

"""
A simple example to run the DQN algorithm on a toy example.
"""
import gym
import tensorflow as tf
from rl import DQNAgent
from keras.layers import Dense, Input
from keras.models import Model

env_name = 'CartPole-v0'
num_actions = 2


def make_model():
    i = Input((4, ))
    x = i
    x = Dense(128, activation='relu')(x)
    policy = Dense(num_actions, activation='softmax')(x)
    value = Dense(1, activation='linear')(x)
    return Model([i], [value])


with tf.Session() as sess, tf.device('/cpu:0'):
    agent = DQNAgent(make_model)
    agent.compile(sess)
    agent.train(sess, lambda: gym.make(env_name))
Example 9

def run_experiments(reward_index):
    if reward_index < 0 or reward_index > 1:
        raise ValueError("reward_index must be 0 or 1")

    num_episodes = number_of_episodes
    num_steps = max_iterations_per_episode
    batchsize = 32
    state_size = 6
    action_size = 81
    memory_size = 100000
    epsilon_start = 1
    epsilon_min = 0.1
    epsilon_decay_per_step = 10000
    lr = 0.00001
    dr = 0.99
    amount_layers = 2
    amount_nodes_layer = 40
    frequency_updates = 1000

    parameters = {}
    parameters['num_episodes'] = num_episodes
    parameters['num_steps'] = num_steps
    parameters['batchsize'] = batchsize
    parameters['state_size'] = state_size
    parameters['action_size'] = action_size
    parameters['memory_size'] = memory_size
    parameters['epsilon_start'] = epsilon_start
    parameters['epsilon_min'] = epsilon_min
    parameters['epsilon_decay_episodes_required'] = epsilon_decay_per_step
    parameters['learning_rate'] = lr
    parameters['discount_rate'] = dr
    parameters['amount_layers'] = amount_layers
    parameters['amount_nodes_layer'] = amount_nodes_layer
    parameters['frequency_update_target_model'] = frequency_updates

    agent = DQNAgent(state_size, action_size, memory_size, epsilon_start,
                     epsilon_min, epsilon_decay_per_step, dr, lr,
                     amount_layers,
                     (amount_nodes_layer, amount_nodes_layer),
                     frequency_updates)

    with RobotArmEnvironment(reward_function_index=reward_index,
                             reward_function_params=(1 / 6 * np.pi, 2 * np.pi,
                                                     1, 10, 0.05, 0.1, 2,
                                                     0.001, 1)) as env:

        ah = list()
        rh = list()

        for episode_idx in range(number_of_episodes):
            state = env.reset()
            tr = 0
            ct = time.time()

            ah.append(list())
            rh.append(list())

            for i in range(max_iterations_per_episode):
                action = agent.act(state)
                ah[episode_idx].append(env.action_map.get(int(action)))

                next_state, reward, done, _ = env.step(action)
                rh[episode_idx].append(float(reward))

                agent.remember(state, action, reward, next_state, done)

                state = next_state
                tr += reward

                if done:
                    break

                agent.replay(batchsize)

            print(
                "episode {}/{}, total reward {}, epsilon {}, time taken {}s".
                format(episode_idx + 1, number_of_episodes, tr,
                       agent.get_epsilon(),
                       time.time() - ct))

            agent._update_epsilon()

            if episode_idx % 100 == 0 and episode_idx != 0:
                agent.safe()
                save_info(episode_idx, reward_index, parameters,
                          env.action_map.to_json_object(), rh, ah,
                          env.to_json_object())
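save_info itself is not part of this excerpt. Below is a hypothetical sketch of what such a helper could look like, assuming it only serializes the run to a JSON file; the file name and payload layout are assumptions.

import json
import time

def save_info(episode_idx, reward_index, parameters, action_map_json,
              reward_history, action_history, env_json):
    # Hypothetical helper: bundle everything describing the run so far
    # and dump it to a timestamped JSON file.
    payload = {
        "episode": episode_idx,
        "reward_function_index": reward_index,
        "parameters": parameters,
        "action_map": action_map_json,
        "reward_history": reward_history,
        "action_history": action_history,
        "environment": env_json,
    }
    with open("run_info_{}.json".format(int(time.time())), "w") as f:
        json.dump(payload, f)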