Example #1
def question_4():
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 50
    max_eps_steps = 1000000

    steps = np.zeros([num_runs, num_episodes])
    rewards = []
    for r in range(num_runs):
        print("run number : ", r + 1)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
        reward = rlglue.total_reward()
        rewards.append(reward)
    mean = sum(rewards) / len(rewards)
    stder = statistics.stdev(rewards) / math.sqrt(len(rewards))  # standard error of the mean
    print("mean:", mean)
    print("standard error:", stder)
    np.save('bonus_steps', steps)
    np.save("mean", mean)
    np.save("stder", stder)
Example #2
def better_run_experiment(num_runs, num_episodes):
    # Same experiment loop as the previous function, with the alternate hyper-parameters below
    all_steps = []
    agent = ag.ExpectedSarsaAgent
    env = enviro.MountainEnvironment

    total_reward_per_run = []
    for run in range(num_runs):
        start = time.time()
        if run % 5 == 0:
            print("RUN: {}".format(run))

        initial_weights = np.random.uniform(-0.001, 0)  # a single random initial weight in [-0.001, 0)
        agent_info = {
            "num_tilings": 32,
            "num_tiles": 4,
            "iht_size": 4096,
            "epsilon": 0.1,
            "gamma": 1,
            "alpha": 0.7 / 32,
            "initial_weights": initial_weights,
            "num_actions": 3
        }
        env_info = {
            "min_position": -1.2,
            "max_position": 0.5,
            "min_velocity": -0.07,
            "max_velocity": 0.07,
            "gravity": 0.0025,
            "action_discount": 0.001
        }
        rl_glue = RLGlue(env, agent)
        rl_glue.rl_init(agent_info, env_info)
        steps_per_episode = []

        for episode in range(num_episodes):
            # 15000 is max steps of the episode
            rl_glue.rl_episode(15000)
            steps_per_episode.append(rl_glue.num_steps)
        total_reward = np.sum(steps_per_episode) * -1
        all_steps.append(np.array(steps_per_episode))
        print("Run time: {}".format(time.time() - start))
        total_reward_per_run.append(total_reward)

    data = np.mean(total_reward_per_run)
    data_std_err = np.std(total_reward_per_run, axis=0) / np.sqrt(num_runs)
    plt.title("Expected Sarsa MountainCar (Alternate Parameters)",
              fontdict={
                  'fontsize': 16,
                  'fontweight': 25
              },
              pad=15.0)
    plt.xlabel("Epsiode", labelpad=5.0)
    plt.ylabel("Steps per Episode (averaged over " + str(num_runs) + " runs)",
               labelpad=10.0)
    plt.plot(np.mean(np.array(all_steps), axis=0))
    plt.show()
    np.save("ExpectedSarsa_test", np.array(all_steps))
    print("mean: ", data)
    print("standard error: ", data_std_err)
Example #3
def run_experiment(env_info,
                   agent_info,
                   num_episodes=5000,
                   experiment_name=None,
                   plot_freq=100,
                   true_values_file=None,
                   value_error_threshold=1e-8):
    env = CliffWalkEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)

    rl_glue.rl_init(agent_info, env_info)

    manager = Manager(env_info,
                      agent_info,
                      true_values_file=true_values_file,
                      experiment_name=experiment_name)
    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no step limit
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            manager.visualize(values, episode)

    values = rl_glue.agent.agent_message("get_values")
    if true_values_file is not None:
        # Grading: The Manager will check that the values computed using your TD agent match
        # the true values (within some small allowance) across the states. In addition, it also
        # checks whether the root mean squared value error is close to 0.
        manager.run_tests(values, value_error_threshold)

    return values
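Note: the grading comment above refers to the Manager class from the course scaffolding, which is not shown. A minimal sketch of the kind of check it describes, assuming values and true_values are 1-D arrays of state values; the per-state allowance used here is a hypothetical 1e-3:

import numpy as np

def check_values(values, true_values, value_error_threshold=1e-8):
    values = np.asarray(values)
    true_values = np.asarray(true_values)
    # per-state agreement within a small allowance (assumed tolerance)
    assert np.allclose(values, true_values, atol=1e-3), "state values differ from the true values"
    # root mean squared value error should be close to zero
    rmsve = np.sqrt(np.mean((values - true_values) ** 2))
    assert rmsve < value_error_threshold, "RMSVE is not close to 0"
    return rmsve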
Example #4
def run_experiment(env_info,
                   agent_info,
                   num_episodes=5000,
                   experiment_name=None,
                   plot_freq=100,
                   true_values_file=None,
                   value_error_threshold=1e-8):
    env = CliffWalkEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)

    rl_glue.rl_init(agent_info, env_info)

    manager = Manager(env_info,
                      agent_info,
                      true_values_file=true_values_file,
                      experiment_name=experiment_name)
    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no step limit
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            manager.visualize(values, episode)

    values = rl_glue.agent.agent_message("get_values")
    return values
Example #5
def question_3():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 1000000
    for _ in range(num_runs):
        rlglue.rl_init()
        i = 0
        for i in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            print(i)
    fout = open('value', 'w')
    steps = 50
    w, iht = rlglue.rl_agent_message("ValueFunction")
    Q = np.zeros([steps, steps])
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(3):
                value = 0
                # accumulate the negated weights of the active tiles for action a
                for index in tiles(iht, 8,
                                   [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                                    8 * (-0.07 + (j * 0.14 / steps)) / 0.14],
                                   [a]):
                    value -= w[index]
                values.append(value)
            height = max(values)
            fout.write(repr(height) + ' ')
            Q[j][i] = height
        fout.write('\n')
    fout.close()
    np.save("value", Q)
Example #6
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    
    rl_glue = RLGlue(environment, agent)
        
    # save sum of reward at the end of each episode
    agent_sum_reward = np.zeros((experiment_parameters["num_runs"], 
                                 experiment_parameters["num_episodes"]))

    env_info = {}

    agent_info = agent_parameters

    # one agent setting
    for run in range(1, experiment_parameters["num_runs"]+1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        env_info["seed"] = run

        rl_glue.rl_init(agent_info, env_info)
        
        for episode in tqdm(range(1, experiment_parameters["num_episodes"]+1)):
            # run episode
            rl_glue.rl_episode(experiment_parameters["timeout"])
            
            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward
    save_name = "{}".format(rl_glue.agent.name)
    if not os.path.exists('results'):
        os.makedirs('results')
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')
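Note: the array saved under results/ can be reloaded later to draw a learning curve. A minimal sketch; the file name below is hypothetical, since the real one depends on rl_glue.agent.name:

import numpy as np
import matplotlib.pyplot as plt

# hypothetical file name; the actual name depends on rl_glue.agent.name
sum_reward = np.load("results/sum_reward_expected_sarsa_agent.npy")  # (num_runs, num_episodes)

plt.plot(sum_reward.mean(axis=0))
plt.xlabel("Episode")
plt.ylabel("Sum of reward (averaged over runs)")
plt.show()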
Example #7
def foreground(servos, time_steps, episodes, plotting_data):
    my_agent = DanceBot(plotting_data)
    my_env = ServoEnvironment(servos)
    my_rl_glue = RLGlue(my_env, my_agent)

    print("\nRunning {} episodes with {} time-steps each.".format(episodes,
                                                                  time_steps))
    for _ in range(episodes):
        my_rl_glue.rl_episode(time_steps)
Example #8
def foreground(servos, time_steps, episodes, plotting_data):
    my_agent = DanceBot(plotting_data)
    my_env = ServoEnvironment(servos)
    my_rl_glue = RLGlue(my_env, my_agent)

    print("\nRunning {} episodes with {} time-steps each.".format(
        episodes, time_steps))
    for _ in range(episodes):
        my_rl_glue.rl_episode(time_steps)
Example #9
def question_1():
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 50
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            #print("Episode number: "+str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            #print("Number of steps: "+str(steps))
            # print(steps[r, e])
    np.save('steps', steps)
    plotGraph()
    
    del agent, environment, rlglue
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            print("Episode number: "+str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            #print("Number of steps: "+str(steps))
            # print(steps[r, e])
    #np.save('steps', steps)
    #plotGraph()
    rlglue.rl_agent_message("plot3DGraph")
Example #10
def question_3():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    max_eps_steps = 100000
    num_episodes = 1000
    num_runs = 1
    numActions = 3

    rlglue.rl_init()
    for e in range(num_episodes):
        rlglue.rl_episode(max_eps_steps)

    weights = rlglue.rl_agent_message("3D plot of the cast-to-go")

    fout = open('value', 'w')
    steps = 50
    z = np.zeros((50, 50))
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(numActions):
                tile = [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                        8 * (-0.07 + (j * 0.14 / steps)) / 0.14]
                inds = agent.get_index(tile, a)
                values.append(np.sum([weights[idx] for idx in inds]))
            height = max(values)
            z[j][i] = -height
            fout.write(repr(-height) + ' ')
        fout.write('\n')
    fout.close()

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    x = np.arange(-1.2, 0.5, 1.7 / 50)
    y = np.arange(-0.07, 0.07, 0.14 / 50)
    x, y = np.meshgrid(x, y)
    ax.set_xticks([-1.2, 0.5])
    ax.set_yticks([0.07, -0.07])
    ax.set_ylabel('Velocity')
    ax.set_xlabel('Position')
    ax.set_zlabel('Cost-To-Go')
    ax.plot_surface(x, y, z)
    plt.savefig('cost-to-go.png')
    plt.show()
    # note: at this point 'steps' holds the grid resolution (50), not per-episode step counts
    np.save('steps', steps)
Example #11
def tiling(real_value):
    environment = RandomWalkEnvironment()
    agent = Agent2()
    rl = RLGlue(environment, agent)
    error = np.zeros(5000)
    for run in range(30):
        rl.rl_init()
        for episode in range(2000):
            rl.rl_episode(10000)
            estimate = rl.rl_agent_message("ValueFunction")
            error[episode] += np.sqrt(
                np.mean(np.power(real_value - estimate, 2)))

        rl.rl_cleanup()
    file2 = open("tiling_output.txt", "w")
    for i in range(2000):
        # average the accumulated RMS value error over the 30 runs, one value per line
        file2.write(str(error[i] / 30) + '\n')
    file2.close()
Example #12
def question_1(num_episodes):
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    max_eps_steps = 100000

    steps = np.zeros(num_episodes)

    rlglue.rl_init()
    for e in tqdm(range(num_episodes)):
        rlglue.rl_episode(max_eps_steps)
        steps[e] = rlglue.num_ep_steps()
        # print(steps[e])

    return steps
Example #13
def question_1():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 5
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print(steps[r, e])
    np.save('steps', steps)
Example #14
File: main.py, Project: healqq/lumia
def run_experiment(env_info, agent_info, 
                   num_episodes=5000,
                   value_error_threshold=1e-8,
                   plot_freq=10):
    env = GridEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)

    rl_glue.rl_init(agent_info, env_info)
    steps = []
    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0) # no step limit
        steps.append(rl_glue.agent.agent_message("get_steps"))
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            print(rl_glue.environment.env_message("get_grid_state"))
        rl_glue.rl_cleanup()
    values = rl_glue.agent.agent_message("get_values")
    
    return [values, steps]
Example #15
def question_2():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    max_eps_steps = 100000
    num_episodes = 1000

    rlglue.rl_init()
    for _ in tqdm(range(num_episodes)):
        rlglue.rl_episode(max_eps_steps)

    q3_plot = rlglue.rl_agent_message("plot")

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) no longer works in recent Matplotlib
    X, Y = np.meshgrid(q3_plot[0], q3_plot[1])
    surf = ax.plot_surface(X, Y, q3_plot[2])
    ax.set_xlim(q3_plot[0][0], q3_plot[0][-1])
    ax.set_ylim(q3_plot[1][0], q3_plot[1][-1])
    plt.show()
Example #16
def question_3():

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])
    # only 1 run
    for r in range(num_runs):
        print("1000 episode run : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
        # get the list of value functions [X,Y,Z] represents position, velocity, state-value
        Return = rlglue.rl_agent_message(1)
    return Return
Example #17
def question_1():
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    num_action = 3

    for r in range(num_runs):
        print("run number : ", r)
        #np.random.seed(r)
        rlglue.rl_init()
        for _ in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
        weight = rlglue.rl_agent_message('get weight')

    # algorithm from assignment
    #fout = open('value','w')
    steps = 50
    neg_q_hat = np.zeros((steps, steps))
    for i in range(steps):
        for j in range(steps):
            values = []
            position = -1.2 + (i * 1.7 / steps)
            velocity = -0.07 + (j * 0.14 / steps)
            for a in range(num_action):
                tile_idx = agent.plot_get_feature(position, velocity, a)
                q_hat = np.sum(weight[tile_idx])
                values.append(q_hat)
            height = np.max(values)
            neg_q_hat[j][i] = -height
            #fout.write(repr(-height)+' ')
        #fout.write('\n')
    #fout.close()
    np.save('neg_q_hat', neg_q_hat)
Example #18
def part3():
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print(steps[r, e])
    np.save('steps', steps)
    fout = open('value', 'w')
    steps = 50
    num_of_actions = 3
    heights = np.zeros((steps, steps))
    for i in range(steps):
        for j in range(steps):
            q = []
            pos = -1.2 + (i * 1.7 / steps)
            vel = -0.07 + (j * 0.14 / steps)
            tile = (pos, vel)
            for a in range(num_of_actions):
                # assumes the Agent instance exposes F(tile, action) and a weights vector
                inds = agent.F(tile, a)
                q.append(np.sum(agent.weights[inds]))
            height = max(q)
            heights[j][i] = -height
            fout.write(repr(-height) + ' ')
        fout.write('\n')
    fout.close()
    np.save('heights', heights)
    # note: 'steps' here is the grid resolution (50), overwriting the per-episode counts saved above
    np.save('steps', steps)
Example #19
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    rl_glue = RLGlue(environment, agent)
        
    agent_sum_reward = np.zeros((experiment_parameters["num_runs"], 
                                 experiment_parameters["num_episodes"]))

    env_info = {}
    agent_info = agent_parameters

    for run in range(1, experiment_parameters["num_runs"]+1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        agent_info["network_pickle"] = "network500.pickle"
        env_info["seed"] = run
        env_info["render"] = True

        rl_glue.rl_init(agent_info, env_info)
        
        for episode in tqdm(range(1, experiment_parameters["num_episodes"]+1)):
            rl_glue.rl_episode(experiment_parameters["timeout"])
            
            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward
Example #20
def question_3():
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    for r in range(num_runs):
        start = time.time()
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
    end = time.time()
    print(str(end - start) + " seconds elapsed")
    action_vals, pos, vel = rlglue.rl_agent_message("return info")
    action_vals = np.multiply(action_vals, -1)
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) no longer works in recent Matplotlib
    ax.plot_surface(pos, vel, action_vals)
    plt.show()
Example #21
def question_1():
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 50
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        st = time.time()
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print(steps[r, e])
        finish = time.time() - st
        print(str(finish) + " seconds elapsed")
    np.save('steps', steps)
Example #22
from rl_glue import RLGlue
from mountain_car_env import MountainCarEnvironment
from sarsa_agent import SarsaAgent

num_runs = 10
num_episodes = 300
env_info = {"num_tiles": 8, "num_tilings": 8}
agent_info = {}
all_steps = []

agent = SarsaAgent
env = MountainCarEnvironment

for run in range(num_runs):
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)

    for episode in range(num_episodes + 1):
        rl_glue.rl_episode(15000)
        r = rl_glue.rl_agent_message("get_reward")
        print("episode:", episode, "reward:", r)
Example #23
        v_over_runs[episode] = []

    for run in range(num_runs):
        # set seed for reproducibility
        np.random.seed(run)
        # initialize RL-Glue
        rlglue.rl_init()
        total_steps = []
        if run == 0:
            rlglue._agent.set_num_of_actions(8)
            print("running sarsa agent with 8 actions...")
        else:
            rlglue._agent.set_num_of_actions(9)
            print("running sarsa agent with 9 actions...")

        # loop over episodes
        for episode in range(num_episodes):
            # run episode with the allocated steps budget
            rlglue.rl_episode(max_steps)
            total_steps.append(rlglue._environment.get_total_steps())

        # plot
        p1, = plt.plot(total_steps, range(1, num_episodes + 1))
        plt.ylabel('Episodes')
        plt.xlabel('Time Steps')
        if run == 0:
            plt.title('Performance of Sarsa Agent with 8 Actions')
        else:
            plt.title('Performance of Sarsa Agent with 9 Actions')
        plt.show()
Example #24
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):

    rl_glue = RLGlue(environment, agent)

    # Sweep Agent parameters
    for num_agg_states in agent_parameters["num_groups"]:
        for step_size in agent_parameters["step_size"]:

            # save rmsve at the end of each evaluation episode
            # size: num_episode / episode_eval_frequency + 1 (includes evaluation at the beginning of training)
            agent_rmsve = np.zeros(
                int(experiment_parameters["num_episodes"] /
                    experiment_parameters["episode_eval_frequency"]) + 1)

            # save learned state value at the end of each run
            agent_state_val = np.zeros(environment_parameters["num_states"])

            env_info = {
                "num_states":
                environment_parameters["num_states"],
                "start_state":
                environment_parameters["start_state"],
                "left_terminal_state":
                environment_parameters["left_terminal_state"],
                "right_terminal_state":
                environment_parameters["right_terminal_state"]
            }

            agent_info = {
                "num_states": environment_parameters["num_states"],
                "num_groups": num_agg_states,
                "step_size": step_size,
                "discount_factor": environment_parameters["discount_factor"]
            }

            print('Setting - num. agg. states: {}, step_size: {}'.format(
                num_agg_states, step_size))
            os.system('sleep 0.2')

            # one agent setting
            for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                env_info["seed"] = run
                agent_info["seed"] = run
                rl_glue.rl_init(agent_info, env_info)

                # Compute initial RMSVE before training
                current_V = rl_glue.rl_agent_message("get state value")
                agent_rmsve[0] += calc_RMSVE(current_V)

                for episode in range(1, experiment_parameters["num_episodes"] +
                                     1):
                    # run episode
                    rl_glue.rl_episode(0)  # no step limit

                    if episode % experiment_parameters[
                            "episode_eval_frequency"] == 0:
                        current_V = rl_glue.rl_agent_message("get state value")
                        agent_rmsve[int(
                            episode /
                            experiment_parameters["episode_eval_frequency"]
                        )] += calc_RMSVE(current_V)

                # store only one run of state value
                if run == 50:
                    agent_state_val = rl_glue.rl_agent_message(
                        "get state value")

            # rmsve averaged over runs
            agent_rmsve /= experiment_parameters["num_runs"]

            save_name = "{}_agg_states_{}_step_size_{}".format(
                'TD_agent', num_agg_states, step_size).replace('.', '')

            if not os.path.exists('results'):
                os.makedirs('results')

            # save avg. state value
            np.save("results/V_{}".format(save_name), agent_state_val)

            # save avg. rmsve
            np.save("results/RMSVE_{}".format(save_name), agent_rmsve)
Example #25
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):

    rl_glue = RLGlue(environment, agent)

    # save sum of reward at the end of each episode
    agent_sum_reward = []
    average_sum_reward = []

    env_info = environment_parameters
    agent_info = agent_parameters

    rl_glue.rl_init(agent_info, env_info)

    starting_episode = 0

    gym_name = env_info['gym_environment']
    agent_name = agent_info['name']
    save_name = "{}.npy".format(rl_glue.agent.name)
    npy_path = os.path.join(rl_glue.agent.checkpoint_dir,
                            "sum_reward_{}".format(save_name))
    fig_path = os.path.join(rl_glue.agent.checkpoint_dir, 'sum_rewards.png')

    # load checkpoint if any
    if experiment_parameters['load_checkpoint'] is not None:
        rl_glue.agent.load_checkpoint(experiment_parameters['load_checkpoint'])
        agent_sum_reward, average_sum_reward = np.load(npy_path)
        agent_sum_reward = list(agent_sum_reward)
        average_sum_reward = list(average_sum_reward)
        fname = experiment_parameters['load_checkpoint'].split(os.path.sep)[-1]
        try:
            starting_episode = int(fname.split('_')[1])
        except IndexError:
            starting_episode = len(agent_sum_reward)

        print(f"starting from episode {starting_episode}")

    for episode in tqdm(
            range(1 + starting_episode,
                  experiment_parameters["num_episodes"] + 1)):
        # run episode
        rl_glue.rl_episode(experiment_parameters["timeout"])

        episode_reward = rl_glue.rl_agent_message("get_sum_reward")
        agent_sum_reward.append(episode_reward)
        if episode % experiment_parameters['print_freq'] == 0:
            print('Episode {}/{} | Reward {}'.format(
                episode, experiment_parameters['num_episodes'],
                episode_reward))

        average = get_average(agent_sum_reward)
        average_sum_reward.append(average)

        if episode % experiment_parameters['checkpoint_freq'] == 0:
            rl_glue.agent.save_checkpoint(episode)
            savefig(agent_sum_reward, average_sum_reward, npy_path, fig_path,
                    agent_name, gym_name)

        if env_info['solved_threshold'] is not None and average >= env_info[
                'solved_threshold']:
            print("Task Solved with reward = {}".format(episode_reward))
            rl_glue.agent.save_checkpoint(episode, solved=True)
            break

    savefig(agent_sum_reward, average_sum_reward, npy_path, fig_path,
            agent_name, gym_name)
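Note: get_average is not defined in this snippet. A minimal sketch, assuming it is a moving average over the most recent episode rewards; the 100-episode window is an assumption (it matches the usual "solved" convention for Gym tasks, but the real helper may differ):

import numpy as np

def get_average(sum_rewards, window=100):
    # mean reward over the last `window` episodes (or fewer, early in training)
    recent = sum_rewards[-window:]
    return float(np.mean(recent)) if len(recent) > 0 else 0.0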
Example #26
import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":
    max_steps = 8000
    num_runs = 1

    # Create and pass agent and environment objects to RLGlue
    environment = WindygridEnvironment()
    agent = SarsaAgent()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore
    for run in range(num_runs):
        episode = []
        time_step = []
        rlglue.rl_init()
        while True:
            # assumes this rl_episode takes a per-episode step budget, as in the other examples here
            rlglue.rl_episode(max_steps)
            time_step.append(rlglue.num_steps())
            episode.append(rlglue.num_episodes())
            if rlglue.num_steps() > max_steps:
                break

    plt.plot(time_step, episode, label="8 actions")
    plt.xticks([0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000])
    plt.xlabel('Time steps')
    plt.ylabel('Episode', rotation=90)
    plt.legend(loc=2)
    plt.show()
Example #27
    for algorithm in ["Expected Sarsa", "Q-learning"]:
        all_reward_sums[algorithm] = []
        all_state_visits[algorithm] = []
        for run in tqdm(range(num_runs)):
            agent_info["seed"] = run
            rl_glue = RLGlue(env, agents[algorithm])
            rl_glue.rl_init(agent_info, env_info)

            reward_sums = []
            state_visits = np.zeros(agent_info["num_states"])
            #         last_episode_total_reward = 0
            for episode in range(num_episodes):
                start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
                if episode < num_episodes - 10:
                    # Runs an episode
                    rl_glue.rl_episode(0)
                else:
                    # Runs an episode while keeping track of visited states
                    state, action = rl_glue.rl_start()
                    state_visits[state] += 1
                    is_terminal = False
                    while not is_terminal:
                        reward, state, action, is_terminal = rl_glue.rl_step()
                        state_visits[state] += 1

                reward_sums.append(rl_glue.rl_return())
Example #28
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    
    """
    Assume environment_parameters dict contains:
    {
        input_dim: integer,
        num_actions: integer,
        discount_factor: float
    }
    
    Assume agent_parameters dict contains:
    {
        step_size: 1D numpy array of floats,
        tau: 1D numpy array of floats
    }
    
    Assume experiment_parameters dict contains:
    {
        num_runs: integer,
        num_episodes: integer
    }    
    """
    
    ### Instantiate rl_glue from RLGlue    
    rl_glue = RLGlue(environment, agent)

    os.system('sleep 1') # to prevent tqdm printing out-of-order
        
    ### Initialize agent_sum_reward to zero in the form of a numpy array
    # with shape (number of values for tau, number of step-sizes, number of runs, number of episodes)
    agent_sum_reward = np.zeros((len(agent_parameters["tau"]), len(agent_parameters["step_size"]), experiment_parameters["num_runs"], experiment_parameters["num_episodes"]))
    
    # for loop over different values of tau
    # tqdm is used to show a progress bar for completing the parameter study
    for i in tqdm(range(len(agent_parameters["tau"]))):
    
        # for loop over different values of the step-size
        for j in range(len(agent_parameters["step_size"])): 

            ### Specify env_info 
            env_info = {}

            ### Specify agent_info
            agent_info = {"num_actions": environment_parameters["num_actions"],
                          "input_dim": environment_parameters["input_dim"],
                          "discount_factor": environment_parameters["discount_factor"],
                          "tau": agent_parameters["tau"][i],
                          "step_size": agent_parameters["step_size"][j]}

            # for loop over runs
            for run in range(experiment_parameters["num_runs"]): 
                
                # Set the seed
                agent_info["seed"] = agent_parameters["seed"] * experiment_parameters["num_runs"] + run
                
                # Beginning of the run            
                rl_glue.rl_init(agent_info, env_info)

                for episode in range(experiment_parameters["num_episodes"]): 
                    
                    # Run episode
                    rl_glue.rl_episode(0) # no step limit

                    ### Store sum of reward
                    agent_sum_reward[i, j, run, episode] = rl_glue.rl_agent_message("get_sum_reward")

            if not os.path.exists('results'):
                os.makedirs('results')

            save_name = "{}".format(rl_glue.agent.name).replace('.', '')

            # save sum reward
            np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
Example #29
                else:
                    raise Exception('Invalid agent name in exp')
                i = 0
                for key in sorted_params_names:
                    msg_to_send = key + ' ' + str(grid_search_params[key][combination[i]])
                    print(msg_to_send)
                    agent.agent_message(msg_to_send)
                    i += 1
                rl_glue = RLGlue(agent_obj=agent, env_obj=env)
                for r in range(int(initial_params['num_runs'])):
                    print('run: ' + str(r))
                    agent.random_seed = r
                    rl_glue.rl_init()
                    agent.epsilon = float(initial_params['initial_epsilon'])
                    for e in range(int(initial_params['total_episodes'])):
                        rl_glue.rl_episode(max_episode_steps)
                        agent.epsilon = compute_epsilon(current_epsilon=agent.epsilon)
                        episodes_steps[j, r, e] = rl_glue.num_ep_steps()
                        Q_t[j, r, e] = agent.Q
                        if initial_params['agent'] != 'sarsa0':
                            Phi_t[j, r, e] = agent.Phi
                        if 'pies' in initial_params['agent']:
                            agent.xi = compute_xi(current_xi=agent.xi, decay=agent.decay,
                                                  decay_param=agent.decay_param)
                    print('path length', len(agent.path))
                j += 1

            # finding the best parameter setting for grid search based on AUC
            best_param_set_index = np.random.choice(
                np.flatnonzero(
                    np.trapz(np.mean(episodes_steps, 1)) == np.trapz(np.mean(episodes_steps, 1)).min()))
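Note: the last statement selects the hyper-parameter setting whose average learning curve has the smallest area under the curve (fewest total steps), breaking ties at random. A small self-contained illustration of the same idea with made-up numbers:

import numpy as np

# episodes_steps has shape (num_settings, num_runs, num_episodes); values are made up
episodes_steps = np.array([
    [[300, 200, 150], [320, 210, 160]],  # setting 0
    [[280, 150, 100], [290, 160, 110]],  # setting 1
])

mean_curves = np.mean(episodes_steps, 1)  # average over runs -> (num_settings, num_episodes)
auc = np.trapz(mean_curves)               # area under each setting's curve

best_param_set_index = np.random.choice(np.flatnonzero(auc == auc.min()))
print("best setting:", best_param_set_index)  # prints 1 for these numbers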
Example #30
        'beta_m': 0.9,
        'beta_v': 0.999,
        'epsilon': 1e-8
    },
    'replay_buffer_size': 50000,
    'minibatch_sz': 8,
    'num_replay_updates_per_step': 4,
    'gamma': 0.99,
    'tau': 0.001
}

current_env = LunarLanderEnvironment
current_agent = Agent

rlglue = RLGlue(current_env, current_agent)

env_info = {}
agent_info = agent_parameters

for run in range(1, experiment_parameters["num_runs"] + 1):
    agent_info["seed"] = run
    agent_info["network_config"]["seed"] = run
    env_info["seed"] = run

    rlglue.rl_init(agent_info, env_info)

    for episode in range(1, experiment_parameters["num_episodes"] + 1):
        rlglue.rl_episode(experiment_parameters["timeout"])
        episode_reward = rlglue.rl_agent_message("get_sum_reward")
        print("episode:", episode, " reward:", episode_reward)
Example #31
from rl_glue import RLGlue
from windy_env import WindyEnvironment
from sarsa_agent import SarsaAgent
import numpy as np
import matplotlib.pyplot as plt

max_steps = 8000
steps = 0
episodes = 0

ep_list = []
step_list = []

environment = WindyEnvironment()
agent = SarsaAgent()
rl = RLGlue(environment, agent)
rl.rl_init()
while steps < max_steps:
    rl.rl_episode(max_steps)
    steps = rl.num_steps()
    episodes = rl.num_episodes()
    # print(steps, episodes)

    ep_list.append(episodes)
    step_list.append(steps)

plt.xlabel('Time steps')
plt.ylabel('Episodes')
plt.plot(step_list, ep_list)
plt.show()