Example #1
def question_1():
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    np.random.seed(0)
    num_episodes = 200
    see_eps = [157]
    num_runs = 1
    max_eps_steps = 100000

    # test with various stepsizes (alphas) for agent
    stepSizes = np.linspace(0.01, 1, 100)
    # best stepsize so far (comment out to test many)
    stepSizes = [0.559184]

    # separate run for each stepsize
    for step in stepSizes:

        # initialize the agent and environment (via RL-Glue) with the chosen stepsize
        rlglue.rl_init()
        rlglue.rl_agent_message('step:' + str(step))

        # keep track of total rewards for each episode
        total_rewards = []

        for ep in range(num_episodes):
            # render only selected episodes
            if ep in see_eps:
                rlglue.rl_env_message('rOFF')
            if ep + 1 in see_eps:
                rlglue.rl_env_message('rON')
                print("Episode %d" % (ep + 1))

            # initialize for the episode
            rlglue.rl_start()
            terminal = False
            total_reward = 0

            # run episode and calculate total reward
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()
                total_reward += reward
            total_rewards.append(total_reward)

            # calculate average reward of the last 100 episodes
            if ep >= 99:
                total = np.sum(total_rewards[ep - 99:ep + 1])
                avg = total / 100

                # check if results indicate the problem is solved
                if avg > -110:
                    print("Solved at episode %d, avg reward: %f" %
                          (ep + 1, avg))
                    break

    # close environment
    environment.close()
Example #2
def run_experiment():

    #specify hyper-parameters
    num_runs = 1
    max_episodes = 1000000
    max_steps_per_episode = 100
    num_states = 181
    num_actions = 2
    alpha = 0.01
    eps = 0.1
    Q1 = 0

    results = np.zeros(max_episodes)
    results_run = 0

    agent = RandomAgent(num_states, num_actions, alpha, eps, Q1)
    environment = BlackJack()
    rlglue = RLGlue(environment, agent)

    print(
        "\nPrinting one dot for every run: {0} total runs to complete".format(
            num_runs))

    for run in range(num_runs):
        np.random.seed(run)
        results_run = 0.0

        rlglue.rl_init()
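        # run episodes; accumulate per-episode return in results and the per-run total in results_run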
        for e in range(1, max_episodes + 1):
            rlglue.rl_start()
            for s in range(max_steps_per_episode):
                r, _, _, terminal = rlglue.rl_step()
                results_run += r
                results[e - 1] += r

                if terminal:
                    break

            if e % 10000 == 0:
                print("\nEpisode {}: average return so far is {}; current policy:"
                      .format(e, results_run / e))
                print(rlglue.rl_agent_message("printPolicy"))
        print(".")

    print("Average return over experiment: {}".format(
        (results / num_runs).mean()))

    #save final policy to file -- change file name as necessary
    with open("policy.txt", 'w') as f:
        f.write(rlglue.rl_agent_message("printPolicy"))

    #save all the experiment data for analysis -- change file name as necessary
    save_results(results / num_runs, max_episodes, "RL_EXP_OUT.dat")
Example #3
def question_1():
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 50
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            #print("Episode number: "+str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            #print("Number of steps: "+str(steps))
            # print(steps[r, e])
    np.save('steps', steps)
    plotGraph()
    
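    # second experiment: a single 1000-episode run, after which the agent is asked to produce its 3D plot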
    del agent, environment, rlglue
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            print("Episode number: "+str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            #print("Number of steps: "+str(steps))
            # print(steps[r, e])
    #np.save('steps', steps)
    #plotGraph()
    rlglue.rl_agent_message("plot3DGraph")
def question_3():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 1000000
    for _ in range(num_runs):
        rlglue.rl_init()
        i = 0
        for i in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            print(i)
    fout = open('value', 'w')
    steps = 50
    w, iht = rlglue.rl_agent_message("ValueFunction")
    Q = np.zeros([steps, steps])
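    # for each cell of a 50x50 (position, velocity) grid, negate the tile-coded value of each action and record the maximum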
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(3):
                value = 0
                for index in tiles(iht, 8,
                                   [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                                    8 * (-0.07 + (j * 0.14 / steps)) / 0.14],
                                   [a]):
                    value -= w[index]
                values.append(value)
            height = max(values)
            fout.write(repr(height) + ' ')
            Q[j][i] = height
        fout.write('\n')
    fout.close()
    np.save("value", Q)
def main():

    num_eps = 5000
    num_runs = 10
    random.seed(0)
    np.random.seed(0)
    agent = Agent()
    env = Environment()
    rlglue = RLGlue(env, agent)
    del agent, env
    for run in range(num_runs):
        rlglue.rl_init()
        performances = []
        for ep in range(num_eps):
            rlglue.rl_start()
            #rlglue.rl_env_message('renderON')
            terminal = False
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()

            # Find the first policy that performs at 100%
            performance = testPolicy(rlglue.rl_agent_message('policy')) * 100
            performances.append(performance)
            if performance >= 100:
                #print(rlglue.rl_agent_message('policy'))
                print('Episode: %d' % (ep + 1))
                break
        plt.plot(performances)
    plt.savefig('test.png')
Example #6
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    
    rl_glue = RLGlue(environment, agent)
        
    # save sum of reward at the end of each episode
    agent_sum_reward = np.zeros((experiment_parameters["num_runs"], 
                                 experiment_parameters["num_episodes"]))

    env_info = {}

    agent_info = agent_parameters

    # one agent setting
    for run in range(1, experiment_parameters["num_runs"]+1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        env_info["seed"] = run

        rl_glue.rl_init(agent_info, env_info)
        
        for episode in tqdm(range(1, experiment_parameters["num_episodes"]+1)):
            # run episode
            rl_glue.rl_episode(experiment_parameters["timeout"])
            
            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward
    save_name = "{}".format(rl_glue.agent.name)
    if not os.path.exists('results'):
        os.makedirs('results')
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')
def question_3():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    max_eps_steps = 100000
    num_episodes = 1000
    num_runs = 1
    numActions=3

    rlglue.rl_init()
    for e in range(num_episodes):
        rlglue.rl_episode(max_eps_steps)

    weights = rlglue.rl_agent_message("3D plot of the cast-to-go")

    fout = open('value','w')
    steps = 50
    z = np.zeros((50,50))
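    # evaluate the learned action values on a 50x50 (position, velocity) grid; the negated maximum gives the cost-to-go surface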
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(numActions):
                tile = [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                        8 * (-0.07 + (j * 0.14 / steps)) / 0.14]
                inds = agent.get_index(tile, a)
                values.append(np.sum([weights[i] for i in inds]))
            height = max(values)
            z[j][i] = -height
            fout.write(repr(-height) + ' ')
        fout.write('\n')
    fout.close()

    fig = plt.figure()
    ax = fig.add_subplot(111,projection ='3d')
    x = np.arange(-1.2,0.5,1.7/50)
    y = np.arange(-0.07,0.07,0.14/50)
    x,y = np.meshgrid(x,y)
    ax.set_xticks([-1.2, 0.5])
    ax.set_yticks([0.07, -0.07])
    ax.set_ylabel('Velocity')
    ax.set_xlabel('Position')
    ax.set_zlabel('Cost-To-Go')
    ax.plot_surface(x,y,z)
    plt.savefig('cost-to-go.png')
    plt.show()
    np.save('steps', steps)
def question_3():

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])
    # only 1 run
    for r in range(num_runs):
        print("1000 episode run : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
        # get the list of value functions [X,Y,Z] represents position, velocity, state-value
        Return = rlglue.rl_agent_message(1)
    return Return
Example #9
def question_2():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    max_eps_steps = 100000
    num_episodes = 1000

    rlglue.rl_init()
    for _ in tqdm(range(num_episodes)):
        rlglue.rl_episode(max_eps_steps)

    q3_plot = rlglue.rl_agent_message("plot")
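    # q3_plot is assumed to hold the x grid, y grid, and surface values returned by the agent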

    fig = plt.figure()
    ax = fig.gca(projection='3d')
    X, Y = np.meshgrid(q3_plot[0], q3_plot[1])
    surf = ax.plot_surface(X, Y, q3_plot[2])
    ax.set_xlim(q3_plot[0][0], q3_plot[0][-1])
    ax.set_ylim(q3_plot[1][0], q3_plot[1][-1])
    plt.show()
Example #10
def question_1():
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    num_action = 3

    for r in range(num_runs):
        print("run number : ", r)
        #np.random.seed(r)
        rlglue.rl_init()
        for _ in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
        weight = rlglue.rl_agent_message('get weight')

    # algorithm from assignment
    #fout = open('value','w')
    steps = 50
    neg_q_hat = np.zeros((steps, steps))
    for i in range(steps):
        for j in range(steps):
            values = []
            position = -1.2 + (i * 1.7 / steps)
            velocity = -0.07 + (j * 0.14 / steps)
            for a in range(num_action):
                tile_idx = agent.plot_get_feature(position, velocity, a)
                q_hat = np.sum(weight[tile_idx])
                values.append(q_hat)
            height = np.max(values)
            neg_q_hat[j][i] = -height
            #fout.write(repr(-height)+' ')
        #fout.write('\n')
    #fout.close()
    np.save('neg_q_hat', neg_q_hat)
Example #11
def question_3():
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    for r in range(num_runs):
        start = time.time()
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
    end = time.time()
    print(str(end - start) + " seconds elapsed")
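    # fetch the action values and position/velocity grids from the agent, then negate the values so the surface shows cost-to-go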
    action_vals, pos, vel = rlglue.rl_agent_message("return info")
    action_vals = np.multiply(action_vals, -1)
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.plot_surface(pos, vel, action_vals)
    plt.show()
Example #12
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    rl_glue = RLGlue(environment, agent)
        
    agent_sum_reward = np.zeros((experiment_parameters["num_runs"], 
                                 experiment_parameters["num_episodes"]))

    env_info = {}
    agent_info = agent_parameters

    for run in range(1, experiment_parameters["num_runs"]+1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
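        # evaluation run: the agent is assumed to load a pre-trained network from network500.pickle, with rendering enabled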
        agent_info["network_pickle"] = "network500.pickle"
        env_info["seed"] = run
        env_info["render"] = True

        rl_glue.rl_init(agent_info, env_info)
        
        for episode in tqdm(range(1, experiment_parameters["num_episodes"]+1)):
            rl_glue.rl_episode(experiment_parameters["timeout"])
            
            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward
Example #13
from rl_glue import RLGlue
from mountain_car_env import MountainCarEnvironment
from sarsa_agent import SarsaAgent

num_runs = 10
num_episodes = 300
env_info = {"num_tiles": 8, "num_tilings": 8}
agent_info = {}
all_steps = []

agent = SarsaAgent
env = MountainCarEnvironment

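# a fresh RLGlue instance (and therefore a fresh agent and environment) is created for every run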
for run in range(num_runs):
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)

    for episode in range(num_episodes + 1):
        rl_glue.rl_episode(15000)
        r = rl_glue.rl_agent_message("get_reward")
        print("episode:", episode, "reward:", r)
Example #14
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):

    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for actor_ss in agent_parameters["actor_step_size"]:
                for critic_ss in agent_parameters["critic_step_size"]:
                    for avg_reward_ss in agent_parameters[
                            "avg_reward_step_size"]:

                        env_info = {}
                        agent_info = {
                            "num_tilings": num_tilings,
                            "num_tiles": num_tiles,
                            "actor_step_size": actor_ss,
                            "critic_step_size": critic_ss,
                            "avg_reward_step_size": avg_reward_ss,
                            "num_actions": agent_parameters["num_actions"],
                            "iht_size": agent_parameters["iht_size"]
                        }

                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))

                        # using tqdm we visualize progress bars
                        for run in tqdm(
                                range(1,
                                      experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run

                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()

                            num_steps = 0
                            total_return = 0.
                            return_arr = []

                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0

                            while num_steps < experiment_parameters[
                                    'max_steps']:
                                num_steps += 1

                                rl_step_result = rl_glue.rl_step()

                                reward = rl_step_result[0]
                                total_return += reward
                                return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message(
                                    "get avg reward")

                                exp_avg_reward_normalizer += exp_avg_reward_ss * (1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward - exp_avg_reward)

                                return_per_step[run - 1][num_steps - 1] = total_return
                                exp_avg_reward_per_step[run - 1][num_steps - 1] = exp_avg_reward

                        if not os.path.exists('results'):
                            os.makedirs('results')

                        save_name = "ActorCriticSoftmax_tilings_{}_tiledim_{}_actor_ss_{}_critic_ss_{}_avg_reward_ss_{}".format(
                            num_tilings, num_tiles, actor_ss, critic_ss,
                            avg_reward_ss)
                        total_return_filename = "results/{}_total_return.npy".format(
                            save_name)
                        exp_avg_reward_filename = "results/{}_exp_avg_reward.npy".format(
                            save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename,
                                exp_avg_reward_per_step)
Example #15
        'beta_m': 0.9,
        'beta_v': 0.999,
        'epsilon': 1e-8
    },
    'replay_buffer_size': 50000,
    'minibatch_sz': 8,
    'num_replay_updates_per_step': 4,
    'gamma': 0.99,
    'tau': 0.001
}

current_env = LunarLanderEnvironment
current_agent = Agent

rlglue = RLGlue(current_env, current_agent)

env_info = {}
agent_info = agent_parameters

for run in range(1, experiment_parameters["num_runs"] + 1):
    agent_info["seed"] = run
    agent_info["network_config"]["seed"] = run
    env_info["seed"] = run

    rlglue.rl_init(agent_info, env_info)

    for episode in range(1, experiment_parameters["num_episodes"] + 1):
        rlglue.rl_episode(experiment_parameters["timeout"])
        episode_reward = rlglue.rl_agent_message("get_sum_reward")
        print("episode:", episode, " reward:", episode_reward)
Example #16
from rl_glue import RLGlue
from sarsa_agent import sarsaAgent
from windygridworld_env import windyGridenv
import matplotlib.pyplot as plt

if __name__ == "__main__":
    max_steps = 8000
    num_runs = 10

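    # sweep four step-size (alpha) settings, re-creating the agent and environment for each one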
    for n in range(4):
        # Create and pass agent and environment objects to RLGlue
        environment = windyGridenv()
        agent = sarsaAgent()
        rlglue = RLGlue(environment, agent)
        if n == 0:
            rlglue.rl_agent_message("alpha = 0.3")
            message = "alpha = 0.3"
        elif n == 1:
            rlglue.rl_agent_message("alpha = 0.5")
            message = "alpha = 0.5"
        elif n == 2:
            rlglue.rl_agent_message("alpha = 0.7")
            message = "alpha = 0.7"
        else:
            rlglue.rl_agent_message("alpha = 0.9")
            message = "alpha = 0.9"
        rlglue.rl_agent_message("epsilon = 0.1")
        rlglue.rl_agent_message("4")
        del agent, environment  # don't use these anymore

        time_steps = []
Example #17
    # np.random.seed(count)

    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            Q = rlglue.rl_agent_message('Q')
            print(Q)
            done = True

    V, state, env_map = learn_step()
    draw_step(V, state, env_map)

    pygame.display.flip()


if __name__ == "__main__":
    # main()
    rlglue.rl_init()
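    # main loop: run() performs one learn/draw step; the agent's episode count is queried roughly every 100 time steps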
    while not done:
        run()
        count += 1

        if time_step % 100 == 1:
            count_episode = rlglue.rl_agent_message('COUNT')
            print('time_step: {:d}, count: {:d}'.format(
                time_step, count_episode))

        time_step += 1
        # print(count)
        # sleep(0.5)
Example #18
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):

    rl_glue = RLGlue(environment, agent)

    # save sum of reward at the end of each episode
    agent_sum_reward = []
    average_sum_reward = []

    env_info = environment_parameters
    agent_info = agent_parameters

    rl_glue.rl_init(agent_info, env_info)

    starting_episode = 0

    gym_name = env_info['gym_environment']
    agent_name = agent_info['name']
    save_name = "{}.npy".format(rl_glue.agent.name)
    npy_path = os.path.join(rl_glue.agent.checkpoint_dir,
                            "sum_reward_{}".format(save_name))
    fig_path = os.path.join(rl_glue.agent.checkpoint_dir, 'sum_rewards.png')

    # load checkpoint if any
    if experiment_parameters['load_checkpoint'] is not None:
        rl_glue.agent.load_checkpoint(experiment_parameters['load_checkpoint'])
        agent_sum_reward, average_sum_reward = np.load(npy_path)
        agent_sum_reward = list(agent_sum_reward)
        average_sum_reward = list(average_sum_reward)
        fname = experiment_parameters['load_checkpoint'].split(os.path.sep)[-1]
        try:
            starting_episode = int(fname.split('_')[1])
        except IndexError:
            starting_episode = len(agent_sum_reward)

        print(f"starting from episode {starting_episode}")

    for episode in tqdm(
            range(1 + starting_episode,
                  experiment_parameters["num_episodes"] + 1)):
        # run episode
        rl_glue.rl_episode(experiment_parameters["timeout"])

        episode_reward = rl_glue.rl_agent_message("get_sum_reward")
        agent_sum_reward.append(episode_reward)
        if episode % experiment_parameters['print_freq'] == 0:
            print('Episode {}/{} | Reward {}'.format(
                episode, experiment_parameters['num_episodes'],
                episode_reward))

        average = get_average(agent_sum_reward)
        average_sum_reward.append(average)

        if episode % experiment_parameters['checkpoint_freq'] == 0:
            rl_glue.agent.save_checkpoint(episode)
            savefig(agent_sum_reward, average_sum_reward, npy_path, fig_path,
                    agent_name, gym_name)

        if env_info['solved_threshold'] is not None and average >= env_info[
                'solved_threshold']:
            print("Task Solved with reward = {}".format(episode_reward))
            rl_glue.agent.save_checkpoint(episode, solved=True)
            break

    savefig(agent_sum_reward, average_sum_reward, npy_path, fig_path,
            agent_name, gym_name)
Example #19
        np.save("ground_truth.npy", truth)

    num_episodes = 100000
    v = np.zeros(1000)

    environment = Environment()
    agent = Gradient_MC()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore

    rlglue.rl_init()

    for episode in tqdm(range(num_episodes)):
        rlglue.rl_episode()

    aggregated_v = rlglue.rl_agent_message("ValueFunction")
    distribution = rlglue.rl_agent_message("distribution")

    for i in range(1000):
        v[i] = aggregated_v[i // (1000 // aggregated_v.shape[0])]

    x = np.arange(1000)
    # plt.plot(x, truth)
    # plt.plot(x, v)
    # plt.show()

    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()

    ax1.set_xlabel('State')
    ax1.set_ylabel('Value Scale')
            print("run number: {}\n".format(run))

            # set seed for reproducibility
            np.random.seed(run)

            # initialize RL-Glue
            rlglue.rl_init()

            # loop over episodes
            for episode in range(num_episodes):
                #print("episode{}".format(episode))
                # run episode with the allocated steps budget
                rlglue.rl_episode()
                if episode % 10 == 0:
                    V_hat = rlglue.rl_agent_message("Estimate value function")
                    #print(V_hat)
                    #vhat_arr.append(V_hat)
                    # reference: https://stackoverflow.com/questions/21926020/how-to-calculate-rmse-using-ipython-numpy
                    RMSE = np.sqrt(np.mean((Vs - V_hat)**2))
                    #print(RMSE)
                    result[int(episode / 10)] += RMSE
        result = result / num_runs
        output.append(result)
        print('total time of executing 30 runs with the {} agent is {:3}s'.format(
            item,
            time.time() - start_time))
        print(result)
    #np.savez('randomwalk.npz',tabular = output[0],tile_coding = output[1] )
    np.save('randomwalk', output)
Example #21
def main():
    # Seed rng's for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    agent = Agent(env.get_actions())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000
    # initialize rlglue
    rlglue.rl_init()

    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None
    # Run through each episode
    #rlglue.rl_env_message('renderON')
    #for ep in range(num_eps):
    ep = 0
    while ep < num_eps:
        ep += 1
        #if ep % int(num_eps/10) == 0:
        #print('ep:', ep, 'bestpolicy', max_reward)
        # start episode
        rlglue.rl_start()
        rewards = 0
        steps = 1
        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        avg_reward = rewards
        avg_rewards.append(avg_reward)

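        # keep the best-scoring policy seen so far and persist it to disk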
        if rewards > max_reward:
            max_reward = rewards
            best_policy = rlglue.rl_agent_message('policy')
            pickle.dump(best_policy, open("policy.pickle", "wb"))
            print('ep', ep, 'reward', avg_reward)
        #print('ep:',ep, 'avg reward:', avg_reward, 'steps:', steps)
        #print(rlglue.rl_agent_message('policy'))
        #input()

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')

    # Get generated policy
    policy = rlglue.rl_agent_message('policy')

    # Test policy
    result = testPolicy(best_policy)
from rl_glue import RLGlue
from grid_env import GridEnvironment
from dynaq_agent import DynaqAgent
import numpy as np
import time
import matplotlib.pyplot as plt

if __name__ == "__main__":
    start_time = time.time()
    max_steps = 8000
    num_runs  = 10
    num_episodes = 50
    # Create and pass agent and environment objects to RLGlue
    environment =GridEnvironment()
    agent = DynaqAgent()
    rlglue = RLGlue(environment, agent)
    rlglue.rl_agent_message('n = 0')
    del agent, environment  # don't use these anymore
    steps1 = {}
    L1 = []
    L2 = [0]*num_episodes
    for episode in range(num_episodes):
        steps1[episode] = []
        L1.append(episode + 1)
    for run in range(num_runs):
        np.random.seed(run)
        rlglue.rl_init()
        step = 0
        for episode in range(num_episodes):
            rlglue.rl_episode(max_steps)
            new_step = rlglue.num_steps()
            steps1[episode].append(new_step - step)
Example #23
        # set seed for reproducibility
        np.random.seed(run)

        # initialize RL-Glue
        rlglue.rl_init()

        # loop over episodes
        for episode in range(num_episodes):
            print("episode{}".format(episode))
            # run episode with the allocated steps budget
            rlglue.rl_episode(max_steps)

            # if episode is one of the key episodes, extract and save value
            # function
            if episode in key_episodes:
                V = np.fromstring(rlglue.rl_agent_message('ValueFunction'),dtype='float')
                v_over_runs[episode].append(V)
    print('total time of executing 10 runs is {:3}s'.format(time.time() - start_time))
    # extract length of key_episodes
    n_valueFunc = len(key_episodes)

    # extract number of states via length of a particular value function
    n = v_over_runs[key_episodes[0]][0].shape[0]

    # initialize data structure for average value function at key_episodes
    average_v_over_runs = np.zeros((n_valueFunc,n))

    # average across runs at various episodes, to estimate average value
    # function at episode
    for i, episode in enumerate(key_episodes):
        # each item in v_over_runs[episode] is a list (one item per run),
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    
    """
    Assume environment_parameters dict contains:
    {
        input_dim: integer,
        num_actions: integer,
        discount_factor: float
    }
    
    Assume agent_parameters dict contains:
    {
        step_size: 1D numpy array of floats,
        tau: 1D numpy array of floats
    }
    
    Assume experiment_parameters dict contains:
    {
        num_runs: integer,
        num_episodes: integer
    }    
    """
    
    ### Instantiate rl_glue from RLGlue    
    rl_glue = RLGlue(environment, agent)

    os.system('sleep 1') # to prevent tqdm printing out-of-order
        
    ### Initialize agent_sum_reward to zero in the form of a numpy array
    # with shape (number of values for tau, number of step-sizes, number of runs, number of episodes)
    agent_sum_reward = np.zeros((len(agent_parameters["tau"]),
                                 len(agent_parameters["step_size"]),
                                 experiment_parameters["num_runs"],
                                 experiment_parameters["num_episodes"]))
    
    # for loop over different values of tau
    # tqdm is used to show a progress bar for completing the parameter study
    for i in tqdm(range(len(agent_parameters["tau"]))):
    
        # for loop over different values of the step-size
        for j in range(len(agent_parameters["step_size"])): 

            ### Specify env_info 
            env_info = {}

            ### Specify agent_info
            agent_info = {"num_actions": environment_parameters["num_actions"],
                          "input_dim": environment_parameters["input_dim"],
                          "discount_factor": environment_parameters["discount_factor"],
                          "tau": agent_parameters["tau"][i],
                          "step_size": agent_parameters["step_size"][j]}

            # for loop over runs
            for run in range(experiment_parameters["num_runs"]): 
                
                # Set the seed
                agent_info["seed"] = agent_parameters["seed"] * experiment_parameters["num_runs"] + run
                
                # Beginning of the run            
                rl_glue.rl_init(agent_info, env_info)

                for episode in range(experiment_parameters["num_episodes"]): 
                    
                    # Run episode
                    rl_glue.rl_episode(0) # no step limit

                    ### Store sum of reward
                    agent_sum_reward[i, j, run, episode] = rl_glue.rl_agent_message("get_sum_reward")

            if not os.path.exists('results'):
                os.makedirs('results')

            save_name = "{}".format(rl_glue.agent.name).replace('.','')

            # save sum reward
            np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for update_ss in agent_parameters["update_step_size"]:
                for avg_reward_ss in agent_parameters["avg_reward_step_size"]:
                    for epsilon in agent_parameters["epsilon"]:
                        env_info = {}
                        agent_info = {"num_tilings": num_tilings,
                                      "num_tiles": num_tiles,
                                      "alpha": update_ss,
                                      "avg_reward_step_size": avg_reward_ss,
                                      "epsilon":epsilon,
                                      "num_actions": agent_parameters["num_actions"],
                                      "iht_size": agent_parameters["iht_size"]}
                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"], experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                                (experiment_parameters["num_runs"], experiment_parameters["max_steps"]))
                        # using tqdm we visualize progress bars
                        avg_reward_list = []
                        avg_reward = -10000
                        for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run
                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()
                            num_steps = 0
                            total_return = 0.
                            #return_arr = []
                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0
                            while num_steps < experiment_parameters['max_steps']:
                                num_steps += 1
                                rl_step_result = rl_glue.rl_step()
                                reward = rl_step_result[0]
                                total_return += reward
                                #return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message("get avg reward")
                                exp_avg_reward_normalizer += exp_avg_reward_ss * (1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward - exp_avg_reward)

                                return_per_step[run - 1][num_steps - 1] = total_return
                                exp_avg_reward_per_step[run - 1][num_steps - 1] = exp_avg_reward
                            avg_reward_list.append(avg_reward)
                        print(np.average(avg_reward_list))
                        if not os.path.exists('results_sarsa'):
                            os.makedirs('results_sarsa')

                        save_name = "semi-gradient_sarsa_tilings_{}_tiledim_{}_update_ss_{}_epsilon_ss_{}_avg_reward_ss_{}_max_steps_{}".format(
                            num_tilings, num_tiles, update_ss, epsilon, avg_reward_ss, experiment_parameters["max_steps"])
                        total_return_filename = "results_sarsa/{}_total_return.npy".format(save_name)
                        exp_avg_reward_filename = "results_sarsa/{}_exp_avg_reward.npy".format(save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename, exp_avg_reward_per_step)
Example #26
    else:
        truth = ground_truth()
        np.save("ground_truth.npy", truth)

    num_episodes = 2000
    num_runs = 30

    rmse_tabular = np.zeros(num_episodes // 10)
    rmse_tile = np.zeros(num_episodes // 10)

    environment = Environment()
    agent = TD_0()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore

    rlglue.rl_agent_message("tabular")
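    # first pass: tabular features; RMSE against the ground truth is recorded every 10 episodes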
    for run in tqdm(range(num_runs)):
        np.random.seed(run)
        rlglue.rl_init()
        for episode in range(num_episodes):
            rlglue.rl_episode()

            if episode % 10 == 0:
                v = rlglue.rl_agent_message("ValueFunction")
                rmse_tabular[episode // 10] += np.sqrt(np.mean((truth - v)**2))

    rlglue.rl_agent_message("tile")
    for run in tqdm(range(num_runs)):
        np.random.seed(run)
        rlglue.rl_init()
        for episode in range(num_episodes):
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):

    rl_glue = RLGlue(environment, agent)

    # Sweep Agent parameters
    for num_agg_states in agent_parameters["num_groups"]:
        for step_size in agent_parameters["step_size"]:

            # save rmsve at the end of each evaluation episode
            # size: num_episode / episode_eval_frequency + 1 (includes evaluation at the beginning of training)
            agent_rmsve = np.zeros(
                int(experiment_parameters["num_episodes"] /
                    experiment_parameters["episode_eval_frequency"]) + 1)

            # save learned state value at the end of each run
            agent_state_val = np.zeros(environment_parameters["num_states"])

            env_info = {
                "num_states":
                environment_parameters["num_states"],
                "start_state":
                environment_parameters["start_state"],
                "left_terminal_state":
                environment_parameters["left_terminal_state"],
                "right_terminal_state":
                environment_parameters["right_terminal_state"]
            }

            agent_info = {
                "num_states": environment_parameters["num_states"],
                "num_groups": num_agg_states,
                "step_size": step_size,
                "discount_factor": environment_parameters["discount_factor"]
            }

            print('Setting - num. agg. states: {}, step_size: {}'.format(
                num_agg_states, step_size))
            os.system('sleep 0.2')

            # one agent setting
            for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                env_info["seed"] = run
                agent_info["seed"] = run
                rl_glue.rl_init(agent_info, env_info)

                # Compute initial RMSVE before training
                current_V = rl_glue.rl_agent_message("get state value")
                agent_rmsve[0] += calc_RMSVE(current_V)

                for episode in range(1, experiment_parameters["num_episodes"] +
                                     1):
                    # run episode
                    rl_glue.rl_episode(0)  # no step limit

                    if episode % experiment_parameters[
                            "episode_eval_frequency"] == 0:
                        current_V = rl_glue.rl_agent_message("get state value")
                        agent_rmsve[int(
                            episode /
                            experiment_parameters["episode_eval_frequency"]
                        )] += calc_RMSVE(current_V)

                # store only one run of state value
                if run == 50:
                    agent_state_val = rl_glue.rl_agent_message(
                        "get state value")

            # rmsve averaged over runs
            agent_rmsve /= experiment_parameters["num_runs"]

            save_name = "{}_agg_states_{}_step_size_{}".format(
                'TD_agent', num_agg_states, step_size).replace('.', '')

            if not os.path.exists('results'):
                os.makedirs('results')

            # save avg. state value
            np.save("results/V_{}".format(save_name), agent_state_val)

            # save avg. rmsve
            np.save("results/RMSVE_{}".format(save_name), agent_rmsve)
Example #28
from rl_glue import RLGlue
from windy_env import WindyEnvironment
from n_step_sarsa_agent import SarsaAgent
import numpy as np
import time
import matplotlib.pyplot as plt

if __name__ == "__main__":
    start_time = time.time()
    max_steps = 8000

    # Create and pass agent and environment objects to RLGlue
    environment = WindyEnvironment()
    agent = SarsaAgent()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore
    rlglue.rl_init()
    L1 = []
    L2 = []
    n = rlglue.rl_agent_message('n')
    a = rlglue.rl_agent_message('a')
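    # record (cumulative steps, completed episodes) pairs until the 8000-step budget is exhausted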
    while rlglue.num_steps() < max_steps:
        L1.append(rlglue.num_steps())
        rlglue.rl_episode(10000)
        episodes = rlglue.num_episodes()
        L2.append(episodes)
    plt.title(str(n) + '-step sarsa with ' + str(a) + " actions")
    plt.plot(L1, L2)
    plt.show()
        print("training process with {} planning steps".format(ite))
        # Create and pass agent and environment objects to RLGlue
        environment = DynaQEnvironment()
        agent = DynaQAgent(ite)
        rlglue = RLGlue(environment, agent)
        del agent, environment  # don't use these anymore

        for run in range(num_runs):
            print("run number: {}\n".format(run))
            # set seed for reproducibility
            np.random.seed(run)

            # initialize RL-Glue
            rlglue.rl_init()

            # loop over episodes
            for episode in range(num_episodes):

                rlglue.rl_episode()

                result[episode] += rlglue.num_ep_steps()
                data = rlglue.rl_agent_message(
                    "Q for all states in the episode")

                Q.append(data)

        result = result / num_runs
        output.append(result)

    np.save("output", output)
Example #30
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):

    rl_glue = RLGlue(environment, agent)

    # save rmsve at the end of each episode
    agent_rmsve = np.zeros(
        (experiment_parameters["num_runs"],
         int(experiment_parameters["num_episodes"] /
             experiment_parameters["episode_eval_frequency"]) + 1))

    # save learned state value at the end of each run
    agent_state_val = np.zeros((experiment_parameters["num_runs"],
                                environment_parameters["num_states"]))

    env_info = {
        "num_states": environment_parameters["num_states"],
        "start_state": environment_parameters["start_state"],
        "left_terminal_state": environment_parameters["left_terminal_state"],
        "right_terminal_state": environment_parameters["right_terminal_state"]
    }

    agent_info = {
        "num_states": environment_parameters["num_states"],
        "num_hidden_layer": agent_parameters["num_hidden_layer"],
        "num_hidden_units": agent_parameters["num_hidden_units"],
        "step_size": agent_parameters["step_size"],
        "discount_factor": environment_parameters["discount_factor"],
        "beta_m": agent_parameters["beta_m"],
        "beta_v": agent_parameters["beta_v"],
        "epsilon": agent_parameters["epsilon"]
    }

    print('Setting - Neural Network with 100 hidden units')
    os.system('sleep 1')

    # one agent setting
    for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
        env_info["seed"] = run
        agent_info["seed"] = run
        rl_glue.rl_init(agent_info, env_info)

        # Compute initial RMSVE before training
        current_V = rl_glue.rl_agent_message("get state value")
        agent_rmsve[run - 1, 0] = calc_RMSVE(current_V)

        for episode in range(1, experiment_parameters["num_episodes"] + 1):
            # run episode
            rl_glue.rl_episode(0)  # no step limit

            if episode % experiment_parameters["episode_eval_frequency"] == 0:
                current_V = rl_glue.rl_agent_message("get state value")
                agent_rmsve[run - 1,
                            int(episode /
                                experiment_parameters["episode_eval_frequency"]
                                )] = calc_RMSVE(current_V)
            elif episode == experiment_parameters[
                    "num_episodes"]:  # if last episode
                current_V = rl_glue.rl_agent_message("get state value")

        agent_state_val[run - 1, :] = current_V

    save_name = "{}".format(rl_glue.agent.name).replace('.', '')

    if not os.path.exists('results'):
        os.makedirs('results')

    # save avg. state value
    np.save("results/V_{}".format(save_name), agent_state_val)

    # save avg. rmsve
    np.savez("results/RMSVE_{}".format(save_name),
             rmsve=agent_rmsve,
             eval_freq=experiment_parameters["episode_eval_frequency"],
             num_episodes=experiment_parameters["num_episodes"])