def question_4():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 200
    num_runs = 50
    max_eps_steps = 1000000
    steps = np.zeros([num_runs, num_episodes])
    rewards = []
    for r in range(num_runs):
        print("run number : ", r + 1)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
        reward = rlglue.total_reward()
        rewards.append(reward)
    mean = sum(rewards) / len(rewards)
    stder = statistics.stdev(rewards) / math.sqrt(len(rewards))
    print("mean:", mean)
    print("standard error:", stder)
    np.save('bonus_steps', steps)
    np.save("mean", mean)
    np.save("stder", stder)
def better_run_experiment(num_runs, num_episodes):
    # Same as last function
    all_steps = []
    agent = ag.ExpectedSarsaAgent
    env = enviro.MountainEnvironment
    total_reward_per_run = []
    for run in range(num_runs):
        start = time.time()
        if run % 5 == 0:
            print("RUN: {}".format(run))
        initial_weights = np.random.uniform(-0.001, 0)
        agent_info = {
            "num_tilings": 32,
            "num_tiles": 4,
            "iht_size": 4096,
            "epsilon": 0.1,
            "gamma": 1,
            "alpha": 0.7 / 32,
            "initial_weights": initial_weights,
            "num_actions": 3
        }
        env_info = {
            "min_position": -1.2,
            "max_position": 0.5,
            "min_velocity": -0.07,
            "max_velocity": 0.07,
            "gravity": 0.0025,
            "action_discount": 0.001
        }
        rl_glue = RLGlue(env, agent)
        rl_glue.rl_init(agent_info, env_info)
        steps_per_episode = []
        for episode in range(num_episodes):
            # 15000 is the step limit for each episode
            rl_glue.rl_episode(15000)
            steps_per_episode.append(rl_glue.num_steps)
        total_reward = np.sum(steps_per_episode) * -1
        all_steps.append(np.array(steps_per_episode))
        print("Run time: {}".format(time.time() - start))
        total_reward_per_run.append(total_reward)
    data = np.mean(total_reward_per_run)
    data_std_err = np.std(total_reward_per_run, axis=0) / np.sqrt(num_runs)
    plt.title("Expected Sarsa MountainCar (Alternate Parameters)",
              fontdict={'fontsize': 16, 'fontweight': 25}, pad=15.0)
    plt.xlabel("Episode", labelpad=5.0)
    plt.ylabel("Steps per Episode (averaged over " + str(num_runs) + " runs)",
               labelpad=10.0)
    plt.plot(np.mean(np.array(all_steps), axis=0))
    plt.show()
    np.save("ExpectedSarsa_test", np.array(all_steps))
    print("mean: ", data)
    print("standard error: ", data_std_err)
def run_experiment(env_info, agent_info, num_episodes=5000, experiment_name=None,
                   plot_freq=100, true_values_file=None, value_error_threshold=1e-8):
    env = CliffWalkEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)
    manager = Manager(env_info, agent_info, true_values_file=true_values_file,
                      experiment_name=experiment_name)
    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no step limit
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            manager.visualize(values, episode)
    values = rl_glue.agent.agent_message("get_values")
    if true_values_file is not None:
        # Grading: The Manager will check that the values computed using your TD agent match
        # the true values (within some small allowance) across the states. In addition, it also
        # checks whether the root mean squared value error is close to 0.
        manager.run_tests(values, value_error_threshold)
    return values
def run_experiment(env_info, agent_info, num_episodes=5000, experiment_name=None,
                   plot_freq=100, true_values_file=None, value_error_threshold=1e-8):
    env = CliffWalkEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)
    manager = Manager(env_info, agent_info, true_values_file=true_values_file,
                      experiment_name=experiment_name)
    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no step limit
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            manager.visualize(values, episode)
    values = rl_glue.agent.agent_message("get_values")
    return values
def question_3():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 1000000
    for _ in range(num_runs):
        rlglue.rl_init()
        i = 0
        for i in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            print(i)
    fout = open('value', 'w')
    steps = 50
    w, iht = rlglue.rl_agent_message("ValueFunction")
    Q = np.zeros([steps, steps])
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(3):
                value = 0
                for index in tiles(iht, 8,
                                   [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                                    8 * (-0.07 + (j * 0.14 / steps)) / 0.14],
                                   [a]):
                    value -= w[index]
                values.append(value)
            height = max(values)
            fout.write(repr(height) + ' ')
            Q[j][i] = height
        fout.write('\n')
    fout.close()
    np.save("value", Q)
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # save sum of reward at the end of each episode
    agent_sum_reward = np.zeros((experiment_parameters["num_runs"],
                                 experiment_parameters["num_episodes"]))

    env_info = {}
    agent_info = agent_parameters

    # one agent setting
    for run in range(1, experiment_parameters["num_runs"] + 1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        env_info["seed"] = run

        rl_glue.rl_init(agent_info, env_info)

        for episode in tqdm(range(1, experiment_parameters["num_episodes"] + 1)):
            # run episode
            rl_glue.rl_episode(experiment_parameters["timeout"])
            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward

    save_name = "{}".format(rl_glue.agent.name)

    if not os.path.exists('results'):
        os.makedirs('results')

    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')
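# A minimal, hypothetical invocation of the run_experiment above (a sketch, not
# taken from the original code). Parameter names follow how the function reads
# them; the LunarLanderEnvironment/Agent pair and the concrete values mirror
# later snippets in this collection and are illustrative only (a real agent
# will typically need a fuller network_config).
experiment_parameters = {"num_runs": 1, "num_episodes": 300, "timeout": 1000}
agent_parameters = {"network_config": {}}
run_experiment(LunarLanderEnvironment, Agent, {}, agent_parameters,
               experiment_parameters)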
def foreground(servos, time_steps, episodes, plotting_data):
    my_agent = DanceBot(plotting_data)
    my_env = ServoEnvironment(servos)
    my_rl_glue = RLGlue(my_env, my_agent)
    print("\nRunning {} episodes with {} time-steps each.".format(
        episodes, time_steps))
    for _ in range(episodes):
        my_rl_glue.rl_episode(time_steps)
def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 200
    num_runs = 50
    max_eps_steps = 100000
    steps = np.zeros([num_runs, num_episodes])
    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            # print("Episode number: " + str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print("Number of steps: " + str(steps))
            # print(steps[r, e])
    np.save('steps', steps)
    plotGraph()

    del agent, environment, rlglue

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000
    steps = np.zeros([num_runs, num_episodes])
    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            print("Episode number: " + str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print("Number of steps: " + str(steps))
            # print(steps[r, e])
    # np.save('steps', steps)
    # plotGraph()
    rlglue.rl_agent_message("plot3DGraph")
def question_3():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    max_eps_steps = 100000
    num_episodes = 1000
    num_runs = 1
    numActions = 3

    rlglue.rl_init()
    for e in range(num_episodes):
        rlglue.rl_episode(max_eps_steps)

    weights = rlglue.rl_agent_message("3D plot of the cast-to-go")
    fout = open('value', 'w')
    steps = 50
    z = np.zeros((50, 50))
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(numActions):
                tile = [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                        8 * (-0.07 + (j * 0.14 / steps)) / 0.14]
                inds = agent.get_index(tile, a)
                values.append(np.sum([weights[ind] for ind in inds]))
            height = max(values)
            z[j][i] = -height
            fout.write(repr(-height) + ' ')
        fout.write('\n')
    fout.close()

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    x = np.arange(-1.2, 0.5, 1.7 / 50)
    y = np.arange(-0.07, 0.07, 0.14 / 50)
    x, y = np.meshgrid(x, y)
    ax.set_xticks([-1.2, 0.5])
    ax.set_yticks([0.07, -0.07])
    ax.set_ylabel('Velocity')
    ax.set_xlabel('Position')
    ax.set_zlabel('Cost-To-Go')
    ax.plot_surface(x, y, z)
    plt.savefig('cost-to-go.png')
    plt.show()
    np.save('steps', steps)
def tiling(real_value):
    environment = RandomWalkEnvironment()
    agent = Agent2()
    rl = RLGlue(environment, agent)
    error = np.zeros(5000)
    for run in range(30):
        rl.rl_init()
        for episode in range(2000):
            rl.rl_episode(10000)
            estimate = rl.RL_agent_message("ValueFunction")
            error[episode] += np.sqrt(
                np.mean(np.power(real_value - estimate, 2)))
        rl.RL_cleanup()
    file2 = open("tiling_output.txt", "w")
    for i in range(2000):
        file2.write(format(error[i] / 10))
    file2.close()
def question_1(num_episodes):
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    max_eps_steps = 100000
    steps = np.zeros(num_episodes)
    rlglue.rl_init()
    for e in tqdm(range(num_episodes)):
        rlglue.rl_episode(max_eps_steps)
        steps[e] = rlglue.num_ep_steps()
        # print(steps[e])
    return steps
def question_1():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 200
    num_runs = 5
    max_eps_steps = 100000
    steps = np.zeros([num_runs, num_episodes])
    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print(steps[r, e])
    np.save('steps', steps)
def run_experiment(env_info, agent_info, num_episodes=5000, value_error_threshold=1e-8,
                   plot_freq=10):
    env = GridEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)
    steps = []
    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no step limit
        steps.append(rl_glue.agent.agent_message("get_steps"))
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            print(rl_glue.environment.env_message("get_grid_state"))
    rl_glue.rl_cleanup()
    values = rl_glue.agent.agent_message("get_values")
    return [values, steps]
def question_2():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    max_eps_steps = 100000
    num_episodes = 1000
    rlglue.rl_init()
    for _ in tqdm(range(num_episodes)):
        rlglue.rl_episode(max_eps_steps)
    q3_plot = rlglue.rl_agent_message("plot")
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    X, Y = np.meshgrid(q3_plot[0], q3_plot[1])
    surf = ax.plot_surface(X, Y, q3_plot[2])
    ax.set_xlim(q3_plot[0][0], q3_plot[0][-1])
    ax.set_ylim(q3_plot[1][0], q3_plot[1][-1])
    plt.show()
def question_3():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000
    steps = np.zeros([num_runs, num_episodes])
    # only 1 run
    for r in range(num_runs):
        print("1000 episode run : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
    # get the list of value functions; [X, Y, Z] represents position, velocity, state-value
    Return = rlglue.rl_agent_message(1)
    return Return
def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000
    num_action = 3

    for r in range(num_runs):
        print("run number : ", r)
        # np.random.seed(r)
        rlglue.rl_init()
        for _ in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)

    weight = rlglue.rl_agent_message('get weight')

    # algorithm from assignment
    # fout = open('value', 'w')
    steps = 50
    neg_q_hat = np.zeros((steps, steps))
    for i in range(steps):
        for j in range(steps):
            values = []
            position = -1.2 + (i * 1.7 / steps)
            velocity = -0.07 + (j * 0.14 / steps)
            for a in range(num_action):
                tile_idx = agent.plot_get_feature(position, velocity, a)
                q_hat = np.sum(weight[tile_idx])
                values.append(q_hat)
            height = np.max(values)
            neg_q_hat[j][i] = -height
            # fout.write(repr(-height) + ' ')
        # fout.write('\n')
    # fout.close()
    np.save('neg_q_hat', neg_q_hat)
def part3():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 200
    num_runs = 1
    max_eps_steps = 100000
    steps = np.zeros([num_runs, num_episodes])
    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print(steps[r, e])
    np.save('steps', steps)

    fout = open('value', 'w')
    steps = 50
    num_of_actions = 3
    for i in range(steps):
        for j in range(steps):
            q = []
            for a in range(num_of_actions):
                pos = -1.2 + (i * 1.7 / steps)
                vel = -0.07 + (j * 0.14 / steps)
                tile = (pos, vel)
                inds = agent.F(tile, a)
                q.append(np.sum(agent.weights[inds]))
            height = max(q)
            fout.write(repr(-height) + ' ')
        fout.write('\n')
    fout.close()
    np.save('heights', height)
    np.save('steps', steps)
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)
    agent_sum_reward = np.zeros((experiment_parameters["num_runs"],
                                 experiment_parameters["num_episodes"]))
    env_info = {}
    agent_info = agent_parameters
    for run in range(1, experiment_parameters["num_runs"] + 1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        agent_info["network_pickle"] = "network500.pickle"
        env_info["seed"] = run
        env_info["render"] = True
        rl_glue.rl_init(agent_info, env_info)
        for episode in tqdm(range(1, experiment_parameters["num_episodes"] + 1)):
            rl_glue.rl_episode(experiment_parameters["timeout"])
            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward
def question_3():
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    for r in range(num_runs):
        start = time.time()
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
        end = time.time()
        print(str(end - start) + " seconds elapsed")
    action_vals, pos, vel = rlglue.rl_agent_message("return info")
    action_vals = np.multiply(action_vals, -1)
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.plot_surface(pos, vel, action_vals)
    plt.show()
def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 200
    num_runs = 50
    max_eps_steps = 100000
    steps = np.zeros([num_runs, num_episodes])
    for r in range(num_runs):
        print("run number : ", r)
        st = time.time()
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print(steps[r, e])
        finish = time.time() - st
        print(str(finish) + " seconds elapsed")
    np.save('steps', steps)
from rl_glue import RLGlue
from mountain_car_env import MountainCarEnvironment
from sarsa_agent import SarsaAgent

num_runs = 10
num_episodes = 300
env_info = {"num_tiles": 8, "num_tilings": 8}
agent_info = {}
all_steps = []

agent = SarsaAgent
env = MountainCarEnvironment

for run in range(num_runs):
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)
    for episode in range(num_episodes + 1):
        rl_glue.rl_episode(15000)
        r = rl_glue.rl_agent_message("get_reward")
        print("episode:", episode, "reward:", r)
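# A minimal sketch of how the unused all_steps list above could be populated
# and the average steps per episode plotted, modeled on better_run_experiment
# earlier in this collection; it assumes rl_glue.num_steps holds the length of
# the most recent episode, as it does in that function.
import numpy as np
import matplotlib.pyplot as plt

for run in range(num_runs):
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)
    steps_per_episode = []
    for episode in range(num_episodes):
        rl_glue.rl_episode(15000)
        steps_per_episode.append(rl_glue.num_steps)
    all_steps.append(np.array(steps_per_episode))

plt.plot(np.mean(np.array(all_steps), axis=0))
plt.xlabel("Episode")
plt.ylabel("Steps per episode (averaged over {} runs)".format(num_runs))
plt.show()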
v_over_runs[episode] = []
for run in range(num_runs):
    # set seed for reproducibility
    np.random.seed(run)
    # initialize RL-Glue
    rlglue.rl_init()
    total_steps = []
    if run == 0:
        rlglue._agent.set_num_of_actions(8)
        print("running sarsa agent with 8 actions...")
    else:
        rlglue._agent.set_num_of_actions(9)
        print("running sarsa agent with 9 actions...")
    # loop over episodes
    for episode in range(num_episodes):
        # run episode with the allocated steps budget
        rlglue.rl_episode(max_steps)
        total_steps.append(rlglue._environment.get_total_steps())
    # plot
    p1, = plt.plot(total_steps, range(1, 171))
    plt.ylabel('Episodes')
    plt.xlabel('Time Steps')
    if run == 0:
        plt.title('Performance of Sarsa Agent with 8 Actions')
    else:
        plt.title('Performance of Sarsa Agent with 9 Actions')
    plt.show()
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # Sweep agent parameters
    for num_agg_states in agent_parameters["num_groups"]:
        for step_size in agent_parameters["step_size"]:

            # save rmsve at the end of each evaluation episode
            # size: num_episodes / episode_eval_frequency + 1
            # (includes evaluation at the beginning of training)
            agent_rmsve = np.zeros(
                int(experiment_parameters["num_episodes"] /
                    experiment_parameters["episode_eval_frequency"]) + 1)

            # save learned state value at the end of each run
            agent_state_val = np.zeros(environment_parameters["num_states"])

            env_info = {
                "num_states": environment_parameters["num_states"],
                "start_state": environment_parameters["start_state"],
                "left_terminal_state": environment_parameters["left_terminal_state"],
                "right_terminal_state": environment_parameters["right_terminal_state"]
            }
            agent_info = {
                "num_states": environment_parameters["num_states"],
                "num_groups": num_agg_states,
                "step_size": step_size,
                "discount_factor": environment_parameters["discount_factor"]
            }

            print('Setting - num. agg. states: {}, step_size: {}'.format(
                num_agg_states, step_size))
            os.system('sleep 0.2')

            # one agent setting
            for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                env_info["seed"] = run
                agent_info["seed"] = run
                rl_glue.rl_init(agent_info, env_info)

                # Compute initial RMSVE before training
                current_V = rl_glue.rl_agent_message("get state value")
                agent_rmsve[0] += calc_RMSVE(current_V)

                for episode in range(1, experiment_parameters["num_episodes"] + 1):
                    # run episode
                    rl_glue.rl_episode(0)  # no step limit

                    if episode % experiment_parameters["episode_eval_frequency"] == 0:
                        current_V = rl_glue.rl_agent_message("get state value")
                        agent_rmsve[int(episode /
                                        experiment_parameters["episode_eval_frequency"])] \
                            += calc_RMSVE(current_V)

                # store only one run of state value
                if run == 50:
                    agent_state_val = rl_glue.rl_agent_message("get state value")

            # rmsve averaged over runs
            agent_rmsve /= experiment_parameters["num_runs"]

            save_name = "{}_agg_states_{}_step_size_{}".format(
                'TD_agent', num_agg_states, step_size).replace('.', '')

            if not os.path.exists('results'):
                os.makedirs('results')

            # save avg. state value
            np.save("results/V_{}".format(save_name), agent_state_val)
            # save avg. rmsve
            np.save("results/RMSVE_{}".format(save_name), agent_rmsve)
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # save sum of reward at the end of each episode
    agent_sum_reward = []
    average_sum_reward = []

    env_info = environment_parameters
    agent_info = agent_parameters

    rl_glue.rl_init(agent_info, env_info)
    starting_episode = 0

    gym_name = env_info['gym_environment']
    agent_name = agent_info['name']
    save_name = "{}.npy".format(rl_glue.agent.name)
    npy_path = os.path.join(rl_glue.agent.checkpoint_dir,
                            "sum_reward_{}".format(save_name))
    fig_path = os.path.join(rl_glue.agent.checkpoint_dir, 'sum_rewards.png')

    # load checkpoint if any
    if experiment_parameters['load_checkpoint'] is not None:
        rl_glue.agent.load_checkpoint(experiment_parameters['load_checkpoint'])
        agent_sum_reward, average_sum_reward = np.load(npy_path)
        agent_sum_reward = list(agent_sum_reward)
        average_sum_reward = list(average_sum_reward)
        fname = experiment_parameters['load_checkpoint'].split(os.path.sep)[-1]
        try:
            starting_episode = int(fname.split('_')[1])
        except IndexError:
            starting_episode = len(agent_sum_reward)
        print(f"starting from episode {starting_episode}")

    for episode in tqdm(range(1 + starting_episode,
                              experiment_parameters["num_episodes"] + 1)):
        # run episode
        rl_glue.rl_episode(experiment_parameters["timeout"])
        episode_reward = rl_glue.rl_agent_message("get_sum_reward")
        agent_sum_reward.append(episode_reward)

        if episode % experiment_parameters['print_freq'] == 0:
            print('Episode {}/{} | Reward {}'.format(
                episode, experiment_parameters['num_episodes'], episode_reward))

        average = get_average(agent_sum_reward)
        average_sum_reward.append(average)

        if episode % experiment_parameters['checkpoint_freq'] == 0:
            rl_glue.agent.save_checkpoint(episode)
            savefig(agent_sum_reward, average_sum_reward, npy_path, fig_path,
                    agent_name, gym_name)

        if env_info['solved_threshold'] is not None and average >= env_info['solved_threshold']:
            print("Task Solved with reward = {}".format(episode_reward))
            rl_glue.agent.save_checkpoint(episode, solved=True)
            break

    savefig(agent_sum_reward, average_sum_reward, npy_path, fig_path,
            agent_name, gym_name)
import numpy as np
import matplotlib.pyplot as plt

from rl_glue import RLGlue
# WindygridEnvironment and SarsaAgent are assumed to be provided by local
# project modules, as in the other scripts in this collection.

if __name__ == "__main__":
    max_steps = 8000
    num_runs = 1

    # Create and pass agent and environment objects to RLGlue
    environment = WindygridEnvironment()
    agent = SarsaAgent()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore

    for run in range(num_runs):
        episode = []
        time_step = []
        rlglue.rl_init()
        while True:
            rlglue.rl_episode()
            time_step.append(rlglue.num_steps())
            episode.append(rlglue.num_episodes())
            if rlglue.num_steps() > max_steps:
                break
        plt.plot(time_step, episode, label="8 actions")
        plt.xticks([0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000])
        plt.xlabel('Time steps')
        plt.ylabel('Episode', rotation=90)
        plt.legend(loc=2)
        plt.show()
        # save average value function numpy object, to be used by plotting script
for algorithm in ["Expected Sarsa", "Q-learning"]:
    all_reward_sums[algorithm] = []
    all_state_visits[algorithm] = []
    for run in tqdm(range(num_runs)):
        agent_info["seed"] = run
        rl_glue = RLGlue(env, agents[algorithm])
        rl_glue.rl_init(agent_info, env_info)

        reward_sums = []
        state_visits = np.zeros(agent_info["num_states"])
        # last_episode_total_reward = 0
        for episode in range(num_episodes):
            start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
            if episode < num_episodes - 10:
                # Runs an episode
                rl_glue.rl_episode(0)
            else:
                # Runs an episode while keeping track of visited states
                state, action = rl_glue.rl_start()
                state_visits[state] += 1
                is_terminal = False
                while not is_terminal:
                    # # stop the program
                    # line = sys.stdin.readline()
                    # print 'line=', line
                    # if line == 'q':
                    #     sys.exit()
                    reward, state, action, is_terminal = rl_glue.rl_step()
                    state_visits[state] += 1
            reward_sums.append(rl_glue.rl_return())
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    """
    Assume environment_parameters dict contains:
    {
        input_dim: integer,
        num_actions: integer,
        discount_factor: float
    }
    Assume agent_parameters dict contains:
    {
        step_size: 1D numpy array of floats,
        tau: 1D numpy array of floats
    }
    Assume experiment_parameters dict contains:
    {
        num_runs: integer,
        num_episodes: integer
    }
    """
    ### Instantiate rl_glue from RLGlue
    rl_glue = RLGlue(environment, agent)

    os.system('sleep 1')  # to prevent tqdm printing out-of-order

    ### Initialize agent_sum_reward to zero in the form of a numpy array
    # with shape (number of values for tau, number of step-sizes,
    # number of runs, number of episodes)
    agent_sum_reward = np.zeros((len(agent_parameters["tau"]),
                                 len(agent_parameters["step_size"]),
                                 experiment_parameters["num_runs"],
                                 experiment_parameters["num_episodes"]))

    # for loop over different values of tau
    # tqdm is used to show a progress bar for completing the parameter study
    for i in tqdm(range(len(agent_parameters["tau"]))):
        # for loop over different values of the step-size
        for j in range(len(agent_parameters["step_size"])):
            ### Specify env_info
            env_info = {}

            ### Specify agent_info
            agent_info = {"num_actions": environment_parameters["num_actions"],
                          "input_dim": environment_parameters["input_dim"],
                          "discount_factor": environment_parameters["discount_factor"],
                          "tau": agent_parameters["tau"][i],
                          "step_size": agent_parameters["step_size"][j]}

            # for loop over runs
            for run in range(experiment_parameters["num_runs"]):
                # Set the seed
                agent_info["seed"] = agent_parameters["seed"] * experiment_parameters["num_runs"] + run

                # Beginning of the run
                rl_glue.rl_init(agent_info, env_info)

                for episode in range(experiment_parameters["num_episodes"]):
                    # Run episode
                    rl_glue.rl_episode(0)  # no step limit

                    ### Store sum of reward
                    agent_sum_reward[i, j, run, episode] = rl_glue.rl_agent_message("get_sum_reward")

    if not os.path.exists('results'):
        os.makedirs('results')

    save_name = "{}".format(rl_glue.agent.name).replace('.', '')

    # save sum reward
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
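# A minimal, hypothetical call of the parameter-study run_experiment above,
# with dictionary keys taken from its docstring and body. The
# LunarLanderEnvironment/Agent pair comes from other snippets in this
# collection; the concrete parameter values are illustrative assumptions only.
environment_parameters = {"input_dim": 8, "num_actions": 4, "discount_factor": 0.99}
agent_parameters = {"step_size": np.array([1e-4, 1e-3]),
                    "tau": np.array([0.001, 0.01]),
                    "seed": 0}
experiment_parameters = {"num_runs": 1, "num_episodes": 300}
run_experiment(LunarLanderEnvironment, Agent,
               environment_parameters, agent_parameters, experiment_parameters)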
    else:
        raise Exception('Invalid agent name in exp')

    i = 0
    for key in sorted_params_names:
        msg_to_send = key + ' ' + str(grid_search_params[key][combination[i]])
        print(msg_to_send)
        agent.agent_message(msg_to_send)
        i += 1

    rl_glue = RLGlue(agent_obj=agent, env_obj=env)

    for r in range(int(initial_params['num_runs'])):
        print('run: ' + str(r))
        agent.random_seed = r
        rl_glue.rl_init()
        agent.epsilon = float(initial_params['initial_epsilon'])
        for e in range(int(initial_params['total_episodes'])):
            rl_glue.rl_episode(max_episode_steps)
            agent.epsilon = compute_epsilon(current_epsilon=agent.epsilon)
            episodes_steps[j, r, e] = rl_glue.num_ep_steps()
            Q_t[j, r, e] = agent.Q
            if initial_params['agent'] != 'sarsa0':
                Phi_t[j, r, e] = agent.Phi
            if 'pies' in initial_params['agent']:
                agent.xi = compute_xi(current_xi=agent.xi, decay=agent.decay,
                                      decay_param=agent.decay_param)
        print('path length', len(agent.path))
    j += 1

# finding the best parameter setting for grid search based on AUC
best_param_set_index = np.random.choice(
    np.flatnonzero(
        np.trapz(np.mean(episodes_steps, 1)) ==
        np.trapz(np.mean(episodes_steps, 1)).min()))
        'beta_m': 0.9,
        'beta_v': 0.999,
        'epsilon': 1e-8
    },
    'replay_buffer_size': 50000,
    'minibatch_sz': 8,
    'num_replay_updates_per_step': 4,
    'gamma': 0.99,
    'tau': 0.001
}

current_env = LunarLanderEnvironment
current_agent = Agent

rlglue = RLGlue(current_env, current_agent)
env_info = {}
agent_info = agent_parameters

for run in range(1, experiment_parameters["num_runs"] + 1):
    agent_info["seed"] = run
    agent_info["network_config"]["seed"] = run
    env_info["seed"] = run

    rlglue.rl_init(agent_info, env_info)

    for episode in range(1, experiment_parameters["num_episodes"] + 1):
        rlglue.rl_episode(experiment_parameters["timeout"])
        episode_reward = rlglue.rl_agent_message("get_sum_reward")
        print("episode:", episode, " reward:", episode_reward)
from rl_glue import RLGlue
from windy_env import WindyEnvironment
from sarsa_agent import SarsaAgent
import numpy as np
import matplotlib.pyplot as plt

max_steps = 8000
steps = 0
episodes = 0
ep_list = []
step_list = []

environment = WindyEnvironment()
agent = SarsaAgent()
rl = RLGlue(environment, agent)
rl.rl_init()

while steps < max_steps:
    rl.rl_episode(max_steps)
    steps = rl.num_steps()
    episodes = rl.num_episodes()
    # print(steps, episodes)
    ep_list.append(episodes)
    step_list.append(steps)

plt.xlabel('Time steps')
plt.ylabel('Episodes')
plt.plot(step_list, ep_list)
plt.show()