def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    np.random.seed(0)
    num_episodes = 200
    see_eps = [157]
    num_runs = 1
    max_eps_steps = 100000

    # test with various step sizes (alphas) for the agent
    stepSizes = np.linspace(0.01, 1, 100)
    # best step size so far (comment out to test many)
    stepSizes = [0.559184]

    # separate run for each step size
    for step in stepSizes:
        # initialize agent and environment with the chosen step size
        rlglue.rl_init()
        rlglue.rl_agent_message('step:' + str(step))

        # keep track of the total reward for each episode
        total_rewards = []
        for ep in range(num_episodes):
            # render only selected episodes
            if ep in see_eps:
                rlglue.rl_env_message('rOFF')
            if ep + 1 in see_eps:
                rlglue.rl_env_message('rON')
                print("Episode %d" % (ep + 1))

            # initialize for the episode
            rlglue.rl_start()
            terminal = False
            total_reward = 0

            # run the episode and accumulate its total reward
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()
                total_reward += reward
            total_rewards.append(total_reward)

            # calculate the average reward of the last 100 episodes
            if ep >= 99:
                total = np.sum(total_rewards[ep - 99:ep + 1])
                avg = total / 100
                # check whether the results indicate the problem is solved
                if avg > -110:
                    print("Solved at episode %d, avg reward: %f" % (ep + 1, avg))
                    break

    # close the environment
    environment.close()
def run_experiment():
    # specify hyper-parameters
    num_runs = 1
    max_episodes = 1000000
    max_steps_per_episode = 100
    num_states = 181
    num_actions = 2
    alpha = 0.01
    eps = 0.1
    Q1 = 0

    results = np.zeros(max_episodes)
    results_run = 0

    agent = RandomAgent(num_states, num_actions, alpha, eps, Q1)
    environment = BlackJack()
    rlglue = RLGlue(environment, agent)

    print("\nPrinting one dot for every run: {0} total runs to complete".format(num_runs))

    for run in range(num_runs):
        np.random.seed(run)
        results_run = 0.0
        rlglue.rl_init()
        for e in range(1, max_episodes + 1):
            rlglue.rl_start()
            for s in range(max_steps_per_episode):
                r, _, _, terminal = rlglue.rl_step()
                results_run += r
                results[e - 1] += r
                if terminal:
                    break
            if e % 10000 == 0:
                print("\nEpisode {}: average return up to this episode is {}, and the policy is"
                      .format(e, results_run / e))
                print(rlglue.rl_agent_message("printPolicy"))
        print(".")

    print("Average return over experiment: {}".format((results / num_runs).mean()))

    # save the final policy to file -- change the file name as necessary
    with open("policy.txt", 'w') as f:
        f.write(rlglue.rl_agent_message("printPolicy"))

    # save all the experiment data for analysis -- change the file name as necessary
    save_results(results / num_runs, max_episodes, "RL_EXP_OUT.dat")
def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 200
    num_runs = 50
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])
    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            # print("Episode number: " + str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print("Number of steps: " + str(steps))
            # print(steps[r, e])
    np.save('steps', steps)
    plotGraph()

    del agent, environment, rlglue
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])
    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            print("Episode number: " + str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print("Number of steps: " + str(steps))
            # print(steps[r, e])
    # np.save('steps', steps)
    # plotGraph()
    rlglue.rl_agent_message("plot3DGraph")
def question_3():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 1000000

    for _ in range(num_runs):
        rlglue.rl_init()
        i = 0
        for i in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            print(i)

    fout = open('value', 'w')
    steps = 50
    w, iht = rlglue.rl_agent_message("ValueFunction")
    Q = np.zeros([steps, steps])
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(3):
                value = 0
                for index in tiles(iht, 8,
                                   [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                                    8 * (-0.07 + (j * 0.14 / steps)) / 0.14],
                                   [a]):
                    value -= w[index]
                values.append(value)
            height = max(values)
            fout.write(repr(height) + ' ')
            Q[j][i] = height
        fout.write('\n')
    fout.close()
    np.save("value", Q)
def main():
    num_eps = 5000
    num_runs = 10

    random.seed(0)
    np.random.seed(0)

    agent = Agent()
    env = Environment()
    rlglue = RLGlue(env, agent)
    del agent, env

    for run in range(num_runs):
        rlglue.rl_init()
        performances = []
        for ep in range(num_eps):
            rlglue.rl_start()
            # rlglue.rl_env_message('renderON')
            terminal = False
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()

            # Find the first policy that performs at 100%
            performance = testPolicy(rlglue.rl_agent_message('policy')) * 100
            performances.append(performance)
            if performance >= 100:
                # print(rlglue.rl_agent_message('policy'))
                print('Episode: %d' % (ep + 1))
                break

    plt.plot(performances)
    plt.savefig('test.png')
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # save the sum of rewards at the end of each episode
    agent_sum_reward = np.zeros((experiment_parameters["num_runs"],
                                 experiment_parameters["num_episodes"]))

    env_info = {}
    agent_info = agent_parameters

    # one agent setting
    for run in range(1, experiment_parameters["num_runs"] + 1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        env_info["seed"] = run

        rl_glue.rl_init(agent_info, env_info)

        for episode in tqdm(range(1, experiment_parameters["num_episodes"] + 1)):
            # run episode
            rl_glue.rl_episode(experiment_parameters["timeout"])

            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward

    save_name = "{}".format(rl_glue.agent.name)
    if not os.path.exists('results'):
        os.makedirs('results')
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')
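# A minimal, hypothetical call sketch for run_experiment above. The dict keys
# ("network_config", "timeout", "num_runs", "num_episodes") match what the
# function reads, but the Environment and Agent classes and every numeric value
# here are illustrative assumptions; substitute the project's own settings.
experiment_parameters = {"num_runs": 1, "num_episodes": 300, "timeout": 1000}
agent_parameters = {"network_config": {"seed": 0}}  # plus any agent-specific keys
environment_parameters = {}

run_experiment(Environment, Agent, environment_parameters, agent_parameters,
               experiment_parameters)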
def question_3():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    max_eps_steps = 100000
    num_episodes = 1000
    num_runs = 1
    numActions = 3

    rlglue.rl_init()
    for e in range(num_episodes):
        rlglue.rl_episode(max_eps_steps)

    weights = rlglue.rl_agent_message("3D plot of the cast-to-go")

    fout = open('value', 'w')
    steps = 50
    z = np.zeros((50, 50))
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(numActions):
                tile = [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                        8 * (-0.07 + (j * 0.14 / steps)) / 0.14]
                inds = agent.get_index(tile, a)
                values.append(np.sum([weights[ind] for ind in inds]))
            height = max(values)
            z[j][i] = -height
            fout.write(repr(-height) + ' ')
        fout.write('\n')
    fout.close()

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    x = np.arange(-1.2, 0.5, 1.7 / 50)
    y = np.arange(-0.07, 0.07, 0.14 / 50)
    x, y = np.meshgrid(x, y)
    ax.set_xticks([-1.2, 0.5])
    ax.set_yticks([0.07, -0.07])
    ax.set_ylabel('Velocity')
    ax.set_xlabel('Position')
    ax.set_zlabel('Cost-To-Go')
    ax.plot_surface(x, y, z)
    plt.savefig('cost-to-go.png')
    plt.show()
    np.save('steps', steps)
def question_3():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])
    # only 1 run
    for r in range(num_runs):
        print("1000 episode run : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()

    # get the list of value functions; [X, Y, Z] represents position, velocity, state-value
    Return = rlglue.rl_agent_message(1)
    return Return
def question_2():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    max_eps_steps = 100000
    num_episodes = 1000

    rlglue.rl_init()
    for _ in tqdm(range(num_episodes)):
        rlglue.rl_episode(max_eps_steps)

    q3_plot = rlglue.rl_agent_message("plot")

    fig = plt.figure()
    ax = fig.gca(projection='3d')
    X, Y = np.meshgrid(q3_plot[0], q3_plot[1])
    surf = ax.plot_surface(X, Y, q3_plot[2])
    ax.set_xlim(q3_plot[0][0], q3_plot[0][-1])
    ax.set_ylim(q3_plot[1][0], q3_plot[1][-1])
    plt.show()
def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000
    num_action = 3

    for r in range(num_runs):
        print("run number : ", r)
        # np.random.seed(r)
        rlglue.rl_init()
        for _ in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)

    weight = rlglue.rl_agent_message('get weight')

    # algorithm from the assignment
    # fout = open('value', 'w')
    steps = 50
    neg_q_hat = np.zeros((steps, steps))
    for i in range(steps):
        for j in range(steps):
            values = []
            position = -1.2 + (i * 1.7 / steps)
            velocity = -0.07 + (j * 0.14 / steps)
            for a in range(num_action):
                tile_idx = agent.plot_get_feature(position, velocity, a)
                q_hat = np.sum(weight[tile_idx])
                values.append(q_hat)
            height = np.max(values)
            neg_q_hat[j][i] = -height
            # fout.write(repr(-height) + ' ')
        # fout.write('\n')
    # fout.close()
    np.save('neg_q_hat', neg_q_hat)
def question_3():
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    for r in range(num_runs):
        start = time.time()
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
        end = time.time()
        print(str(end - start) + " seconds elapsed")

    action_vals, pos, vel = rlglue.rl_agent_message("return info")
    action_vals = np.multiply(action_vals, -1)

    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.plot_surface(pos, vel, action_vals)
    plt.show()
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    agent_sum_reward = np.zeros((experiment_parameters["num_runs"],
                                 experiment_parameters["num_episodes"]))

    env_info = {}
    agent_info = agent_parameters

    for run in range(1, experiment_parameters["num_runs"] + 1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        agent_info["network_pickle"] = "network500.pickle"
        env_info["seed"] = run
        env_info["render"] = True

        rl_glue.rl_init(agent_info, env_info)

        for episode in tqdm(range(1, experiment_parameters["num_episodes"] + 1)):
            rl_glue.rl_episode(experiment_parameters["timeout"])

            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward
from rl_glue import RLGlue
from mountain_car_env import MountainCarEnvironment
from sarsa_agent import SarsaAgent

num_runs = 10
num_episodes = 300
env_info = {"num_tiles": 8, "num_tilings": 8}
agent_info = {}
all_steps = []

agent = SarsaAgent
env = MountainCarEnvironment

for run in range(num_runs):
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)

    for episode in range(num_episodes + 1):
        rl_glue.rl_episode(15000)
        r = rl_glue.rl_agent_message("get_reward")
        print("episode:", episode, "reward:", r)
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for actor_ss in agent_parameters["actor_step_size"]:
                for critic_ss in agent_parameters["critic_step_size"]:
                    for avg_reward_ss in agent_parameters["avg_reward_step_size"]:

                        env_info = {}
                        agent_info = {
                            "num_tilings": num_tilings,
                            "num_tiles": num_tiles,
                            "actor_step_size": actor_ss,
                            "critic_step_size": critic_ss,
                            "avg_reward_step_size": avg_reward_ss,
                            "num_actions": agent_parameters["num_actions"],
                            "iht_size": agent_parameters["iht_size"]
                        }

                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))

                        # using tqdm we visualize progress bars
                        for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run

                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()

                            num_steps = 0
                            total_return = 0.
                            return_arr = []

                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0

                            while num_steps < experiment_parameters['max_steps']:
                                num_steps += 1

                                rl_step_result = rl_glue.rl_step()
                                reward = rl_step_result[0]

                                total_return += reward
                                return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message("get avg reward")

                                exp_avg_reward_normalizer = exp_avg_reward_normalizer + exp_avg_reward_ss * (1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward - exp_avg_reward)

                                return_per_step[run - 1][num_steps - 1] = total_return
                                exp_avg_reward_per_step[run - 1][num_steps - 1] = exp_avg_reward

                        if not os.path.exists('results'):
                            os.makedirs('results')

                        save_name = "ActorCriticSoftmax_tilings_{}_tiledim_{}_actor_ss_{}_critic_ss_{}_avg_reward_ss_{}".format(
                            num_tilings, num_tiles, actor_ss, critic_ss, avg_reward_ss)
                        total_return_filename = "results/{}_total_return.npy".format(save_name)
                        exp_avg_reward_filename = "results/{}_exp_avg_reward.npy".format(save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename, exp_avg_reward_per_step)
        'beta_m': 0.9,
        'beta_v': 0.999,
        'epsilon': 1e-8
    },
    'replay_buffer_size': 50000,
    'minibatch_sz': 8,
    'num_replay_updates_per_step': 4,
    'gamma': 0.99,
    'tau': 0.001
}

current_env = LunarLanderEnvironment
current_agent = Agent

rlglue = RLGlue(current_env, current_agent)
env_info = {}
agent_info = agent_parameters

for run in range(1, experiment_parameters["num_runs"] + 1):
    agent_info["seed"] = run
    agent_info["network_config"]["seed"] = run
    env_info["seed"] = run

    rlglue.rl_init(agent_info, env_info)

    for episode in range(1, experiment_parameters["num_episodes"] + 1):
        rlglue.rl_episode(experiment_parameters["timeout"])
        episode_reward = rlglue.rl_agent_message("get_sum_reward")
        print("episode:", episode, " reward:", episode_reward)
from rl_glue import RLGlue
from sarsa_agent import sarsaAgent
from windygridworld_env import windyGridenv
import matplotlib.pyplot as plt

if __name__ == "__main__":
    max_steps = 8000
    num_runs = 10

    for n in range(4):
        # Create and pass agent and environment objects to RLGlue
        environment = windyGridenv()
        agent = sarsaAgent()
        rlglue = RLGlue(environment, agent)

        if n == 0:
            rlglue.rl_agent_message("alpha = 0.3")
            message = "alpha = 0.3"
        elif n == 1:
            rlglue.rl_agent_message("alpha = 0.5")
            message = "alpha = 0.5"
        elif n == 2:
            rlglue.rl_agent_message("alpha = 0.7")
            message = "alpha = 0.7"
        else:
            rlglue.rl_agent_message("alpha = 0.9")
            message = "alpha = 0.9"

        rlglue.rl_agent_message("epsilon = 0.1")
        rlglue.rl_agent_message("4")
        del agent, environment  # don't use these anymore

        time_steps = []
    # np.random.seed(count)
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            Q = rlglue.rl_agent_message('Q')
            print(Q)
            done = True

    V, state, env_map = learn_step()
    draw_step(V, state, env_map)
    pygame.display.flip()


if __name__ == "__main__":
    # main()
    rlglue.rl_init()
    while not done:
        run()
        count += 1
        if time_step % 100 == 1:
            count_episode = rlglue.rl_agent_message('COUNT')
            print('time_step: {:d}, count: {:d}'.format(time_step, count_episode))
        time_step += 1
        # print(count)
        # sleep(0.5)
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # save the sum of rewards at the end of each episode
    agent_sum_reward = []
    average_sum_reward = []

    env_info = environment_parameters
    agent_info = agent_parameters

    rl_glue.rl_init(agent_info, env_info)

    starting_episode = 0
    gym_name = env_info['gym_environment']
    agent_name = agent_info['name']
    save_name = "{}.npy".format(rl_glue.agent.name)
    npy_path = os.path.join(rl_glue.agent.checkpoint_dir, "sum_reward_{}".format(save_name))
    fig_path = os.path.join(rl_glue.agent.checkpoint_dir, 'sum_rewards.png')

    # load checkpoint, if any
    if experiment_parameters['load_checkpoint'] is not None:
        rl_glue.agent.load_checkpoint(experiment_parameters['load_checkpoint'])
        agent_sum_reward, average_sum_reward = np.load(npy_path)
        agent_sum_reward = list(agent_sum_reward)
        average_sum_reward = list(average_sum_reward)
        fname = experiment_parameters['load_checkpoint'].split(os.path.sep)[-1]
        try:
            starting_episode = int(fname.split('_')[1])
        except IndexError:
            starting_episode = len(agent_sum_reward)
        print(f"starting from episode {starting_episode}")

    for episode in tqdm(range(1 + starting_episode,
                              experiment_parameters["num_episodes"] + 1)):
        # run episode
        rl_glue.rl_episode(experiment_parameters["timeout"])

        episode_reward = rl_glue.rl_agent_message("get_sum_reward")
        agent_sum_reward.append(episode_reward)

        if episode % experiment_parameters['print_freq'] == 0:
            print('Episode {}/{} | Reward {}'.format(
                episode, experiment_parameters['num_episodes'], episode_reward))

        average = get_average(agent_sum_reward)
        average_sum_reward.append(average)

        if episode % experiment_parameters['checkpoint_freq'] == 0:
            rl_glue.agent.save_checkpoint(episode)
            savefig(agent_sum_reward, average_sum_reward, npy_path, fig_path,
                    agent_name, gym_name)

        if env_info['solved_threshold'] is not None and average >= env_info['solved_threshold']:
            print("Task Solved with reward = {}".format(episode_reward))
            rl_glue.agent.save_checkpoint(episode, solved=True)
            break

    savefig(agent_sum_reward, average_sum_reward, npy_path, fig_path, agent_name, gym_name)
np.save("ground_truth.npy", truth)

num_episodes = 100000
v = np.zeros(1000)

environment = Environment()
agent = Gradient_MC()
rlglue = RLGlue(environment, agent)
del agent, environment  # don't use these anymore

rlglue.rl_init()
for episode in tqdm(range(num_episodes)):
    rlglue.rl_episode()

aggregated_v = rlglue.rl_agent_message("ValueFunction")
distribution = rlglue.rl_agent_message("distribution")
for i in range(1000):
    v[i] = aggregated_v[i // (1000 // aggregated_v.shape[0])]

x = np.arange(1000)
# plt.plot(x, truth)
# plt.plot(x, v)
# plt.show()

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.set_xlabel('State')
ax1.set_ylabel('Value Scale')
        print("run number: {}\n".format(run))
        # set seed for reproducibility
        np.random.seed(run)
        # initialize RL-Glue
        rlglue.rl_init()
        # loop over episodes
        for episode in range(num_episodes):
            # print("episode {}".format(episode))
            # run episode with the allocated steps budget
            rlglue.rl_episode()
            if episode % 10 == 0:
                V_hat = rlglue.rl_agent_message("Estimate value function")
                # print(V_hat)
                # vhat_arr.append(V_hat)
                # reference: https://stackoverflow.com/questions/21926020/how-to-calculate-rmse-using-ipython-numpy
                RMSE = np.sqrt(np.mean((Vs - V_hat)**2))
                # print(RMSE)
                result[int(episode / 10)] += RMSE

    result = result / num_runs
    output.append(result)
    print('total time of executing 30 runs with {} agent is {:3}s'.format(
        item, time.time() - start_time))
    print(result)

# np.savez('randomwalk.npz', tabular=output[0], tile_coding=output[1])
np.save('randomwalk', output)
def main():
    # Seed rng's for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    agent = Agent(env.get_actions())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000

    # initialize rlglue
    rlglue.rl_init()

    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None

    # Run through each episode
    # rlglue.rl_env_message('renderON')
    # for ep in range(num_eps):
    ep = 0
    while ep < num_eps:
        ep += 1
        # if ep % int(num_eps / 10) == 0:
        #     print('ep:', ep, 'bestpolicy', max_reward)

        # start episode
        rlglue.rl_start()
        rewards = 0
        steps = 1

        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        avg_reward = rewards
        avg_rewards.append(avg_reward)
        if rewards > max_reward:
            max_reward = rewards
            best_policy = rlglue.rl_agent_message('policy')
            pickle.dump(best_policy, open("policy.pickle", "wb"))
            print('ep', ep, 'reward', avg_reward)
        # print('ep:', ep, 'avg reward:', avg_reward, 'steps:', steps)
        # print(rlglue.rl_agent_message('policy'))
        # input()

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')

    # Get generated policy
    policy = rlglue.rl_agent_message('policy')

    # Test policy
    result = testPolicy(best_policy)
from rl_glue import RLGlue
from grid_env import GridEnvironment
from dynaq_agent import DynaqAgent
import numpy as np
import time
import matplotlib.pyplot as plt

if __name__ == "__main__":
    start_time = time.time()
    max_steps = 8000
    num_runs = 10
    num_episodes = 50

    # Create and pass agent and environment objects to RLGlue
    environment = GridEnvironment()
    agent = DynaqAgent()
    rlglue = RLGlue(environment, agent)
    rlglue.rl_agent_message('n = 0')
    del agent, environment  # don't use these anymore

    steps1 = {}
    L1 = []
    L2 = [0] * num_episodes
    for episode in range(num_episodes):
        steps1[episode] = []
        L1.append(episode + 1)

    for run in range(num_runs):
        np.random.seed(run)
        rlglue.rl_init()
        step = 0
        for episode in range(num_episodes):
            rlglue.rl_episode(max_steps)
            new_step = rlglue.num_steps()
            steps1[episode].append(new_step - step)
    # set seed for reproducibility
    np.random.seed(run)
    # initialize RL-Glue
    rlglue.rl_init()
    # loop over episodes
    for episode in range(num_episodes):
        print("episode {}".format(episode))
        # run episode with the allocated steps budget
        rlglue.rl_episode(max_steps)
        # if the episode is one of the key episodes, extract and save the value function
        if episode in key_episodes:
            V = np.fromstring(rlglue.rl_agent_message('ValueFunction'), dtype='float')
            v_over_runs[episode].append(V)

print('total time of executing 10 runs is {:3}s'.format(time.time() - start_time))

# extract length of key_episodes
n_valueFunc = len(key_episodes)
# extract the number of states via the length of a particular value function
n = v_over_runs[key_episodes[0]][0].shape[0]
# initialize data structure for the average value function at key_episodes
average_v_over_runs = np.zeros((n_valueFunc, n))

# average across runs at various episodes, to estimate the average value
# function at each episode
for i, episode in enumerate(key_episodes):
    # each item in v_over_runs[episode] is a list (one item per run),
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    """
    Assume environment_parameters dict contains:
    {
        input_dim: integer,
        num_actions: integer,
        discount_factor: float
    }
    Assume agent_parameters dict contains:
    {
        step_size: 1D numpy array of floats,
        tau: 1D numpy array of floats
    }
    Assume experiment_parameters dict contains:
    {
        num_runs: integer,
        num_episodes: integer
    }
    """
    ### Instantiate rl_glue from RLGlue
    rl_glue = RLGlue(environment, agent)

    os.system('sleep 1')  # to prevent tqdm printing out-of-order

    ### Initialize agent_sum_reward to zero in the form of a numpy array
    # with shape (number of values for tau, number of step-sizes, number of runs, number of episodes)
    agent_sum_reward = np.zeros((len(agent_parameters["tau"]),
                                 len(agent_parameters["step_size"]),
                                 experiment_parameters["num_runs"],
                                 experiment_parameters["num_episodes"]))

    # for loop over different values of tau
    # tqdm is used to show a progress bar for completing the parameter study
    for i in tqdm(range(len(agent_parameters["tau"]))):
        # for loop over different values of the step-size
        for j in range(len(agent_parameters["step_size"])):
            ### Specify env_info
            env_info = {}

            ### Specify agent_info
            agent_info = {"num_actions": environment_parameters["num_actions"],
                          "input_dim": environment_parameters["input_dim"],
                          "discount_factor": environment_parameters["discount_factor"],
                          "tau": agent_parameters["tau"][i],
                          "step_size": agent_parameters["step_size"][j]}

            # for loop over runs
            for run in range(experiment_parameters["num_runs"]):
                # Set the seed
                agent_info["seed"] = agent_parameters["seed"] * experiment_parameters["num_runs"] + run

                # Beginning of the run
                rl_glue.rl_init(agent_info, env_info)

                for episode in range(experiment_parameters["num_episodes"]):
                    # Run episode
                    rl_glue.rl_episode(0)  # no step limit

                    ### Store sum of reward
                    agent_sum_reward[i, j, run, episode] = rl_glue.rl_agent_message("get_sum_reward")

    if not os.path.exists('results'):
        os.makedirs('results')

    save_name = "{}".format(rl_glue.agent.name).replace('.', '')

    # save sum reward
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
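# Hypothetical usage sketch for the parameter study above. The Environment and
# Agent classes and all values below are assumptions chosen for illustration;
# only the dict keys follow the docstring and the code in run_experiment.
environment_parameters = {"input_dim": 8, "num_actions": 4, "discount_factor": 0.99}
agent_parameters = {"step_size": np.array([1e-3, 1e-4]),
                    "tau": np.array([0.1, 0.01]),
                    "seed": 0}
experiment_parameters = {"num_runs": 5, "num_episodes": 100}

run_experiment(Environment, Agent, environment_parameters, agent_parameters,
               experiment_parameters)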
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for update_ss in agent_parameters["update_step_size"]:
                for avg_reward_ss in agent_parameters["avg_reward_step_size"]:
                    for epsilon in agent_parameters["epsilon"]:

                        env_info = {}
                        agent_info = {"num_tilings": num_tilings,
                                      "num_tiles": num_tiles,
                                      "alpha": update_ss,
                                      "avg_reward_step_size": avg_reward_ss,
                                      "epsilon": epsilon,
                                      "num_actions": agent_parameters["num_actions"],
                                      "iht_size": agent_parameters["iht_size"]}

                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))

                        # using tqdm we visualize progress bars
                        avg_reward_list = []
                        avg_reward = -10000

                        for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run

                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()

                            num_steps = 0
                            total_return = 0.
                            # return_arr = []

                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0

                            while num_steps < experiment_parameters['max_steps']:
                                num_steps += 1

                                rl_step_result = rl_glue.rl_step()
                                reward = rl_step_result[0]

                                total_return += reward
                                # return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message("get avg reward")

                                exp_avg_reward_normalizer = exp_avg_reward_normalizer + exp_avg_reward_ss * (1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward - exp_avg_reward)

                                return_per_step[run - 1][num_steps - 1] = total_return
                                exp_avg_reward_per_step[run - 1][num_steps - 1] = exp_avg_reward

                            avg_reward_list.append(avg_reward)

                        print(np.average(avg_reward_list))

                        if not os.path.exists('results_sarsa'):
                            os.makedirs('results_sarsa')

                        save_name = "semi-gradient_sarsa_tilings_{}_tiledim_{}_update_ss_{}_epsilon_ss_{}_avg_reward_ss_{}_max_steps_{}".format(
                            num_tilings, num_tiles, update_ss, epsilon, avg_reward_ss,
                            experiment_parameters["max_steps"])
                        total_return_filename = "results_sarsa/{}_total_return.npy".format(save_name)
                        exp_avg_reward_filename = "results_sarsa/{}_exp_avg_reward.npy".format(save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename, exp_avg_reward_per_step)
else:
    truth = ground_truth()
    np.save("ground_truth.npy", truth)

num_episodes = 2000
num_runs = 30
rmse_tabular = np.zeros(num_episodes // 10)
rmse_tile = np.zeros(num_episodes // 10)

environment = Environment()
agent = TD_0()
rlglue = RLGlue(environment, agent)
del agent, environment  # don't use these anymore

rlglue.rl_agent_message("tabular")
for run in tqdm(range(num_runs)):
    np.random.seed(run)
    rlglue.rl_init()
    for episode in range(num_episodes):
        rlglue.rl_episode()
        if episode % 10 == 0:
            v = rlglue.rl_agent_message("ValueFunction")
            rmse_tabular[episode // 10] += np.sqrt(np.mean((truth - v)**2))

rlglue.rl_agent_message("tile")
for run in tqdm(range(num_runs)):
    np.random.seed(run)
    rlglue.rl_init()
    for episode in range(num_episodes):
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # Sweep agent parameters
    for num_agg_states in agent_parameters["num_groups"]:
        for step_size in agent_parameters["step_size"]:

            # save rmsve at the end of each evaluation episode
            # size: num_episodes / episode_eval_frequency + 1
            # (includes evaluation at the beginning of training)
            agent_rmsve = np.zeros(
                int(experiment_parameters["num_episodes"] / experiment_parameters["episode_eval_frequency"]) + 1)

            # save the learned state values at the end of each run
            agent_state_val = np.zeros(environment_parameters["num_states"])

            env_info = {
                "num_states": environment_parameters["num_states"],
                "start_state": environment_parameters["start_state"],
                "left_terminal_state": environment_parameters["left_terminal_state"],
                "right_terminal_state": environment_parameters["right_terminal_state"]
            }

            agent_info = {
                "num_states": environment_parameters["num_states"],
                "num_groups": num_agg_states,
                "step_size": step_size,
                "discount_factor": environment_parameters["discount_factor"]
            }

            print('Setting - num. agg. states: {}, step_size: {}'.format(
                num_agg_states, step_size))
            os.system('sleep 0.2')

            # one agent setting
            for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                env_info["seed"] = run
                agent_info["seed"] = run
                rl_glue.rl_init(agent_info, env_info)

                # Compute initial RMSVE before training
                current_V = rl_glue.rl_agent_message("get state value")
                agent_rmsve[0] += calc_RMSVE(current_V)

                for episode in range(1, experiment_parameters["num_episodes"] + 1):
                    # run episode
                    rl_glue.rl_episode(0)  # no step limit

                    if episode % experiment_parameters["episode_eval_frequency"] == 0:
                        current_V = rl_glue.rl_agent_message("get state value")
                        agent_rmsve[int(episode / experiment_parameters["episode_eval_frequency"])] += calc_RMSVE(current_V)

                # store only one run of state values
                if run == 50:
                    agent_state_val = rl_glue.rl_agent_message("get state value")

            # rmsve averaged over runs
            agent_rmsve /= experiment_parameters["num_runs"]

            save_name = "{}_agg_states_{}_step_size_{}".format(
                'TD_agent', num_agg_states, step_size).replace('.', '')

            if not os.path.exists('results'):
                os.makedirs('results')

            # save avg. state value
            np.save("results/V_{}".format(save_name), agent_state_val)

            # save avg. rmsve
            np.save("results/RMSVE_{}".format(save_name), agent_rmsve)
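# Illustrative parameter dicts for the state-aggregation sweep above. All values
# are assumptions (a hypothetical 500-state random walk), not the original
# experiment's settings; only the keys match what run_experiment reads.
environment_parameters = {"num_states": 500, "start_state": 250,
                          "left_terminal_state": 0, "right_terminal_state": 501,
                          "discount_factor": 1.0}
agent_parameters = {"num_groups": [10, 100], "step_size": [0.01, 0.1]}
experiment_parameters = {"num_runs": 50, "num_episodes": 2000,
                         "episode_eval_frequency": 10}

run_experiment(Environment, Agent, environment_parameters, agent_parameters,
               experiment_parameters)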
from rl_glue import RLGlue
from windy_env import WindyEnvironment
from n_step_sarsa_agent import SarsaAgent
import numpy as np
import time
import matplotlib.pyplot as plt

if __name__ == "__main__":
    start_time = time.time()
    max_steps = 8000

    # Create and pass agent and environment objects to RLGlue
    environment = WindyEnvironment()
    agent = SarsaAgent()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore

    rlglue.rl_init()
    L1 = []
    L2 = []
    n = rlglue.rl_agent_message('n')
    a = rlglue.rl_agent_message('a')

    while rlglue.num_steps() < max_steps:
        L1.append(rlglue.num_steps())
        rlglue.rl_episode(10000)
        episodes = rlglue.num_episodes()
        L2.append(episodes)

    plt.title(str(n) + '-step sarsa with ' + str(a) + " actions")
    plt.plot(L1, L2)
    plt.show()
    print("training process with {} planning steps".format(ite))

    # Create and pass agent and environment objects to RLGlue
    environment = DynaQEnvironment()
    agent = DynaQAgent(ite)
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore

    for run in range(num_runs):
        print("run number: {}\n".format(run))
        # set seed for reproducibility
        np.random.seed(run)
        # initialize RL-Glue
        rlglue.rl_init()
        # loop over episodes
        for episode in range(num_episodes):
            rlglue.rl_episode()
            result[episode] += rlglue.num_ep_steps()
            data = rlglue.rl_agent_message("Q for all states in the episode")
            Q.append(data)

    result = result / num_runs
    output.append(result)

np.save("output", output)
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # save rmsve at the end of each episode
    agent_rmsve = np.zeros(
        (experiment_parameters["num_runs"],
         int(experiment_parameters["num_episodes"] / experiment_parameters["episode_eval_frequency"]) + 1))

    # save the learned state values at the end of each run
    agent_state_val = np.zeros((experiment_parameters["num_runs"],
                                environment_parameters["num_states"]))

    env_info = {
        "num_states": environment_parameters["num_states"],
        "start_state": environment_parameters["start_state"],
        "left_terminal_state": environment_parameters["left_terminal_state"],
        "right_terminal_state": environment_parameters["right_terminal_state"]
    }

    agent_info = {
        "num_states": environment_parameters["num_states"],
        "num_hidden_layer": agent_parameters["num_hidden_layer"],
        "num_hidden_units": agent_parameters["num_hidden_units"],
        "step_size": agent_parameters["step_size"],
        "discount_factor": environment_parameters["discount_factor"],
        "beta_m": agent_parameters["beta_m"],
        "beta_v": agent_parameters["beta_v"],
        "epsilon": agent_parameters["epsilon"]
    }

    print('Setting - Neural Network with 100 hidden units')
    os.system('sleep 1')

    # one agent setting
    for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
        env_info["seed"] = run
        agent_info["seed"] = run
        rl_glue.rl_init(agent_info, env_info)

        # Compute initial RMSVE before training
        current_V = rl_glue.rl_agent_message("get state value")
        agent_rmsve[run - 1, 0] = calc_RMSVE(current_V)

        for episode in range(1, experiment_parameters["num_episodes"] + 1):
            # run episode
            rl_glue.rl_episode(0)  # no step limit

            if episode % experiment_parameters["episode_eval_frequency"] == 0:
                current_V = rl_glue.rl_agent_message("get state value")
                agent_rmsve[run - 1,
                            int(episode / experiment_parameters["episode_eval_frequency"])] = calc_RMSVE(current_V)
            elif episode == experiment_parameters["num_episodes"]:  # if last episode
                current_V = rl_glue.rl_agent_message("get state value")

        agent_state_val[run - 1, :] = current_V

    save_name = "{}".format(rl_glue.agent.name).replace('.', '')

    if not os.path.exists('results'):
        os.makedirs('results')

    # save avg. state value
    np.save("results/V_{}".format(save_name), agent_state_val)

    # save avg. rmsve
    np.savez("results/RMSVE_{}".format(save_name),
             rmsve=agent_rmsve,
             eval_freq=experiment_parameters["episode_eval_frequency"],
             num_episodes=experiment_parameters["num_episodes"])