def fig_10_2():
  """Figure 10.2: Mountain Car steps per episode for several step sizes."""
  fig, ax = plt.subplots()
  qhat, nab_qhat = get_qhats(N_TIL, N_TLGS)
  for alp in FIG_10_2_ALP_L:
    print(f"[alpha={alp}]")
    tot_n_steps = np.zeros(FIG_10_2_N_EP)
    for seed in range(FIG_10_2_N_RUNS):
      print(f"[RUN #{seed}]")
      alg = EpisodicSemiGradientTD0(MountainCar(), alp, N_TIL * N_TLGS, eps=0)
      alg.seed(seed)
      tot_n_steps += np.array(
        alg.pol_eva(qhat, nab_qhat, FIG_10_2_N_EP, FIG_10_2_G))
    plt.plot(tot_n_steps / FIG_10_2_N_RUNS, label=f'alpha={alp}')
  plt.yscale('log')
  xticks, yticks = [0, 500], [100, 200, 400, 1000]
  plot_figure(ax, 'Figure 10.2', xticks, xticks, 'Episode', yticks, yticks,
              'Steps\nper episode\n(log scale)')
  fig.set_size_inches(20, 14)
  plt.legend()
  save_plot('fig10.2', dpi=100)
  plt.show()
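# --- Hypothetical sketch (not part of the repository) ----------------------
# get_qhats(N_TIL, N_TLGS) is defined elsewhere.  The stand-in below shows one
# plausible shape for what it could return: a linear action-value estimate
# over one-hot state-aggregation features and its gradient, in the
# (state, action, weights) form used in Sutton & Barto.  The feature layout,
# bounds and signatures are assumptions; numpy is assumed imported as np, as
# elsewhere in this file.
def get_qhats_sketch(n_bins, n_actions=3):
  pos_lo, pos_hi = -1.2, 0.5       # Mountain Car position bounds
  vel_lo, vel_hi = -0.07, 0.07     # Mountain Car velocity bounds

  def features(s, a):
    """One-hot vector with a single active cell for (state bin, action)."""
    pos, vel = s
    i = min(max(int((pos - pos_lo) / (pos_hi - pos_lo) * n_bins), 0),
            n_bins - 1)
    j = min(max(int((vel - vel_lo) / (vel_hi - vel_lo) * n_bins), 0),
            n_bins - 1)
    x = np.zeros(n_actions * n_bins * n_bins)
    x[(a * n_bins + i) * n_bins + j] = 1.0
    return x

  def qhat(s, a, w):
    return np.dot(w, features(s, a))    # linear value estimate

  def nab_qhat(s, a, w):
    return features(s, a)               # gradient of a linear form is x

  return qhat, nab_qhat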
def mountain_car(self, i):
    qrl = MountainCar(env_name='MountainCar-v0', learning_rate=0.97**i,
                      discount=0.99**i, iterations=1000)
    qrl.run()
    num_rewards = len(qrl.rewards)
    return self.rolling_mean(qrl.rewards, window=num_rewards // 100,
                             strides=num_rewards // 50)
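# --- Hypothetical sketch (not part of the class) ----------------------------
# rolling_mean is not shown in this excerpt; the stand-in below is one
# plausible implementation of the (data, window, strides) signature used
# above: average a `window`-long slice every `strides` samples.  Its
# behaviour is an assumption, not the class's actual code; numpy is assumed
# imported as np.
def rolling_mean_sketch(self, data, window, strides):
    data = np.asarray(data, dtype=float)
    window = max(int(window), 1)
    strides = max(int(strides), 1)
    starts = range(0, max(len(data) - window + 1, 1), strides)
    return np.array([data[s:s + window].mean() for s in starts])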
def fig_10_4():
  """Figure 10.4: effect of alpha and n on early n-step Sarsa performance."""
  fig, ax = plt.subplots()
  qhat, nab_qhat = get_qhats(N_TIL, N_TLGS)
  alg = nStepSemiGradSarsa(MountainCar(), 0, N_TIL * N_TLGS, 0, 0)
  for n in FIG_10_4_ALP_BND:
    alg.n = n
    print(f"[n={n}]")
    steps_l = []
    alpha_l = np.linspace(*FIG_10_4_ALP_BND[n], FIG_10_4_ALP_PTS)
    for alpha in alpha_l:
      alg.a = alpha / N_TLGS
      print(f"[alpha={alg.a}]")
      tot_steps = 0
      for seed in range(FIG_10_4_N_RUNS):
        print(f"[RUN #{seed}]")
        alg.reset()
        alg.seed(seed)
        for ep in range(FIG_10_4_N_EP):
          tot_steps += alg.pol_eva(None, qhat, nab_qhat, 1, FIG_10_4_G,
                                   max_steps=1000)[0]
      steps_l.append(tot_steps / (FIG_10_4_N_RUNS * FIG_10_4_N_EP))
    plt.plot(alpha_l, steps_l, label=f'n={n}')
  xticks, yticks = np.linspace(0, 1.5, 4), np.linspace(220, 300, 5)
  left_title = (f'Mountain Car\nSteps per\nepisode\n(averaged\nover first\n'
                f'{FIG_10_4_N_EP} episodes\nand {FIG_10_4_N_RUNS} runs)')
  plot_figure(ax, 'Figure 10.4', list(xticks) + [1.8], xticks,
              f'alpha * number of tilings ({N_TLGS})', yticks, yticks,
              left_title, labelpad=20)
  fig.set_size_inches(20, 14)
  plt.legend()
  save_plot('fig10.4', dpi=100)
  plt.show()
def fig_12_11():
  """Figure 12.11: reward per episode vs. alpha for several algorithms."""
  fig, ax = plt.subplots()
  F, qhat = get_fn_mc(N_TIL, N_TLGS)
  for alg_name in FIG_12_11_ALG_STR.keys():
    steps_l = []
    alpha_l = np.linspace(*FIG_12_11_ALP_BND[alg_name], FIG_12_11_N_PTS)
    for alpha in alpha_l:
      alg = alg_name(MountainCar(), alpha / N_TLGS, N_TIL * N_TLGS,
                     FIG_12_11_LAM, F, qhat, FIG_12_11_EPS, FIG_12_11_G)
      print(f"[ALPHA={alg.a}]")
      tot_steps = 0
      for seed in range(FIG_12_11_N_RUNS):
        print(f"[RUN #{seed}]")
        alg.reset()
        alg.seed(seed)
        for ep in range(FIG_12_11_N_EP):
          tot_steps += alg.pol_eva(None, 1, max_steps=FIG_12_11_MAX_STEPS)[0]
      steps_l.append(tot_steps / (FIG_12_11_N_RUNS * FIG_12_11_N_EP))
    plt.plot(alpha_l, -np.array(steps_l), label=FIG_12_11_ALG_STR[alg_name])
  xticks, yticks = np.linspace(0.2, 2, 10), np.linspace(-550, -150, 9)
  xnames = [str(x)[:3] for x in xticks]
  left_title = (f'Mountain Car\nReward per\nepisode\n(averaged\nover first\n'
                f'{FIG_12_11_N_EP} episodes\n{FIG_12_11_N_RUNS} runs)')
  plot_figure(ax, 'Figure 12.11', xticks, xnames,
              f'alpha * number of tilings ({N_TLGS})', yticks, yticks,
              left_title, labelpad=45)
  fig.set_size_inches(20, 14)
  plt.legend()
  save_plot('fig12.11', dpi=100)
  plt.show()
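# --- Hypothetical sketch (not part of the repository) ----------------------
# get_fn_mc(N_TIL, N_TLGS) is defined elsewhere.  The Sarsa(lambda) agents
# above appear to use F to obtain the indices of the active (binary) tile
# features and qhat to sum the corresponding weights, as in the binary-feature
# Sarsa(lambda) of Sutton & Barto.  The grid tile coder below is a simplified
# stand-in: the index layout, bounds and signatures are assumptions and need
# not match the repository's N_TIL * N_TLGS weight dimension.
def get_fn_mc_sketch(n_tiles, n_tilings, n_actions=3):
  pos_lo, pos_hi = -1.2, 0.5
  vel_lo, vel_hi = -0.07, 0.07
  per_tiling = n_tiles * n_tiles

  def F(s, a):
    """Indices of the active tiles (one per tiling) for state s, action a."""
    pos, vel = s
    idxs = []
    for t in range(n_tilings):
      # Each tiling is shifted by a fraction of one tile width.
      p = (pos - pos_lo) / (pos_hi - pos_lo) * n_tiles + t / n_tilings
      v = (vel - vel_lo) / (vel_hi - vel_lo) * n_tiles + t / n_tilings
      i = int(min(max(p, 0), n_tiles - 1))
      j = int(min(max(v, 0), n_tiles - 1))
      idxs.append((a * n_tilings + t) * per_tiling + i * n_tiles + j)
    return idxs

  def qhat(s, a, w):
    """Linear value estimate: sum of the weights of the active tiles."""
    return sum(w[i] for i in F(s, a))

  return F, qhat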
def fig_12_10():
  """Figure 12.10: Sarsa(lambda) on Mountain Car for several lambda values."""
  fig, ax = plt.subplots()
  for lam in FIG_12_10_LAM_L:
    print(f"[LAM={lam}]")
    steps_l = []
    alpha_l = np.linspace(FIG_12_10_ALP_MIN, FIG_12_10_ALP_MAX,
                          FIG_12_10_N_PTS)
    for alpha in alpha_l:
      F, qhat = get_fn_mc(N_TIL, N_TLGS)
      alg = SarsaLam(MountainCar(), alpha / N_TLGS, N_TIL * N_TLGS, lam,
                     F, qhat, FIG_12_10_EPS, FIG_12_10_G)
      print(f"[ALPHA={alg.a}]")
      tot_steps = 0
      for seed in range(FIG_12_10_N_RUNS):
        print(f"[RUN #{seed}]")
        alg.reset()
        alg.seed(seed)
        for ep in range(FIG_12_10_N_EP):
          print(f"[EP #{ep}]")
          tot_steps += alg.pol_eva(None, 1, max_steps=FIG_12_10_MAX_STEPS)[0]
      steps_l.append(tot_steps / (FIG_12_10_N_RUNS * FIG_12_10_N_EP))
    plt.plot(alpha_l, steps_l, label=f'lam={lam}')
  xticks, yticks = np.linspace(0.5, 1.5, 5), np.linspace(180, 300, 7)
  left_title = (f'Mountain Car\nSteps per\nepisode\n(averaged\nover first\n'
                f'{FIG_12_10_N_EP} episodes\n{FIG_12_10_N_RUNS} runs)')
  plot_figure(ax, 'Figure 12.10', list(xticks) + [1.6], xticks,
              f'alpha * number of tilings ({N_TLGS})',
              [160] + list(yticks), yticks, left_title, labelpad=35)
  fig.set_size_inches(20, 14)
  plt.legend()
  save_plot('fig12.10', dpi=100)
  plt.show()
def fig_10_3():
  """Figure 10.3: Mountain Car steps per episode for several n (n-step Sarsa)."""
  fig, ax = plt.subplots()
  qhat, nab_qhat = get_qhats(N_TIL, N_TLGS)
  for (n, alp) in zip(FIG_10_3_N_L, FIG_10_3_ALP_L):
    print(f"[n={n}, alpha={alp}]")
    tot_n_steps = np.zeros(FIG_10_3_N_EP)
    for seed in range(FIG_10_3_N_RUNS):
      print(f"[RUN #{seed}]")
      alg = nStepSemiGradSarsa(MountainCar(), alp, N_TIL * N_TLGS, 0, n)
      alg.seed(seed)
      tot_n_steps += np.array(
        alg.pol_eva(None, qhat, nab_qhat, FIG_10_3_N_EP, FIG_10_3_G))
    plt.plot(tot_n_steps / FIG_10_3_N_RUNS, label=f'n={n}')
  plt.yscale('log')
  xticks, yticks = [0, 500], [100, 200, 400, 1000]
  plot_figure(ax, 'Figure 10.3', xticks, xticks, 'Episode', yticks, yticks,
              'Steps\nper episode\n(log scale)')
  fig.set_size_inches(20, 14)
  plt.legend()
  save_plot('fig10.3', dpi=100)
  plt.show()
def fig_10_1():
  """Figure 10.1: cost-to-go snapshots learned on Mountain Car."""

  def plot_and_save(filename, title, alg, n_ep, max_steps=np.inf):
    fig = plt.figure()
    alg.pol_eva(qhat, nab_qhat, n_ep, FIG_10_1_G, max_steps=max_steps)
    print_qhat_mc(alg, fig, '111', title)
    fig.set_size_inches(20, 14)
    save_plot(filename, dpi=100)
    plt.show()

  qhat, nab_qhat = get_qhats(N_TIL, N_TLGS)
  env = MountainCar()
  alg = EpisodicSemiGradientTD0(env, FIG_10_1_ALP, N_TIL * N_TLGS, eps=0)
  alg.seed(0)
  plot_and_save(f'fig10.1_{FIG_10_1_STEPS}_steps', f'Step {FIG_10_1_STEPS}',
                alg, 1, FIG_10_1_STEPS)
  tot_ep = 1
  for ep in FIG_10_1_EP_L:
    alg.pol_eva(qhat, nab_qhat, ep - tot_ep, FIG_10_1_G)
    plot_and_save(f'fig10.1_{ep}_episodes', f'Episode {ep}', alg, ep - tot_ep)
    tot_ep = ep
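# --- Hypothetical sketch (not part of the repository) ----------------------
# print_qhat_mc is defined elsewhere; Figure 10.1 displays the "cost-to-go"
# surface -max_a qhat(s, a, w) over the Mountain Car state space.  The helper
# below is one way to render such a surface.  The alg.qhat / alg.w attribute
# names and the qhat(s, a, w) call signature are assumptions.
def print_qhat_mc_sketch(alg, fig, subplot, title, n_pts=40):
  from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (enables 3d projection)
  ax = fig.add_subplot(int(subplot), projection='3d')
  pos = np.linspace(-1.2, 0.5, n_pts)
  vel = np.linspace(-0.07, 0.07, n_pts)
  P, V = np.meshgrid(pos, vel)
  cost = np.array([[-max(alg.qhat((p, v), a, alg.w) for a in range(3))
                    for p in pos] for v in vel])
  ax.plot_surface(P, V, cost)
  ax.set_title(title)
  ax.set_xlabel('Position')
  ax.set_ylabel('Velocity')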
def main():
    env = MountainCar(mass=0.2, friction=0.3, delta_t=0.1)

    # Define the state arrays for velocity and position
    tot_action = 3  # three possible actions
    tot_bins = 12   # number of bins used to discretise the state space
    velocity_state_array = np.linspace(-1.5, +1.5, num=tot_bins - 1,
                                       endpoint=False)
    position_state_array = np.linspace(-1.2, +0.5, num=tot_bins - 1,
                                       endpoint=False)

    # Random policy as a square matrix of size (tot_bins x tot_bins).
    # The three possible actions are represented by three integers.
    policy_matrix = np.random.randint(low=0, high=tot_action,
                                      size=(tot_bins, tot_bins)).astype(np.float32)
    print("Policy Matrix:")
    print(policy_matrix)

    # The state-action matrix and the visit counter: one row per action,
    # one column per discretised (velocity, position) state.
    state_action_matrix = np.zeros((tot_action, tot_bins * tot_bins))
    visit_counter_matrix = np.zeros((tot_action, tot_bins * tot_bins))

    # Hyperparameters
    gamma = 0.999
    alpha = 0.001
    tot_episode = 100000
    epsilon_start = 0.9  # values for the epsilon decay
    epsilon_stop = 0.1
    epsilon_decay_step = 3000
    print_episode = 500    # print every ... episodes
    movie_episode = 10000  # save the movie every ... episodes
    reward_list = list()
    step_list = list()

    for episode in range(tot_episode):
        epsilon = return_decayed_value(epsilon_start, epsilon_stop, episode,
                                       decay_step=epsilon_decay_step)
        # Reset the environment and get the first observation
        observation = env.reset(exploring_starts=False)
        # The observation is digitised: each raw float is replaced by the
        # index of the bin it falls into, giving a (velocity bin, position bin) pair.
        observation = (np.digitize(observation[1], velocity_state_array),
                       np.digitize(observation[0], position_state_array))
        is_starting = True
        cumulated_reward = 0
        for step in range(100):
            # Take the action using epsilon-greedy
            # (the greedy alternative would be:
            #  action = policy_matrix[observation[0], observation[1]])
            action = return_epsilon_greedy_action(policy_matrix, observation,
                                                  epsilon=epsilon)
            if is_starting:
                action = np.random.randint(0, tot_action)
                is_starting = False
            # Move one step in the environment and get the observation and reward
            new_observation, reward, done = env.step(action)
            new_observation = (np.digitize(new_observation[1], velocity_state_array),
                               np.digitize(new_observation[0], position_state_array))
            new_action = policy_matrix[new_observation[0], new_observation[1]]
            # Update the state-action matrix
            state_action_matrix = update_state_action(
                state_action_matrix, visit_counter_matrix, observation,
                new_observation, action, new_action, reward, alpha, gamma)
            # Update the policy
            policy_matrix = update_policy(policy_matrix, state_action_matrix,
                                          observation)
            # Increment the visit counter
            visit_counter_matrix = update_visit_counter(visit_counter_matrix,
                                                        observation, action)
            observation = new_observation
            cumulated_reward += reward
            if done:
                break

        # Store the data for statistics
        reward_list.append(cumulated_reward)
        step_list.append(step)

        # Printing utilities
        if episode % print_episode == 0:
            print("")
            print("Episode: " + str(episode + 1))
            print("Epsilon: " + str(epsilon))
            print("Episode steps: " + str(step + 1))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Policy matrix: ")
            print_policy(policy_matrix)
        if episode % movie_episode == 0:
            print("Saving the reward plot in: ./reward.png")
            plot_curve(reward_list, filepath="./reward.png",
                       x_label="Episode", y_label="Reward",
                       x_range=(0, len(reward_list)), y_range=(-1.1, 1.1),
                       color="red", kernel_size=500, alpha=0.4, grid=True)
            print("Saving the step plot in: ./step.png")
            plot_curve(step_list, filepath="./step.png",
                       x_label="Episode", y_label="Steps",
                       x_range=(0, len(step_list)), y_range=(-0.1, 100),
                       color="blue", kernel_size=500, alpha=0.4, grid=True)
            print("Saving the gif in: ./mountain_car.gif")
            env.render(file_path='./mountain_car.gif', mode='gif')
            print("Complete!")

    # Save reward and steps in an npz file for later use
    # np.savez("./statistics.npz", reward=np.asarray(reward_list),
    #          step=np.asarray(step_list))

    # Time to check the policy matrix obtained
    print("Policy matrix after " + str(tot_episode) + " episodes:")
    print_policy(policy_matrix)
import os

import numpy as np

# MountainCar, BinIncLearner, Discretization, SARSA, visualize_states and the
# constants PARA_SIZE, REPLAY_SIZE, state_bounds and action_n are defined
# elsewhere in the project.

DECAY = 0.99
ALPHA = 0.9
BIN_NUM = 64
train_times = 31250
vis_times = 10
output_path = 'output/mountain_car_sarsa/'

if not os.path.exists(output_path):
    os.makedirs(output_path)


def random_action(_, n):
    return np.random.randint(n)


bin_shape = (BIN_NUM, BIN_NUM)
games = MountainCar(PARA_SIZE, REPLAY_SIZE)
inc_learner = BinIncLearner(Discretization(state_bounds, bin_shape), bin_shape,
                            (action_n,), ALPHA)
framework = SARSA(games, inc_learner, random_action, DECAY)

for v in range(vis_times):
    framework.loop(train_times)
    visualize_states(output_path + 'states_%i.png' % v,
                     inc_learner.eval_batch_no_default)
games.close()
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from mountain_car import MountainCar
import random

my_car = MountainCar(mass=0.2, friction=0.3, delta_t=0.1)
cumulated_reward = 0

print("Starting random agent...")
for step in range(100):
    action = random.randint(a=0, b=2)
    observation, reward, done = my_car.step(action)
    cumulated_reward += reward
    if done:
        break

print("Finished after: " + str(step + 1) + " steps")
print("Cumulated Reward: " + str(cumulated_reward))
print("Saving the gif in: ./mountain_car.gif")
my_car.render(file_path='./mountain_car.gif', mode='gif')
print("Complete!")
import argparse
import os

import matplotlib.pyplot as plt

from mountain_car import MountainCar

ENV_DICT = {
    'mountain_car': MountainCar(mnt=False),
}


def play(env):

    def refresh():
        os.system('cls' if os.name == 'nt' else 'clear')
        print(env)

    while True:
        env.reset()
        done = False
        v = []
        while not done:
            key = ''
            while key not in env.keys:
                refresh()
                key = input("press key\n$>")
                if key == "exit()":
                    exit()
                if key == 'p':
                    env.show(n_pts=10000)
                if key == 'v':
                    plt.plot(v)
                    plt.show()
"""The world's simplest agent!""" def __init__(self, action_space): self.action_space = action_space def act(self, observation, reward, done): return self.action_space.sample() if __name__ == '__main__': logging.basicConfig() log = logging.getLogger("mountain-car") log.setLevel(level='INFO') # we will use our environment (wrapper of OpenAI env) mountain_car = MountainCar() # specify which agent you want to use, # BonsaiAgent that uses trained Brain or # RandomAgent that randomly selects next action agent = BonsaiAgent() episode_count = 100 try: for i in range(episode_count): #start a new episode and get the new state mountain_car.episode_start() state = mountain_car.get_state() while True: