def example_6_6():
    fig, ax = plt.subplots()
    fig.suptitle(f'Example 6.6 (Averaged over {EX_6_6_N_SEEDS} seeds)')
    ax.set_xlabel('Episodes')
    ax.set_ylabel(f'(Average of last {EX_6_6_N_AVG}) sum of rewards during episodes')
    ax.set_yticks(EX_6_6_YTICKS)
    ax.set_ylim(bottom=min(EX_6_6_YTICKS))
    n_ep = EX_6_6_N_EPS
    env = TheCliff()
    qlearning_alg = QLearning(env, step_size=EX_6_5_STEP_SIZE, gamma=UNDISCOUNTED, eps=EX_6_5_EPS)
    sarsa_alg = Sarsa(env, step_size=EX_6_5_STEP_SIZE, gamma=UNDISCOUNTED, eps=EX_6_5_EPS)
    qlearning_rew = np.zeros(n_ep)
    sarsa_rew = np.zeros(n_ep)
    for seed in range(EX_6_6_N_SEEDS):
        print(f"seed={seed}")
        qlearning_alg.seed(seed)
        qlearning_rew += qlearning_alg.q_learning(n_ep)
        sarsa_alg.seed(seed)
        sarsa_rew += sarsa_alg.on_policy_td_control(n_ep, rews=True)
    plt.plot(smooth_rewards(qlearning_rew / EX_6_6_N_SEEDS, EX_6_6_N_AVG), color='r', label='Q learning')
    plt.plot(smooth_rewards(sarsa_rew / EX_6_6_N_SEEDS, EX_6_6_N_AVG), color='b', label='Sarsa')
    plt.legend()
    plt.savefig('example6.6.png')
    plt.show()
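# `smooth_rewards` is not defined in the snippet above. A minimal sketch of what such a
# helper could look like, assuming it returns the running mean of the last `n` episode
# rewards (only the name and call signature come from the code above; the body is an
# assumption):
import numpy as np

def smooth_rewards(rewards, n):
    """Running mean over a trailing window of n episodes (sketch)."""
    rewards = np.asarray(rewards, dtype=float)
    smoothed = np.empty_like(rewards)
    for i in range(len(rewards)):
        smoothed[i] = rewards[max(0, i - n + 1):i + 1].mean()
    return smoothed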
def run_sarsa(start, goal, Xrange, Vrange, plot_data_pid):
    sarsa_plot_data = list()
    sarsa_plot_data.append(plot_data_pid)
    for i in range(1, 9):
        sarsa = Sarsa(start, goal, Xrange, Vrange, n=i)
        sarsa.train(epoch=EPOCH, max_episode_length=MAX_EPISODE_LENGTH)
        sarsa_plot_data.append(sarsa.episodes)
    plot_with_n(sarsa_plot_data)
def plot_sarsa(ax, n_ep, label=None, diags=False, stay=False, stoch=False, seed=0):
    env = WindyGridworld(diags, stay, stoch)
    alg = Sarsa(env, step_size=EX_6_5_STEP_SIZE, gamma=UNDISCOUNTED, eps=EX_6_5_EPS)
    alg.seed(seed)
    kwargs = {"label": label} if label else {}
    plt.plot(alg.on_policy_td_control(n_ep), **kwargs)
def main(algorithm, track, x_start, y_start, discount, learning_rate, threshold,
         max_iterations, epsilon=None, reset_on_crash=False):
    """
    Program entry point. Runs the selected algorithm on the selected track, starting at
    the given coordinates, with the given parameters.
    :param algorithm: String
    :param track: String (path to the track file)
    :param x_start: Int
    :param y_start: Int
    :param discount: Float
    :param learning_rate: Float
    :param threshold: Float
    :param max_iterations: Int
    :param epsilon: Float
    :param reset_on_crash: Boolean
    :return: None
    """
    with open(track) as f:
        specs = f.readline().strip().split(',')
        rows = int(specs[0])
        cols = int(specs[1])
        layout = f.read().splitlines()
    initial_state = (x_start, y_start, 0, 0)
    initial_action = (0, 0)
    agent = Car(initial_action, epsilon)
    environment = RaceTrack(rows, cols, layout, initial_state, reset_on_crash=reset_on_crash)
    if algorithm == 'value_iteration':
        value_iterator = ValueIteration(discount, threshold, max_iterations, environment, agent)
        value_iterator.run()
        path = value_iterator.extract_policy(initial_state)
        value_iterator.plot_max_diffs()
    elif algorithm == 'q_learning':
        q_learner = QLearning(discount, learning_rate, threshold, max_iterations, environment, agent)
        path = q_learner.run()
        q_learner.plot_avg_cost()
    elif algorithm == 'sarsa':
        sarsa = Sarsa(discount, learning_rate, threshold, max_iterations, environment, agent)
        path = sarsa.run()
        sarsa.plot_avg_cost()
    else:
        print("No algorithm selected")
        return None
    draw_track(path, layout)
def main(minutes):
    logging.info('training started for {} minutes'.format(minutes))
    logging.info('max iterations: {}'.format(MAX_ITERATIONS))
    # q = loadQ(currency, interval)
    rewards = []
    errors = []
    ticks = []
    start_time = time.time()
    while (time.time() - start_time) < (minutes * 60):
        with Sarsa(MODEL_FILENAME) as sarsa:
            logging.info('sarsa execution')
            # q, r, error, tick = train(df_inner, q, alpha, epsilon, PERIODS, ACTIONS, pip_mul, info['std'])
        break
def main(n_iters=3000, n_games_per_update=10, n_max_steps=1000, n_bins=50):
    env = gym.make('CartPole-v0')
    model = Sarsa(actions=range(2), alpha=0.1, gamma=0.95, epsilon=0.5)
    cart_p_bins = pd.cut([-2.4, 2.4], bins=n_bins, retbins=True)[1][1:-1]
    cart_v_bins = pd.cut([-2.0, 2.0], bins=n_bins, retbins=True)[1][1:-1]
    pole_a_bins = pd.cut([-math.radians(41.8), math.radians(41.8)], bins=n_bins, retbins=True)[1][1:-1]
    pole_v_bins = pd.cut([-3.0, 3.0], bins=n_bins, retbins=True)[1][1:-1]
    bins = (cart_p_bins, cart_v_bins, pole_a_bins, pole_v_bins)

    # training
    for iter in range(n_iters):
        finished_steps = []
        for game in range(n_games_per_update):
            obs = env.reset()
            state = build_state(obs, bins)
            action = model.choose_action(state)
            for step in range(n_max_steps):
                obs, reward, done, info = env.step(action)
                next_state = build_state(obs, bins)
                next_action = model.choose_action(next_state)
                model.update_q(state, action, reward, next_state, next_action)
                state = next_state
                action = next_action
                if done:
                    finished_steps.append(step)
                    break
        print("[%d / %d]: %.1f" % (iter, n_iters, (sum(finished_steps) / len(finished_steps))))

    # testing
    obs = env.reset()
    state = build_state(obs, bins)
    done = False
    count = 0
    while not done:
        env.render()
        action = model.choose_action(state, training=False)
        obs, reward, done, info = env.step(action)
        state = build_state(obs, bins)
        count += 1
    print(count)
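# `build_state` is not defined in the snippet above. A minimal sketch of what such a
# helper could look like, assuming it digitizes each observation component against its
# bin edges and packs the indices into a single integer (only the name and call
# signature come from the code above; the body is an assumption):
import numpy as np

def build_state(obs, bins):
    """Map a continuous observation to one discrete state index (sketch)."""
    state = 0
    for value, edges in zip(obs, bins):
        # np.digitize returns an index in [0, len(edges)], i.e. len(edges) + 1 buckets.
        state = state * (len(edges) + 1) + int(np.digitize(value, edges))
    return state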
class Windy4(Windy):
    'Exercise 6.9, part 2'

    def __wind__(self, col: int):
        w = Windy.__wind__(self, col)
        p = random.choice((0, 1, 2))
        if p == 1:
            # one cell up
            w += 1
        elif p == 2:
            # one cell down
            w -= 1
        return w


if __name__ == '__main__':
    model = Windy()
    p = Sarsa(model, alfa=0.5)
    episode = 0
    total = 0
    while episode < 1000:
        steps = p.estimate(model.start)
        total += steps
        episode += 1
        # print(episode, steps, total)
    s = model.start
    for s in model.episode(s):
        m = s.pi.n
        print('%s %s' % (s.n, m.name))
    # for n, s in model.states.items():
    #     print('%s %s' % (n, s.pi.n))
import numpy as np
import gym
import matplotlib.pyplot as plt
from collections import deque
from sarsa import Sarsa

N_EPISODES = 20

env = gym.make('Taxi-v3')
print("Number of States = {}".format(env.nS))
print("Number of Actions = {}".format(env.nA))
current_state = env.reset()

q = np.load("q-agent.npy")
td_agent = Sarsa(env.nS, env.nA, env)
td_agent.q = q
scores_window = deque(maxlen=10)

for i_episode in range(N_EPISODES):
    current_state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        # Act greedily with respect to the loaded Q-table.
        next_state, reward, done, _ = env.step(np.argmax(td_agent.q[current_state][:]))
        episode_reward += reward
        current_state = next_state
        env.render()
        print()
    scores_window.append(episode_reward)
            sarsa_table = RL.learn(str(state), action, reward, str(state_), action_)

            # Move to the next state
            state = state_
            action = action_

            # End this episode
            if done:
                break

    # end of game
    print('game over')
    print(sarsa_table)
    env.destroy()


if __name__ == "__main__":
    # Create the game world
    env = Maze()
    # Create the Q-table and initialize the action space 0 1 2 3
    RL = Sarsa(actions=list(range(env.n_actions)))
    # Call the update function once after 100 ms
    env.after(100, update)
    env.mainloop()
# avg = np.average(np.array(rewards), axis=0)
# std = np.std(np.array(rewards), axis=0)
# maximumEpisodes = avg.shape[0]
# plt.errorbar(np.array([i for i in range(maximumEpisodes)]), avg, std, marker='^', ecolor='g')
# plt.show()

type = "linear"
# best parameter, order 3, e 0.2, alpha 0.5
# best parameter, order 5, e 0.2, alpha 0.5
for e in [0.3]:  # , 0.1, 0.01, 0.3, 0.4]:
    for order in [3]:  # , 5]:
        for alpha in [0.01]:  # , 0.0001, 0.0005, 0.0009, 0.001, 0.005, 0.009, 0.01, 0.05, 0.09, 0.1, 0.5, 0.9]:
            rewards = []
            print("Alpha: ", alpha)
            for t in tqdm(range(trails)):
                # print("Alpha: %s, Trail: %s" % (alpha, t))
                td = Sarsa(gamma, alpha, env, state_space, steps, e, plot=plot, order=order, discount=discount)
                td.train(episodes)
                rewards.append(td.reward)
            avg = np.average(np.array(rewards), axis=0)
            std = np.std(np.array(rewards), axis=0)
            maximumEpisodes = avg.shape[0]
            plt.errorbar(np.array([i for i in range(maximumEpisodes)]), avg, std, marker='^', ecolor='g')
            # name = "Sarsa/figures/%s/cartPole_type_%s_order%s_alpha%s_e%s.jpg" % (type, type, order, alpha, e)
            name = "Grid_alpha%s_e%s.jpg" % (alpha, e)
            pickle.dump(avg, open(name, "wb"))
            plt.xlabel("Number of episodes")
            plt.ylabel("Total Reward")
            # plt.savefig(name)
            # plt.close()
}

for solver_name, solver_fn in mdp_solvers.items():
    print('Final result of {}:'.format(solver_name))
    policy_grids, utility_grids = solver_fn(iterations=25, discount=0.5)
    print(policy_grids[:, :, -1])
    print(utility_grids[:, :, -1])
    plt.figure()
    gw.plot_policy(utility_grids[:, :, -1])
    plot_convergence(utility_grids, policy_grids)
    plt.show()

sa = Sarsa(num_states=(shape[0] * shape[1]),
           num_actions=4,
           learning_rate=0.8,
           discount_rate=0.9,
           random_action_prob=0.5,
           random_action_decay_rate=0.99,
           dyna_iterations=0)

start_state = gw.grid_coordinates_to_indices(start)
iterations = 1000

### IMPORTANT
# You need to write your own generate_experience function, based on either an
# epsilon-greedy policy or an exploration function. Make sure your submission
# includes an updated rl_qlearn.py with your gw.generate_experience_your_name.
### IMPORTANT
flat_policies, flat_utilities = sa.learn(start_state, gw.generate_experience,
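# The comment above says a generate_experience function has to be supplied by the user.
# Below is a minimal epsilon-greedy sketch, assuming it receives the current state index
# and a Q-table and returns a (state, action, reward, next_state) tuple; the exact
# signature expected by sa.learn, the gw.step call, and the epsilon value are
# assumptions, not part of the original assignment code.
import numpy as np

def generate_experience(state, q_table, epsilon=0.1):
    num_actions = q_table.shape[1]
    if np.random.random() < epsilon:
        action = np.random.randint(num_actions)   # explore: random action
    else:
        action = int(np.argmax(q_table[state]))   # exploit: greedy action
    next_state, reward = gw.step(state, action)   # hypothetical environment step
    return state, action, reward, next_state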
import numpy as np

import helper
from sarsa import Sarsa
from qlearning import QLearning
from sarsa_expected import SarsaExpected

# File to run in order to generate all the plots sequentially
if __name__ == '__main__':
    data_X = np.arange(start=0, stop=10000, step=100)

    # Agent 1: Sarsa(0)
    data_Y1 = np.zeros((100, 1))
    for seed in range(50):
        print(f'Seed: {seed}')
        sarsa = Sarsa(seed=seed, num_actions=4, alpha=0.1)
        y = sarsa.run()
        data_Y1 += y
    data_Y1 /= 10
    helper.plotSingle(data_X, data_Y1, "Sarsa(0)")

    # Sarsa(0) with King's move
    data_Y2 = np.zeros((100, 1))
    for seed in range(50):
        print(f'Seed: {seed}')
        sarsa = Sarsa(seed=seed, num_actions=8, alpha=0.1)
        y = sarsa.run()
        data_Y2 += y
    data_Y2 /= 10
    helper.plotSingle(data_X, data_Y2, "Sarsa(0) with King's move")
# Create the grid world environment
env = gridWorldEnv(easy)

# Define the max number of episodes and each episode's length
maxEpisode = 5000
epLength = 50

# Epsilon-greedy exploration value
epsilon = 0.3
epsilonDecay = 0.999  # Decay parameter for epsilon

# State and action dimensions
a_dim = env.action_space_size()
s_dim = env.observation_space_size()

# Initialize the learning agent
agent = Sarsa(s_dim, a_dim)

# Reward vectors for plotting
epReward = []
avgReward = []

# File name for saving the results
file_name = 'hw2_sarsa_easyon'

# Start learning
for epochs in range(maxEpisode):
    state = env.reset()
    total_reward = 0
    for h in range(epLength):
        action = agent.eGreedyAction(state, epsilon)
def digitize_fun(state):
    def bins(clip_min, clip_max, num):
        return np.linspace(clip_min, clip_max, num + 1)[1:-1]

    car_pos, car_v, pole_angle, pole_v = state
    result = [
        np.digitize(car_pos, bins(-2.4, 2.4, 4)),
        np.digitize(car_v, bins(-3.0, 3.0, 4)),
        np.digitize(pole_angle, bins(-0.5, 0.5, 4)),
        np.digitize(pole_v, bins(-2.0, 2.0, 4))
    ]
    x = sum([x * (4**i) for i, x in enumerate(result)])
    return x


q_f = Sarsa(digitize_fun, 0.2, 0.99, 0.15, [0, 1])

max_number_of_steps = 200  # maximum score per game
goal_average_steps = 195
num_consecutive_iterations = 100
# Only keep the scores of the last 100 games (think of it as a stack with capacity 100)
last_time_steps = np.zeros(num_consecutive_iterations)

env = gym.make('CartPole-v0')

for episode in range(5000):
    observation = env.reset()  # reset the environment for this game
    episode_reward = 0
    action = q_f.get_actions(observation)
    next_action = action
    for t in range(max_number_of_steps):
# Show the food.
for f in food:
    pylab.annotate('food', xy=f, size=5,
                   bbox=dict(boxstyle="round4,pad=.5", fc="0.8"), ha='center')
for i in range(len(path) - 1):
    pylab.arrow(path[i][0], path[i][1],
                path[i + 1][0] - path[i][0], path[i + 1][1] - path[i][1])


# Parameters.
max_size = 20
food = [(0, 8), (4, 4), (1, 1), (8, 8), (6, 2), (12, 15), (17, 2), (4, 12), (17, 17), (12, 1)]

# Start the algorithm.
sarsa = Sarsa(BarnState((0, 0), food, max_size), epsilon=0.1, alpha=0.1, gamma=0.2)
sarsa.seed(int(100 * time.time()))

plot_in = [10, 100, 200, 400, 600, 1000, 1500, 2000, 4000, 5000,
           6000, 8000, 10000, 12000, 15000, 20000]

for i in range(max(plot_in) + 1):
    sarsa.iterate()
    if i % 10 == 0:
        print(i)
    if i in plot_in:
        plot_path([s.position for s in sarsa.history])
        pylab.savefig('/tmp/simple-path-4-%d.png' % i)
        print(i)
def play(agentType="qlearning", worldNumber=0, eps=0.1, alpha=0.001, gamma=0.999):
    # env.action_space: set of possible actions
    # env.action_space.n: number of possible actions
    # env.observation_space: set of possible states
    # env.observation_space.n: number of possible states
    env = gym.make("gridworld-v0")  # initialize an environment

    # setPlan(arg1, arg2)
    # arg1: map file to load
    # arg2: rewards associated with the different kinds of cells in the game
    env.setPlan("gridworldPlans/plan" + str(worldNumber) + ".txt",
                {0: -0.001, 3: 1, 4: 1, 5: -1, 6: -1})
    env.verbose = True

    if agentType == "qlearning":
        agent = Q_Learning(env, eps, alpha, gamma)
    elif agentType == "sarsa":
        agent = Sarsa(env, eps, alpha, gamma)
    elif agentType == "dynaq":
        agent = Dyna_Q(env, eps, alpha, gamma)
    else:
        agent = Q_Learning(env, eps, alpha, gamma)
        print("Unknown agent: defaulting to qlearning")

    # Log several scenarios to a file
    outdir = 'gridworld-v0/random-agent-results'
    envm = wrappers.Monitor(env, directory=outdir, force=True, video_callable=False)

    # countActions = []
    countRewards = []
    episode_count = 2000
    reward = 0
    done = False
    rsum = 0
    FPS = 0.001

    for i in tqdm(range(episode_count)):
        obs = envm.reset()
        env.verbose = (i % 100 == 0 and i > 0)  # render 1 episode out of 100
        if env.verbose:
            env.render(FPS)
            env.render(mode="human")
        j = 0
        rsum = 0
        while True:
            action = agent.action(obs, reward)
            obs, reward, done, _ = envm.step(action)
            rsum += reward
            j += 1
            if env.verbose:
                env.render(FPS)
            if done:
                print("Episode : " + str(i) + " rsum=" + str(rsum) + ", " + str(j) + " actions")
                # countActions.append(j)
                countRewards.append(rsum)
                break

    np.save("rewards_gridworld_" + str(worldNumber) + "_" + agentType + "_alpha0_1.npy",
            countRewards)
    print("Mean & std : ", np.mean(countRewards), np.std(countRewards))
    print("Reward cum : ", np.sum(countRewards))
    print("done")
    env.close()
    return countRewards
if __name__ == '__main__':
    args = get_cmd_args()
    alpha = args.learning_rate
    gamma = args.discount_rate
    epsilon = args.greedy_rate
    actions_number = args.actions_number
    gridworld_height = args.gridworld_height
    gridworld_width = args.gridworld_width
    episode_number = args.episode_number

    background_introduction = '''
    ----------- Windy Gridworld with King's Moves -----------
    1. Learning Rate:  \033[1;31m%.2f\033[0m
    2. Discount Rate:  \033[1;31m%.2f\033[0m
    3. Greedy Rate:    \033[1;31m%.2f\033[0m
    4. Action Number:  \033[1;31m%d\033[0m
    5. Episode Number: \033[1;31m%d\033[0m
    ''' % (alpha, gamma, epsilon, actions_number, episode_number)
    print(background_introduction)

    sarsa = Sarsa(alpha, gamma, epsilon, actions_number, gridworld_height,
                  gridworld_width, episode_number)
    sarsa.sarsa()
for n in range(1, number_of_scenarios + 1):
    # Randomly locate the food on the barn.
    amount_food = randint(max_size // 2, 2 * max_size)
    food = []
    while len(food) < amount_food:
        # Add a new piece of food.
        food.append((randint(0, max_size - 1), randint(0, max_size - 1)))
        # Ensure uniqueness.
        food = list(set(food))

    # Start the algorithm.
    sarsa = Sarsa(BarnState((0, 0), food, max_size), epsilon=epsilon, alpha=alpha, gamma=gamma)
    sarsa.seed(int(100 * time.time()))

    # Keep track of how much we move the Q-values.
    track = []
    for it in range(1, max_iters + 1):
        if it % 10 == 0:
            print("Scenario %d: %d/%d\r" % (n, it, max_iters), end='')
            sys.stdout.flush()
        history, corrections = sarsa.iterate()
        track.append(numpy.sqrt(sum(map(lambda x: x * x, corrections))))

    # We're just selecting nice places to evaluate the current policy and create a picture.