from maze_env import Maze
from RL_brain import SarsaTable


def update():
    for episode in range(100):
        # initial observation, and the initial action chosen before the loop
        observation = env.reset()
        action = RL.choose_action(str(observation))

        while True:
            # fresh env
            env.render()

            # RL takes the action and gets the next observation and reward
            observation_, reward, done = env.step(action)

            # RL chooses the next action based on the next observation
            action_ = RL.choose_action(str(observation_))

            # RL learns from this transition (s, a, r, s_, a_) ==> Sarsa
            RL.learn(str(observation), action, reward, str(observation_), action_)

            # swap observation and action: the next state/action become the
            # current ones. This step is important -- learning from the action
            # it actually takes is what makes Sarsa on-policy.
            observation = observation_
            action = action_

            # break the while loop at the end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = SarsaTable(actions=list(range(env.n_actions)))  # instantiate the Sarsa agent

    env.after(100, update)
    env.mainloop()
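# What RL.learn(s, a, r, s_, a_) does above is the on-policy TD(0) update.
# A minimal sketch, assuming the usual pandas Q-table indexed by state strings
# and a 'terminal' marker for episode ends -- the real RL_brain.SarsaTable is
# a separate file not shown here, so treat this as illustration only.
import pandas as pd

def sarsa_learn(q_table, s, a, r, s_, a_, lr=0.01, gamma=0.9):
    """One Sarsa update: Q(s,a) += lr * (r + gamma * Q(s_,a_) - Q(s,a))."""
    q_predict = q_table.loc[s, a]
    if s_ != 'terminal':
        # bootstrap from the action the agent will actually take next (on-policy)
        q_target = r + gamma * q_table.loc[s_, a_]
    else:
        q_target = r  # episode ended: no future value to bootstrap from
    q_table.loc[s, a] += lr * (q_target - q_predict)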
            # break the while loop at the end of this episode
            if done:
                print('Episode:', episode + 1, ' Reward: %i' % int(rewards))
                print('Policy:', policy)
                break

        rewards_20.append(rewards)
        policy_20.append(policy)

    print('Best Reward:', np.max(rewards_20))
    print('Best Policy:', policy_20[np.argmax(rewards_20)])

    x = list(range(len(rewards_20)))
    plt.plot(x, rewards_20)
    # plt.title(f'Sarsa Result  action_space: {action_space}  learn_rate: {learning_rate}  reward_decay: {reward_decay}  e_greedy: {e_greedy}')
    plt.title('Sarsa Result')
    plt.xlabel('Episodes')
    plt.ylabel('Rewards')
    plt.show()


if __name__ == "__main__":
    envSeqDec = ChallengeSeqDecEnvironment(experimentCount=20000)
    action_space = [1, 2, 3, 4]  # using only two of these actions can reach a steady 490-500 reward
    RL = SarsaTable(actions=action_space, learning_rate=0.05, reward_decay=0.5, e_greedy=0.9)
    generate()
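# The end of generate() above pairs two parallel lists: rewards_20 holds each
# episode's total reward and policy_20 the action sequence that earned it, so
# np.argmax over the rewards indexes the best policy. A self-contained toy run
# of that bookkeeping pattern (the numbers and policies here are made up):
import numpy as np
import matplotlib.pyplot as plt

rewards_20 = [310, 455, 498, 402]              # stand-in episode rewards
policy_20 = [[1, 2], [3, 4], [2, 2], [1, 4]]   # stand-in action sequences

print('Best Reward:', np.max(rewards_20))
print('Best Policy:', policy_20[np.argmax(rewards_20)])  # parallel-list lookup

plt.plot(range(len(rewards_20)), rewards_20)
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.show()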
            # The explorer takes this action in the environment and gets back
            # the next state observation, the reward, and done
            # (i.e. whether it fell into hell or ascended to heaven)
            observation_, reward, done = env.step(action)

            # choose the next action_ based on the next state (observation_)
            action_ = RL.choose_action(str(observation_))

            # RL learns from the transition (state, action, reward, state_, action_);
            # note the extra action_ parameter compared with Q-learning
            RL.learn(str(observation), action, reward, str(observation_), action_)

            observation = observation_  # carry the next state into the next iteration
            action = action_  # the next action_ becomes the current action

            if done:  # the episode ends once we fall into hell or ascend to heaven
                break

    print('game over')  # end the game and close the window
    env.destroy()


if __name__ == "__main__":
    # define the environment env and the RL method
    env = Maze()
    RL = SarsaTable(actions=list(range(env.n_actions)))  # create the SarsaTable instance

    # start the visualization of environment env
    env.after(100, update)
    env.mainloop()
You will see that Sarsa is more cowardly when the punishment is close, because it accounts for every action it might take, while Q-learning is braver because it only cares about the maximum-value action.
"""
import sys
import os
sys.path.append(os.getcwd())

import pandas as pd
import time

from maze_env import Maze
from RL_brain import SarsaTable

if __name__ == '__main__':
    env = Maze()
    RL = SarsaTable(actions=env.action_space)

    def update():
        log = []
        for episode in range(100):
            s = env.reset()
            a = RL.choose_action(str(s))
            step_count = 0
            done = False
            r = 0
            while not done:
                env.render()
                s_, r, done = env.step(a)
                a_ = RL.choose_action(str(s_))
                RL.learn(str(s), a, r, str(s_), a_)
                # swap state and action for the next step (on-policy)
                s, a = s_, a_
                step_count += 1
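# The one-line difference behind the "coward vs. brave" contrast in the
# docstring above is the bootstrap target. A minimal sketch under the same
# pandas Q-table layout these scripts use; the helper names and toy values
# are illustrative, not from the repo:
import pandas as pd

def sarsa_target(q_table, r, s_, a_, gamma=0.9):
    # On-policy: bootstrap from the action the epsilon-greedy policy actually
    # takes next. Occasional exploratory steps into the punishment drag down
    # the values of nearby states, so Sarsa learns to keep its distance.
    return r + gamma * q_table.loc[s_, a_]

def q_learning_target(q_table, r, s_, gamma=0.9):
    # Off-policy: bootstrap from the greedy action regardless of what is
    # actually taken next, so values stay high right beside the punishment
    # and the agent walks the short, risky path.
    return r + gamma * q_table.loc[s_, :].max()

# toy Q-table: one row per state, one column per action
q = pd.DataFrame([[0.5, -1.0], [0.2, 0.8]], index=['s1', 's2'], columns=[0, 1])
print(sarsa_target(q, r=0.0, s_='s2', a_=0))   # ~0.18 -- values the action taken
print(q_learning_target(q, r=0.0, s_='s2'))    # ~0.72 -- values the best action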
from maze_env import Maze
from RL_brain import SarsaTable


def update():
    for episode in range(100):
        # initial observation and the first on-policy action
        observation = env.reset()
        action = RL.choose_action(str(observation))

        while True:
            env.render()

            # take the action, observe the next state and reward
            observation_, reward, done = env.step(action)

            # pick the next action from the next state before learning
            action_ = RL.choose_action(str(observation_))

            # Sarsa update on the full (s, a, r, s_, a_) transition
            RL.learn(str(observation), action, reward, str(observation_), action_)

            # the next state/action become the current ones
            observation = observation_
            action = action_

            if done:
                break

    print('game over')
    env.destroy()


if __name__ == '__main__':
    env = Maze()
    RL = SarsaTable(actions=list(range(env.n_actions)))

    env.after(100, update)
    env.mainloop()
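# RL_brain.SarsaTable itself is not shown in any of these scripts. As a hedged
# sketch, choose_action is plausibly epsilon-greedy over the Q-table; the
# table layout and random tie-breaking below are assumptions, not the repo's
# confirmed implementation.
import numpy as np
import pandas as pd

def choose_action(q_table, observation, actions, epsilon=0.9):
    """Epsilon-greedy selection over a Q-table with one row per state string."""
    if np.random.uniform() < epsilon:
        state_action = q_table.loc[observation, :]
        # break ties between equally valued actions at random
        best = state_action[state_action == state_action.max()].index
        return np.random.choice(best)
    # with probability 1 - epsilon, explore uniformly
    return np.random.choice(actions)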