You will see that Sarsa(lambda) is more cautious when the punishment is close,
because it credits every state-action pair along its path, while Q-learning is
bolder because it only bootstraps from the maximum-value action; the
update-target comparison at the end of this file shows where the difference
comes from.
"""

import os
import sys

sys.path.append(os.getcwd())

from maze_env import Maze
from RL_brain import SarsaLambdaTable


def update():
    for episode in range(100):
        # initial observation
        observation = env.reset()

        # RL choose action based on observation
        action = RL.choose_action(str(observation))

        # reset the eligibility trace at the start of each episode
        RL.eligibility_trace *= 0

        while True:
            # fresh env
            env.render()
            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # RL choose action based on next observation
            action_ = RL.choose_action(str(observation_))

            # RL learn from this transition (s, a, r, s', a') ==> Sarsa
            RL.learn(str(observation), action, reward, str(observation_), action_)

            # swap observation and action
            observation = observation_
            action = action_

            # break while loop when end of this episode
            if done:
                # optionally persist the learned q_table
                if RL.read_save:
                    RL.save_q_table()
                break

    # end of game
    print('game over')
    print("Q_table:\n", RL.q_table)
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    # read_save=True makes the agent save its q_table when an episode ends
    RL = SarsaLambdaTable(actions=list(range(env.n_actions)), read_save=False)
    env.after(100, update)
    env.mainloop()
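The script imports SarsaLambdaTable from RL_brain, which is not shown in this
section. For reference, here is a minimal sketch of such a table, assuming a
pandas-backed Q-table and the standard tabular Sarsa(lambda) update with
replacing traces. The constructor arguments mirror the call above, but the
internals, default hyperparameters, the 'terminal' sentinel for end states,
and the pickle path in save_q_table are illustrative assumptions, not the
repository's actual RL_brain implementation.

import numpy as np
import pandas as pd


class SarsaLambdaTable:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9,
                 e_greedy=0.9, trace_decay=0.9, read_save=False):
        self.actions = actions                  # list of action indices
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.lambda_ = trace_decay              # eligibility-trace decay
        self.read_save = read_save              # whether to persist the table
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
        self.eligibility_trace = self.q_table.copy()

    def check_state_exist(self, state):
        # lazily add unseen states to both the q_table and the trace table
        if state not in self.q_table.index:
            new_row = pd.Series([0.0] * len(self.actions),
                                index=self.q_table.columns, name=state)
            self.q_table = pd.concat([self.q_table, new_row.to_frame().T])
            self.eligibility_trace = pd.concat(
                [self.eligibility_trace, new_row.to_frame().T])

    def choose_action(self, observation):
        # epsilon-greedy: exploit with probability epsilon, else explore
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            state_action = self.q_table.loc[observation, :]
            # break ties randomly among the max-value actions
            action = np.random.choice(
                state_action[state_action == np.max(state_action)].index)
        else:
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_, a_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':  # assumes the env reports 'terminal' at the end
            # Sarsa target: bootstrap from the action actually taken next
            q_target = r + self.gamma * self.q_table.loc[s_, a_]
        else:
            q_target = r
        error = q_target - q_predict

        # replacing trace: reset the row for s, then mark the taken action
        self.eligibility_trace.loc[s, :] *= 0
        self.eligibility_trace.loc[s, a] = 1

        # credit every visited state-action pair in proportion to its trace
        self.q_table += self.lr * error * self.eligibility_trace

        # decay all traces after the update
        self.eligibility_trace *= self.gamma * self.lambda_

    def save_q_table(self, path='q_table.pkl'):
        self.q_table.to_pickle(path)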
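The docstring's claim about cautious versus bold behaviour comes down to the
bootstrap target alone. Using the names from the sketch above, the two
one-line variants would be:

# Sarsa (on-policy): evaluate the action the policy will actually take next,
# so exploratory steps near the punishment drag nearby values down
q_target = r + self.gamma * self.q_table.loc[s_, a_]

# Q-learning (off-policy): evaluate only the greedy action, so values along
# the shortest path stay high even when exploration there is risky
q_target = r + self.gamma * self.q_table.loc[s_, :].max()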