def update():
    for episode in range(Max_Episodes):
        # initial observation
        observation = env.reset()
        while True:
            # fresh env
            env.render()
            # RL chooses an action based on the current observation
            action = RL.choose_action(str(observation))
            # RL takes the action and gets the next observation and reward
            observation_, reward, done = env.step(action)
            # learn from the transition S -> S'
            RL.learn(str(observation), action, reward, str(observation_))
            # swap observation
            observation = observation_
            if done:
                break
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    env.after(100, update)
    env.mainloop()
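The loop above only touches the agent through choose_action() and learn(). For context, a minimal tabular agent satisfying that interface might look like the sketch below: pandas-backed, epsilon-greedy, with the standard Q-learning backup. The 'terminal' sentinel for the next state is an assumption, since env.step() is not shown.

import numpy as np
import pandas as pd

class QLearningTable:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions          # list of action indices
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # lazily add unseen states as all-zero rows
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, observation):
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            # exploit: pick a highest-value action, breaking ties at random
            state_action = self.q_table.loc[observation, :]
            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
        else:
            # explore: pick a random action
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':                 # assumed terminal marker
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r                     # no future reward beyond a terminal state
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)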
def run_maze():
    step = 0
    for episode in range(300):  # episode count and loop header assumed; only the tail survived
        # initial observation
        observation = env.reset()
        while True:
            env.render()
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            # learning schedule assumed: wait until memory has filled, then learn every 5 steps
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            # swap observation
            observation = observation_
            # break while loop when end of this episode
            if done:
                break
            step += 1
    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions, env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
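replace_target_iter and memory_size control the two mechanics that separate DQN from tabular Q-learning: a frozen target network refreshed periodically, and a circular replay memory sampled in minibatches. A stripped-down sketch of just those mechanics follows; the network update itself is omitted and replace_target_params() is a hypothetical stub, not the repo's API.

import numpy as np

class DQNCore:
    def __init__(self, n_features, memory_size=2000, replace_target_iter=200, batch_size=32):
        self.memory_size = memory_size
        self.replace_target_iter = replace_target_iter
        self.batch_size = batch_size
        self.memory_counter = 0
        self.learn_step_counter = 0
        # each row stores one transition: [s, a, r, s_] flattened
        self.memory = np.zeros((memory_size, n_features * 2 + 2))

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))
        index = self.memory_counter % self.memory_size   # circular buffer: overwrite the oldest
        self.memory[index, :] = transition
        self.memory_counter += 1

    def replace_target_params(self):
        pass  # placeholder: copy eval-net weights into the frozen target net here

    def learn(self):
        # every replace_target_iter learn steps, refresh the target net
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.replace_target_params()
        # sample a random minibatch from what has been stored so far
        n_stored = min(self.memory_counter, self.memory_size)
        batch = self.memory[np.random.choice(n_stored, size=self.batch_size)]
        self.learn_step_counter += 1
        # ... train the eval net on `batch` against the target net ...
        return batch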
import time
import numpy as np

# Per-step body of the training loop inside render():
action = rl.choose_action(state_str, avaliable_actions)        # choose an action
reward, state_next, is_success = maze.step(action)             # get the environment reward
state_next_str_arr = np.array(state_next) / maze.unit          # pixel coords -> grid coords
state_next_str = str(state_next_str_arr.tolist())
rl.update(state_str, action, state_next_str, reward, is_success, steps)  # update the state table
steps.append([state_str, action])                              # store the experience for later learning
maze.render()

# Per-run bookkeeping at the end of run j:
maze.write_weight(rl.state_table)
print('run end {0}'.format(j))
print(rl.state_table.round(2))
time.sleep(0.5)
# maze.write_weight(rl.state_table)
# Forced update: update the weights along the whole visited path; after
# several training runs this yields excellent weights.
# rl.forceUpdate(steps, reward / 10)
print("forceUpdate")
# print(rl.state_table.round(2))


if __name__ == '__main__':
    maze = Maze(8, 6, chif=15)
    rl = RLbrain(maze.get_all_action())   # get all available actions
    maze.after(100, render)
    maze.mainloop()
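The two lines that divide state_next by maze.unit and stringify the result turn raw pixel coordinates into a hashable grid-cell key for the state table. A small worked example, assuming chif=15 sets the pixel size of one cell (the numbers are illustrative):

import numpy as np

state_next = [30.0, 45.0]                       # pixel coordinates from the environment
unit = 15                                       # assumed cell size, matching chif=15
key = str((np.array(state_next) / unit).tolist())
print(key)                                      # '[2.0, 3.0]' -- the grid-cell lookup key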
def update():
    for episode in range(100):
        # initial observation
        s_curr = maze._reset_maze()
        while True:
            maze._render()
            # get the next action from the Q-table
            action = rl.select_next_action(str(s_curr))
            # take the action and observe the next state and reward
            s_next, reward, isDone = maze._update_maze(action)
            # learn from the feedback
            rl.learn(str(s_curr), action, reward, str(s_next))
            s_curr = s_next
            if isDone:
                break
    print("Game over")
    maze.destroy()


if __name__ == "__main__":
    maze = Maze()
    rl = QLearningTable(actions=list(range(len(maze.action_space))))
    maze.after(100, update)
    maze.mainloop()
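Each call to rl.learn() performs one Bellman backup, Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)). With illustrative numbers (not taken from the code above), a single backup at the goal works out like this:

alpha, gamma = 0.01, 0.9    # illustrative learning rate and discount
q_sa = 0.0                  # current estimate Q(s, a)
r = 1.0                     # reward for reaching the goal
max_q_next = 0.0            # max over Q(s', .); zero when s' is terminal
q_sa += alpha * (r + gamma * max_q_next - q_sa)
print(q_sa)                 # 0.01 -- the goal reward propagates backwards over many episodes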
def update():
    for episode in range(100):  # episode count assumed; not shown in the original
        s = env.reset()
        while True:
            env.render()
            a = RL.choose_action(str(s))
            s_, r, done = env.step(a)
            RL.learn(str(s), a, r, str(s_))

            # use a model to output (r, s_) from input (s, a);
            # the model in this Dyna-Q version is just like a memory replay buffer
            env_model.store_transition(str(s), a, r, s_)
            # learn 10 more times using the env_model (planning steps)
            for n in range(10):
                ms, ma = env_model.sample_s_a()   # ms here is a str
                mr, ms_ = env_model.get_r_s_(ms, ma)
                RL.learn(ms, ma, mr, str(ms_))

            s = s_
            if done:
                break
    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    env_model = EnvModel(actions=list(range(env.n_actions)))
    env.after(0, update)
    env.mainloop()
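The planning loop assumes env_model exposes store_transition(), sample_s_a(), and get_r_s_(). Since the maze is deterministic, the model can simply memorise the last (r, s_) seen for each (s, a); a minimal dict-based sketch matching those call signatures:

import random

class EnvModel:
    def __init__(self, actions):
        self.actions = actions
        self.database = {}                       # (s, a) -> (r, s_)

    def store_transition(self, s, a, r, s_):
        self.database[(s, a)] = (r, s_)          # deterministic env: keep the latest outcome

    def sample_s_a(self):
        # pick a previously visited (state, action) pair uniformly at random
        return random.choice(list(self.database.keys()))

    def get_r_s_(self, s, a):
        return self.database[(s, a)]             # replay the remembered outcome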
def update():
    for episode in range(100):  # episode count assumed; not shown in the original
        # initial observation
        observation = env.reset()
        while True:
            # fresh env
            env.render()
            # RL chooses an action based on the current observation
            action = RL.choose_action(str(observation))
            # RL takes the action and gets the next observation and reward
            observation_, reward, done = env.step(action)
            # learn from the transition S -> S'
            RL.learn(str(observation), action, reward, str(observation_))
            # swap observation
            observation = observation_
            if done:
                break
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    if Algorithm == 'SarsaLambda':
        RL = SarsaLambda(action_space=list(range(env.n_actions)))
        env.after(100, update_Sarsa)
    elif Algorithm == 'Q_learning':
        RL = Q_learning(action_space=list(range(env.n_actions)))
        env.after(100, update)
    elif Algorithm == 'Sarsa':
        RL = Sarsa(action_space=list(range(env.n_actions)))
        env.after(100, update_Sarsa)
    env.mainloop()
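The three agents share this loop structure and differ mainly in the TD target they regress towards; a sketch of the distinction (not the repo's actual classes; SarsaLambda additionally spreads each update over recently visited state-action pairs via a decaying eligibility trace):

def q_learning_target(r, gamma, q_next_row):
    # off-policy: bootstrap from the best next action, whatever the policy actually does
    return r + gamma * max(q_next_row)

def sarsa_target(r, gamma, q_next_row, a_next):
    # on-policy: bootstrap from the action the epsilon-greedy policy actually takes next
    return r + gamma * q_next_row[a_next]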