            observation = observation_

            # if the episode terminates, break out of the loop
            if done:
                steps.append(step)
                episodes.append(episode)
                break
            step += 1  # total step counter

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = DoubleDQN(env.n_actions, env.n_features,  # properties of an observation/state, e.g. length/width/height
                   learning_rate=0.01,
                   reward_decay=0.9,
                   dueling=True,
                   e_greedy=0.9,
                   replace_target_iter=200,  # replace the target_net parameters every 200 steps
                   memory_size=2000,         # memory capacity
                   # output_graph=True       # whether to write a TensorBoard file
                   )
    env.after(100, run_maze)  # run the reinforcement-learning training
    env.mainloop()

    # inspect the training-time curve
    his_double = np.vstack((episodes, steps))
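    # --- Illustrative sketch (assumption, not part of the original script): plot
    # the recorded curve. `his_double` stacks episode indices on row 0 and the
    # running total-step count at the end of each episode on row 1; matplotlib is
    # assumed to be installed, since it is not imported anywhere above.
    import matplotlib.pyplot as plt

    plt.plot(his_double[0, :], his_double[1, :], c='b', label='double DQN')
    plt.xlabel('episode')
    plt.ylabel('total training steps')
    plt.legend(loc='best')
    plt.grid()
    plt.show()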
from agent import Agent, D_Q_Agent
import time

maze = '2'
if maze == '1':
    from maze_env1 import Maze
elif maze == '2':
    from maze_env2 import Maze

if __name__ == "__main__":
    ### START CODE HERE ###
    # This is an agent with random policy. You can learn how to interact with the
    # environment through the code below. Then you can delete it and write your own code.
    env = Maze()
    training_epoch = 100 if maze == '1' else 1000
    agent = D_Q_Agent(training_epoch)
    for episode in range(training_epoch):
        agent.if_rewarded = False
        s = env.reset()
        while True:
            # env.render()  # You can comment all render() to turn off the graphical interface in training process to accelerate your code.
            chosen_direction = agent.choose_action(s, episode)
            s_, r, done = env.step(chosen_direction)
            agent.update_Q_value(s, chosen_direction, r)
            if s_[-1]:
                agent.if_rewarded = True
                agent.if_rewarded_in_the_whole_training = True
            s = s_
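

# ---------------------------------------------------------------------------
# Illustrative sketch (assumption): agent.py is not shown here, so the class
# below is NOT the actual D_Q_Agent implementation. It only demonstrates how the
# choose_action / update_Q_value interface used above could be backed by plain
# tabular Q-learning, Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a)).
# All names and hyper-parameters below are hypothetical.
# ---------------------------------------------------------------------------
import numpy as np


class TabularQAgent:
    """Minimal epsilon-greedy tabular Q-learning agent (illustration only)."""

    def __init__(self, n_actions=4, lr=0.1, gamma=0.9, epsilon=0.1):
        self.n_actions = n_actions
        self.lr = lr            # learning rate
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration probability
        self.q_table = {}       # maps a state tuple to an array of action values

    def _values(self, state):
        key = tuple(state)
        if key not in self.q_table:
            self.q_table[key] = np.zeros(self.n_actions)
        return self.q_table[key]

    def choose_action(self, state, episode=None):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self._values(state)))

    def update_Q_value(self, state, action, reward, next_state=None):
        # one-step Q-learning update; without next_state the bootstrap term is dropped
        q = self._values(state)
        target = reward
        if next_state is not None:
            target += self.gamma * np.max(self._values(next_state))
        q[action] += self.lr * (target - q[action])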