def see_path():
    '''
    show the path from q_table
    :return:
    '''
    q_table = read_table()
    env = Maze()
    agent = DQN_Agent(actions=list(range(env.n_actions)))
    agent.load_q_table(q_table)
    s = env.reset()
    while True:
        env.render()
        a = agent.get_path(s)
        # print(a)
        time.sleep(0.2)
        s_, r, done = env.step(a)
        s = s_
        if done:
            env.render()
            break
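
# read_table() / store_table() are not defined in this section.  The sketch
# below is a minimal, assumed pickle-based implementation; the helper names
# match the calls in see_path() and common_check(), but the file name
# 'q_table.pkl' is a hypothetical choice, not taken from the original project.
import pickle

def store_table(q_table, path='q_table.pkl'):
    # Persist the learned Q-table so see_path() can replay it later.
    with open(path, 'wb') as f:
        pickle.dump(q_table, f)

def read_table(path='q_table.pkl'):
    # Load a previously stored Q-table.
    with open(path, 'rb') as f:
        return pickle.load(f)
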
def common_check(episodes=400):
    '''
    run an ordinary learning process and store the resulting q_table
    :return:
    '''
    env = Maze()
    agent = DQN_Agent(actions=list(range(env.n_actions)))
    for episode in range(episodes):
        s = env.reset()
        episode_reward = 0
        while True:
            # env.render()  # Comment out all render() calls to disable the GUI and speed up training.
            a = agent.choose_action(s)
            s_, r, done = env.step(a)
            q_table = agent.update_q(s, s_, a, r)
            episode_reward += r
            s = s_
            if done:
                # env.render()
                break
        print('episode:', episode, 'episode_reward:', episode_reward)
    store_table(q_table)
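
# DQN_Agent is defined elsewhere in the project; despite the name, it keeps a
# tabular q_table (see_path() loads it and common_check() stores it).  The
# sketch below illustrates the epsilon-greedy action selection and one-step
# Q-learning update that choose_action() / update_q() are assumed to perform.
# The class name and the alpha/gamma/epsilon values are illustrative only,
# not the project's actual hyperparameters.
import random
from collections import defaultdict

class TabularQAgentSketch:
    def __init__(self, actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.actions = actions
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.q_table = defaultdict(lambda: [0.0] * len(actions))

    def choose_action(self, s):
        # Explore with probability epsilon, otherwise act greedily.
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        q_values = self.q_table[str(s)]
        return q_values.index(max(q_values))

    def update_q(self, s, s_, a, r):
        # Q(s, a) += alpha * (r + gamma * max_a' Q(s_, a') - Q(s, a))
        target = r + self.gamma * max(self.q_table[str(s_)])
        self.q_table[str(s)][a] += self.alpha * (target - self.q_table[str(s)][a])
        return self.q_table
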
def check_converge_time():
    '''
    show how many episodes are needed, on average, to find the optimal path for the first time
    :return:
    '''
    env = Maze()
    cvg_time = 0
    for i in range(100):
        print(i)
        agent = DQN_Agent(actions=list(range(env.n_actions)))
        flag = 0
        for episode in range(300):
            if flag:
                break
            s = env.reset()
            episode_reward = 0
            while True:
                # env.render()  # Comment out all render() calls to disable the GUI and speed up training.
                a = agent.choose_action(s)
                s_, r, done = env.step(a)
                q_table = agent.update_q(s, s_, a, r)
                episode_reward += r
                # An episode reward of 4 is treated as having found the optimal path.
                if episode_reward == 4:
                    cvg_time += episode
                    flag = 1
                s = s_
                # print(s)
                if done:
                    # env.render()
                    # time.sleep(0.5)
                    break
            # print('episode:', episode, 'episode_reward:', episode_reward)
        if flag == 0:
            # Never converged within 300 episodes; count the full budget.
            cvg_time += 300
    # Average number of episodes to first find the optimal path, over 100 runs.
    print(cvg_time / 100)
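
# All three drivers above rely on the same Maze interface: n_actions,
# reset() -> initial state, step(action) -> (next_state, reward, done), and
# render().  The stand-in below is only a sketch of that interface, useful
# for running the loops headlessly; the real Maze is defined elsewhere and
# presumably renders a graphical grid-world, so its layout, rewards, and
# action encoding may differ from this assumed version.
class MazeInterfaceSketch:
    n_actions = 4  # assumed order: up, down, right, left

    def __init__(self, size=4):
        self.size = size
        self.pos = [0, 0]

    def reset(self):
        self.pos = [0, 0]
        return tuple(self.pos)

    def step(self, action):
        # Move within the grid; reaching the far corner ends the episode.
        dx, dy = [(0, -1), (0, 1), (1, 0), (-1, 0)][action]
        self.pos[0] = min(max(self.pos[0] + dx, 0), self.size - 1)
        self.pos[1] = min(max(self.pos[1] + dy, 0), self.size - 1)
        done = self.pos == [self.size - 1, self.size - 1]
        reward = 1 if done else 0
        return tuple(self.pos), reward, done

    def render(self):
        pass  # the real environment draws a window; nothing to do here
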
episodes = 100
model_based_episodes = 5

env = Maze()
model = Model(actions=list(range(env.n_actions)))
agent = Agent(actions=list(range(env.n_actions)))  # choose from range(4), i.e. the four actions 0, 1, 2, 3 (up, down, right, left)

for episode in range(episodes):  # for each episode, from start to finish
    s = env.reset()
    episode_reward = 0
    while True:
        # env.render()  # Comment out all render() calls to disable the GUI and speed up training.
        # move one step
        a = agent.choose_action(str(s))
        s_, r, done = env.step(a)

        # update Q model-free
        agent.learn(str(s), a, r, str(s_), done)
        model.store_transition(str(s), a, r, s_)

        # update Q model-based (planning steps using the learned model)
        for n in range(model_based_episodes):
            ss, sa = model.sample_s_a()
            sr, ss_ = model.get_r_s_(ss, sa)
            agent.learn(ss, sa, sr, str(ss_), done)

        episode_reward += r
        s = s_
        if done:
            # assumed termination check, mirroring the other training loops
            break
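
# The Model used in the Dyna-Q style loop above is defined elsewhere.  The
# sketch below shows only the interface the loop relies on: store_transition()
# records an observed (s, a) -> (r, s_) transition, sample_s_a() draws a
# previously seen state-action pair, and get_r_s_() returns its stored outcome
# for the planning updates.  Details of the real class may differ.
import random

class DynaModelSketch:
    def __init__(self, actions):
        self.actions = actions
        self.transitions = {}  # (state, action) -> (reward, next_state)

    def store_transition(self, s, a, r, s_):
        self.transitions[(s, a)] = (r, s_)

    def sample_s_a(self):
        # Draw a random previously visited (state, action) pair.
        return random.choice(list(self.transitions.keys()))

    def get_r_s_(self, s, a):
        # Return the stored (reward, next_state) for that pair.
        return self.transitions[(s, a)]
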
if __name__ == "__main__":
    ### START CODE HERE ###
    # Train a D_Q_Agent on the maze, then demonstrate the learned policy.
    env = Maze()
    training_epoch = 100 if maze == '1' else 1000
    agent = D_Q_Agent(training_epoch)
    for episode in range(training_epoch):
        agent.if_rewarded = False
        s = env.reset()
        while True:
            # env.render()  # Comment out all render() calls to disable the GUI and speed up training.
            chosen_direction = agent.choose_action(s, episode)
            s_, r, done = env.step(chosen_direction)
            agent.update_Q_value(s, chosen_direction, r)
            if s_[-1]:
                agent.if_rewarded = True
                agent.if_rewarded_in_the_whole_training = True
            s = s_
            agent.simulative_training(100)
            if done:
                # env.render()
                time.sleep(0.5)
                break
        print('episode:', episode)

    print('Training Finished! Now Demonstrate the Optimal Policy:')
    while True: