import os

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Project-local modules (import paths assumed; adjust to the actual layout):
# from maze_env import Maze          # the 3x3 maze environment
# from dsdp_env import dsdp          # the dsdp environment used by train_dqn
# from RL_brain import DeepQNetwork  # the DQN agent


def train_hdqnm(seed, file):
    # maze game layout: 3x3 grid with one hell cell, one door and one oval goal
    MAZE_H = 3
    MAZE_W = 3
    hell_coord = [0, 2]
    door_coord = [2, 2]
    oval_coord = [1, 1]

    np.random.seed(seed)
    tf.set_random_seed(seed)

    # maze game
    env = Maze(MAZE_H, MAZE_W, hell_coord, door_coord, oval_coord)
    n_goals = 3
    max_episode = 10000
    controller_start = 200          # step after which the controller starts learning
    meta_controller_start = 10000   # step after which the meta-controller starts learning

    # low-level controller: its input is the state augmented with the current goal index
    controller = DeepQNetwork(env.n_actions, env.n_features + 1, 'controller',
                              optimizer='rmsprop',
                              momentum=0.9,
                              learning_rate=1e-3,
                              opt_decay=0.99,
                              reward_decay=0.99,
                              e_greedy=0,
                              e_greedy_max=0.99,
                              e_greedy_increment=1e-4,
                              e_greedy_iter=5e3,
                              replace_target_iter=200,
                              memory_size=5000,
                              output_graph=False,
                              prioritized_replay=False,
                              prioritized_replay_alpha=0.6,
                              prioritized_replay_beta0=0.4,
                              prioritized_replay_beta_iters=1e5,
                              prioritized_replay_eps=1e-6)

    # meta-controller: picks one of n_goals sub-goals from the raw state
    meta_controller = DeepQNetwork(n_goals, env.n_features, 'meta_controller',
                                   optimizer='rmsprop',
                                   momentum=0.9,
                                   learning_rate=1e-3,
                                   opt_decay=0.99,
                                   reward_decay=0.99,
                                   e_greedy=0,
                                   e_greedy_max=0.99,
                                   e_greedy_increment=1e-4,
                                   e_greedy_iter=1e3,
                                   replace_target_iter=200,
                                   memory_size=500,
                                   output_graph=False,
                                   prioritized_replay=True,
                                   prioritized_replay_alpha=0.6,
                                   prioritized_replay_beta0=0.4,
                                   prioritized_replay_beta_iters=1e3,
                                   prioritized_replay_eps=1e-6)

    def play_maze():
        """Run one greedy (test-mode) episode and return its score."""
        def goal_reached(g, s):
            return (s[2 * g + 1] == 0) and (s[2 * g + 2] == 0)

        s = env.reset()
        g = meta_controller.choose_action(s, test=True)
        score = 0
        while True:
            env.render()
            a = controller.choose_action(np.hstack((s, g)), test=True)
            s_, r, done = env.step(a)
            score += r
            if done:
                break
            s = s_
            if goal_reached(g, s):
                g = meta_controller.choose_action(s, test=True)
        return score

    def run_maze(max_episode):
        def goal_reached(g, s):
            return (s[2 * g + 1] == 0) and (s[2 * g + 2] == 0)

        def goal_distance(g, s):
            return np.sqrt(s[2 * g + 1] ** 2 + s[2 * g + 2] ** 2)

        def reward(g, s_, done):
            # intrinsic reward for the controller: +1 once the chosen goal is
            # reached, -1 otherwise (distance-based and per-goal shaped variants
            # were tried and left disabled)
            return 1 if goal_reached(g, s_) else -1

        step = 0
        score_list = []
        avescore_list = []
        testscore_list = []
        flag = [False, False, False, False]

        for episode in range(max_episode):
            # initial observation
            s = env.reset()
            score = 0
            g = meta_controller.choose_action(s)
            Done = False
            while True:
                F = 0            # extrinsic return accumulated while pursuing goal g
                s0 = s           # state in which goal g was chosen
                episode_step = 1
                while True:
                    env.render()
                    a = controller.choose_action(np.hstack((s, g)))
                    s_, f, done = env.step(a)
                    score += f
                    r = reward(g, s_, done)
                    controller.memory.add(np.hstack((s, g)), a, r,
                                          np.hstack((s_, g)), done)
                    if (step > controller_start) and (step % 5 == 0):
                        controller.learn()
                    if step == meta_controller_start:
                        print('\nmeta controller start learn~~~~~~~~~~~~~~~~~~~~')
                    if step == controller_start:
                        print('\ncontroller start learn~~~~~~~~~~~~~~~~~~~~~~~~~~')
                    F = F + f
                    s = s_
                    step = step + 1
                    if done:
                        Done = True
                        break
                    if goal_reached(g, s):
                        break
                    episode_step = episode_step + 1
                if goal_reached(g, s):
                    # store the meta-transition (s0, g, F, s) and train the meta-controller
                    meta_controller.memory.add(s0, g, F, s, done)
                    if step > meta_controller_start:
                        meta_controller.learn()
                if Done:
                    break
                g = meta_controller.choose_action(s)

            score_list.append(score)

            if episode > 0 and episode % 50 == 0:
                # average training score over the last 50 episodes
                avescore = np.average(np.array(score_list[-50:]))
                avescore_list.append(avescore)
                print("\nepisode %d : average score = %f" % (episode, avescore))

                # greedy test score
                testscore = play_maze()
                testscore_list.append(testscore)
                print("episode %d : test score = %f\n" % (episode, testscore))

                # log the first time each score threshold is reached
                if avescore > 2.5 and not flag[0]:
                    flag[0] = True
                    with open(file, 'a+') as logf:
                        logf.write('train average score achieves 2.5 (' + str(avescore) + ')\n')
                elif avescore > 0.5 and not flag[1]:
                    flag[1] = True
                    with open(file, 'a+') as logf:
                        logf.write('train average score achieves 0.5 (' + str(avescore) + ')\n')
                if testscore > 2.5 and not flag[2]:
                    flag[2] = True
                    with open(file, 'a+') as logf:
                        logf.write('test score achieves 2.5 (' + str(testscore) + ')\n')
                elif testscore > 0.5 and not flag[3]:
                    flag[3] = True
                    with open(file, 'a+') as logf:
                        logf.write('test score achieves 0.5 (' + str(testscore) + ')\n')

            if episode > 0 and episode % 10 == 0:
                # recent training losses
                if step > controller_start:
                    print('controller loss:', np.mean(controller.cost_his[np.max(
                        [0, controller.learn_step_counter - 100]):controller.learn_step_counter]))
                if step > meta_controller_start:
                    print('meta controller loss:', np.mean(meta_controller.cost_his[np.max(
                        [0, meta_controller.learn_step_counter - 100]):meta_controller.learn_step_counter]))

        # end of game
        print('game over')
        env.destroy()
        plt.plot(range(len(avescore_list)), avescore_list)
        plt.show()
        np.savetxt("E:/my_py_project/rl/log/maze33/hdqnm/SEED_%d_avescore_log.txt" % seed,
                   avescore_list)
        np.savetxt("E:/my_py_project/rl/log/maze33/hdqnm/SEED_%d_testscore_log.txt" % seed,
                   testscore_list)

    env.after(100, run_maze, max_episode)
    env.mainloop()
    controller.plot_cost()
    meta_controller.plot_cost()
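# A minimal driver sketch (an assumption, not from the source): run train_hdqnm
# over several seeds, all appending their milestone lines to one log file. The
# seed list and log path are illustrative, and repeated calls assume that
# DeepQNetwork scopes its TensorFlow variables so multiple runs can coexist.
def run_hdqnm_seeds(seeds=(1, 2, 3), log_file='hdqnm_milestones.txt'):
    for seed in seeds:
        train_hdqnm(seed, log_file)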
if __name__ == "__main__":
    RL = DeepQNetwork(187, 96,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    file_path = os.getcwd() + '\\ajax.txt'
    parse_txt(file_path)
    RL.plot_cost()
def train_dqn(seed, file):
    np.random.seed(seed)
    tf.set_random_seed(seed)

    # dsdp game
    env = dsdp()
    max_episode = 20000
    dqn_start = 5000  # step after which the agent starts learning

    dqn = DeepQNetwork(env.n_actions, env.n_features, 'dqn',
                       optimizer='rmsprop',
                       momentum=0.9,
                       learning_rate=0.00025,
                       opt_decay=0.99,
                       reward_decay=0.99,
                       e_greedy=0,
                       e_greedy_max=0.99,
                       e_greedy_increment=1e-4,
                       e_greedy_iter=5e3,
                       replace_target_iter=200,
                       memory_size=10000,
                       output_graph=False,
                       prioritized_replay=True,
                       prioritized_replay_alpha=0.6,
                       prioritized_replay_beta0=0.4,
                       prioritized_replay_beta_iters=1e5,
                       prioritized_replay_eps=1e-6)

    def play_maze():
        """Run one greedy (test-mode) episode and return its score."""
        def goal_reached(g, s):  # unused in the flat-DQN baseline
            return s[1] == g

        s = env.reset()
        score = 0
        while True:
            env.render()
            a = dqn.choose_action(s, test=True)
            s_, r, done = env.step(a)
            score += r
            if done:
                break
            s = s_
        return score

    def run_maze(max_episode):
        step = 0
        score_list = []
        avescore_list = []
        testscore_list = []
        flag = [False, False, False, False]

        for episode in range(max_episode):
            # initial observation
            s = env.reset()
            score = 0
            while True:
                env.render()
                a = dqn.choose_action(s)
                s_, r, done = env.step(a)
                score += r
                dqn.memory.add(s, a, r, s_, done)
                if (step > dqn_start) and (step % 5 == 0):
                    dqn.learn()
                if step == dqn_start:
                    print('\ndqn start learn~~~~~~~~~~~~~~~~~~~~')
                s = s_
                step = step + 1
                if done:
                    break

            score_list.append(score)

            if episode > 0 and episode % 50 == 0:
                # average training score over the last 50 episodes
                avescore = np.average(np.array(score_list[-50:]))
                avescore_list.append(avescore)
                print("\nepisode %d : average score = %f" % (episode, avescore))

                # test score averaged over 5 greedy episodes
                testscore = 0
                for i in range(5):
                    testscore += play_maze()
                testscore /= 5
                testscore_list.append(testscore)
                print("episode %d : test score = %f\n" % (episode, testscore))

                # log the first time each score threshold is reached
                if avescore > 0.1 and not flag[0]:
                    flag[0] = True
                    with open(file, 'a+') as logf:
                        logf.write('train average score achieves 0.1 (' + str(avescore) + ')\n')
                elif avescore > 0.02 and not flag[1]:
                    flag[1] = True
                    with open(file, 'a+') as logf:
                        logf.write('train average score achieves 0.02 (' + str(avescore) + ')\n')
                if testscore > 0.1 and not flag[2]:
                    flag[2] = True
                    with open(file, 'a+') as logf:
                        logf.write('test score achieves 0.1 (' + str(testscore) + ')\n')
                elif testscore > 0.02 and not flag[3]:
                    flag[3] = True
                    with open(file, 'a+') as logf:
                        logf.write('test score achieves 0.02 (' + str(testscore) + ')\n')

            # loss
            if episode > 0 and episode % 10 == 0:
                if step > dqn_start:
                    print('dqn loss:', np.mean(dqn.cost_his[np.max(
                        [0, dqn.learn_step_counter - 100]):dqn.learn_step_counter]))

        # end of game
        print('game over')
        env.destroy()
        plt.plot(range(len(avescore_list)), avescore_list)
        plt.show()
        np.savetxt("E:/my_py_project/rl/log/dsdp/dqn/SEED_%d_avescore_log.txt" % seed,
                   avescore_list)
        np.savetxt("E:/my_py_project/rl/log/dsdp/dqn/SEED_%d_testscore_log.txt" % seed,
                   testscore_list)

    env.after(100, run_maze, max_episode)
    env.mainloop()
    dqn.plot_cost()
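# A small post-processing sketch (an assumption, not from the source): load the
# per-seed average-score logs written by np.savetxt above and plot the mean
# learning curve across seeds. The log directory and seed list are illustrative.
def plot_mean_avescore(log_dir="E:/my_py_project/rl/log/dsdp/dqn", seeds=(1, 2, 3)):
    curves = [np.loadtxt("%s/SEED_%d_avescore_log.txt" % (log_dir, s)) for s in seeds]
    n = min(len(c) for c in curves)                # align runs of different lengths
    mean_curve = np.mean([c[:n] for c in curves], axis=0)
    plt.plot(range(n), mean_curve)
    plt.xlabel("evaluation index (every 50 episodes)")
    plt.ylabel("average training score")
    plt.show()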