def main(): """ 強化学習の学習環境用の迷路探索問題 ・エージェントの経路選択ロジックは、逐一訪問モンテカルロ法 """ print("Start main()") random.seed(1) np.random.seed(1) #=================================== # 学習環境、エージェント生成フェイズ #=================================== #----------------------------------- # Academy の生成 #----------------------------------- academy = MazeAcademy(max_episode=NUM_EPISODE, max_time_step=NUM_TIME_STEP, save_step=25) #----------------------------------- # Brain の生成 #----------------------------------- # 行動方策のためのパラメーターを表形式(行:状態 s、列:行動 a)で定義 # ※行動方策を表形式で実装するために、これに対応するパラメーターも表形式で実装する。 # 進行方向に壁があって進めない様子を表現するために、壁で進めない方向には `np.nan` で初期化する。 # 尚、状態 s8 は、ゴール状態で行動方策がないため、これに対応するパラメーターも定義しないようにする。 brain_parameters = np.array([ # a0="Up", a1="Right", a3="Down", a4="Left" [np.nan, 1, 1, np.nan], # s0 [np.nan, 1, np.nan, 1], # s1 [np.nan, np.nan, 1, 1], # s2 [1, 1, 1, np.nan], # s3 [np.nan, np.nan, 1, 1], # s4 [1, np.nan, np.nan, np.nan], # s5 [1, np.nan, np.nan, np.nan], # s6 [1, 1, np.nan, np.nan], # s7 ]) brain = MazeEveryVisitMCBrain(n_states=AGANT_NUM_STATES, n_actions=AGANT_NUM_ACTIONS, brain_parameters=brain_parameters, epsilon=BRAIN_GREEDY_EPSILON, gamma=BRAIN_GAMMDA) #----------------------------------- # Agent の生成 #----------------------------------- agent = MazeAgent(brain=brain, gamma=BRAIN_GAMMDA, state0=AGENT_INIT_STATE) # Agent の Brain を設定 agent.set_brain(brain) # 学習環境に作成したエージェントを追加 academy.add_agent(agent) agent.print("after init()") brain.print("after init()") #=================================== # エピソードの実行 #=================================== academy.academy_run() agent.print("after simulation") brain.print("after simulation") #=================================== # 実行結果の描写処理 #=================================== #--------------------------------------------- # 利得の履歴の plot #--------------------------------------------- reward_historys = agent.get_reward_historys() plt.clf() plt.plot( range(0, NUM_EPISODE + 1), reward_historys, label='gamma = {}'.format(BRAIN_GAMMDA), linestyle='-', #linewidth = 2, color='black') plt.title("Reward History") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.legend(loc="lower right") plt.tight_layout() plt.savefig( "MazaSimple_EveryVisitMC_Reward_episode{}.png".format(NUM_EPISODE), dpi=300, bbox_inches="tight") plt.show() #--------------------------------------------- # 状態 s0 ~ s7 での状態価値関数の値を plot # ※ 状態 s8 は、ゴール状態で行動方策がないため、これに対応する状態価値関数も定義されない。 #--------------------------------------------- # 各エピソードでの状態価値関数 v_function_historys = agent.get_v_function_historys() # list<ndarray> / shape=[n_episode,n_state] # → list[ndarray] / shape = [n_episode,] v_function_historys_s0 = [] v_function_historys_s1 = [] v_function_historys_s2 = [] v_function_historys_s3 = [] v_function_historys_s4 = [] v_function_historys_s5 = [] v_function_historys_s6 = [] v_function_historys_s7 = [] v_function_historys_s8 = [] for v_function in v_function_historys: v_function_historys_s0.append(v_function[0]) v_function_historys_s1.append(v_function[1]) v_function_historys_s2.append(v_function[2]) v_function_historys_s3.append(v_function[3]) v_function_historys_s4.append(v_function[4]) v_function_historys_s5.append(v_function[5]) v_function_historys_s6.append(v_function[6]) v_function_historys_s7.append(v_function[7]) v_function_historys_s8.append(0) plt.clf() # S0 plt.subplot(3, 3, 1) plt.plot( range(0, NUM_EPISODE + 1), v_function_historys_s0, label='S0', linestyle='-', #linewidth = 2, color='black') plt.title("V functions / S0") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( 
[-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.tight_layout() # S1 plt.subplot(3, 3, 2) plt.plot( range(0, NUM_EPISODE + 1), v_function_historys_s1, label='S1', linestyle='-', #linewidth = 2, color='black') plt.title("V functions / S1") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.tight_layout() # S2 plt.subplot(3, 3, 3) plt.plot( range(0, NUM_EPISODE + 1), v_function_historys_s2, label='S2', linestyle='-', #linewidth = 2, color='black') plt.title("V functions / S2") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.tight_layout() # S3 plt.subplot(3, 3, 4) plt.plot( range(0, NUM_EPISODE + 1), v_function_historys_s3, label='S3', linestyle='-', #linewidth = 2, color='black') plt.title("V functions / S3") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.tight_layout() # S4 plt.subplot(3, 3, 5) plt.plot( range(0, NUM_EPISODE + 1), v_function_historys_s4, label='S4', linestyle='-', #linewidth = 2, color='black') plt.title("V functions / S4") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.tight_layout() # S5 plt.subplot(3, 3, 6) plt.plot( range(0, NUM_EPISODE + 1), v_function_historys_s5, label='S5', linestyle='-', #linewidth = 2, color='black') plt.title("V functions / S5") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.tight_layout() # S6 plt.subplot(3, 3, 7) plt.plot( range(0, NUM_EPISODE + 1), v_function_historys_s6, label='S6', linestyle='-', #linewidth = 2, color='black') plt.title("V functions / S6") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.tight_layout() # S7 plt.subplot(3, 3, 8) plt.plot( range(0, NUM_EPISODE + 1), v_function_historys_s7, label='S7', linestyle='-', #linewidth = 2, color='black') plt.title("V functions / S7") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.tight_layout() # S8 plt.subplot(3, 3, 9) plt.plot( range(0, NUM_EPISODE + 1), v_function_historys_s8, label='S8', linestyle='-', #linewidth = 2, color='black') plt.title("V functions / S8") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.tight_layout() plt.savefig( "MazaSimple_EveryVisitMC_VFunction_episode{}.png".format(NUM_EPISODE), dpi=300, bbox_inches="tight") plt.show() #--------------------------------------------- # 行動価値関数を plot #--------------------------------------------- Q_function_historys = agent.get_q_function_historys() Q_function = Q_function_historys[-1] def draw_q_function(q_func): """ Q関数をグリッド上に分割したヒータマップで描写する。 | |↑ | | |←|平均|→| | |↓| | """ import matplotlib.cm as cm # color map n_row = 3 # Maze の行数 n_col = 3 # Maze の列数 n_qrow = n_row * 3 n_qcol = n_col * 3 q_draw_map = np.zeros(shape=(n_qrow, n_qcol)) for i in range(n_row): for j in range(n_col): k = i * n_row + j # 状態の格子番号 if (k == 8): break _i = 1 + (n_row - 1 - i) * 3 _j = 1 + j * 3 q_draw_map[_i][_j - 1] = q_func[k][3] # Left q_draw_map[_i - 1][_j] = q_func[k][2] # Down q_draw_map[_i][_j + 1] = q_func[k][1] # Right q_draw_map[_i + 1][_j] = q_func[k][0] # Up q_draw_map[_i][_j] = np.mean(q_func[k]) q_draw_map = np.nan_to_num(q_draw_map) #print( "q_draw_map :", q_draw_map ) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) plt.imshow(q_draw_map, cmap=cm.RdYlGn, interpolation="bilinear", vmax=abs(q_draw_map).max(), vmin=-abs(q_draw_map).max()) plt.colorbar() ax.set_xlim(-0.5, n_qcol - 0.5) 
        ax.set_ylim(-0.5, n_qrow - 0.5)
        ax.set_xticks(np.arange(-0.5, n_qcol, 3))
        ax.set_yticks(np.arange(-0.5, n_qrow, 3))
        #ax.set_xticklabels( range(n_col+1) )
        #ax.set_yticklabels( range(n_row+1) )

        # Draw the walls
        ax.plot([1 * n_col - 0.5, 1 * n_row - 0.5],
                [0 * n_col - 0.5, 1 * n_row - 0.5],
                color='black', linewidth=2)
        ax.plot([1 * n_col - 0.5, 2 * n_row - 0.5],
                [2 * n_col - 0.5, 2 * n_row - 0.5],
                color='black', linewidth=2)
        ax.plot([2 * n_col - 0.5, 2 * n_row - 0.5],
                [2 * n_col - 0.5, 1 * n_row - 0.5],
                color='black', linewidth=2)
        ax.plot([2 * n_col - 0.5, 3 * n_row - 0.5],
                [1 * n_col - 0.5, 1 * n_row - 0.5],
                color='black', linewidth=2)

        # Draw the state labels S0 ~ S8
        ax.text(0.5 * n_col - 0.5, 2.5 * n_row - 0.5, 'S0', size=14, ha='center')
        ax.text(0.5 * n_col - 0.5, 2.3 * n_row - 0.5, 'START', ha='center')
        ax.text(0.5 * n_col - 0.5, 2.1 * n_row - 0.5, 'reward : -0.01', ha='center')
        ax.text(1.5 * n_col - 0.5, 2.5 * n_row - 0.5, 'S1', size=14, ha='center')
        ax.text(1.5 * n_col - 0.5, 2.1 * n_row - 0.5, 'reward : -0.01', ha='center')
        ax.text(2.5 * n_col - 0.5, 2.5 * n_row - 0.5, 'S2', size=14, ha='center')
        ax.text(2.5 * n_col - 0.5, 2.1 * n_row - 0.5, 'reward : -0.01', ha='center')
        ax.text(0.5 * n_col - 0.5, 1.5 * n_row - 0.5, 'S3', size=14, ha='center')
        ax.text(0.5 * n_col - 0.5, 1.1 * n_row - 0.5, 'reward : -0.01', ha='center')
        ax.text(1.5 * n_col - 0.5, 1.5 * n_row - 0.5, 'S4', size=14, ha='center')
        ax.text(1.5 * n_col - 0.5, 1.1 * n_row - 0.5, 'reward : -0.01', ha='center')
        ax.text(2.5 * n_col - 0.5, 1.5 * n_row - 0.5, 'S5', size=14, ha='center')
        ax.text(2.5 * n_col - 0.5, 1.1 * n_row - 0.5, 'reward : -0.01', ha='center')
        ax.text(0.5 * n_col - 0.5, 0.5 * n_row - 0.5, 'S6', size=14, ha='center')
        ax.text(0.5 * n_col - 0.5, 0.1 * n_row - 0.5, 'reward : -0.01', ha='center')
        ax.text(1.5 * n_col - 0.5, 0.5 * n_row - 0.5, 'S7', size=14, ha='center')
        ax.text(1.5 * n_col - 0.5, 0.1 * n_row - 0.5, 'reward : -0.01', ha='center')
        ax.text(2.5 * n_col - 0.5, 0.5 * n_row - 0.5, 'S8', size=14, ha='center')
        ax.text(2.5 * n_col - 0.5, 0.3 * n_row - 0.5, 'GOAL', ha='center')
        ax.text(2.5 * n_col - 0.5, 0.1 * n_row - 0.5, 'reward : +1.0', ha='center')

        # Hide the axes
        plt.tick_params(axis='both', which='both',
                        bottom='off', top='off', labelbottom='off',
                        right='off', left='off', labelleft='off')
        ax.grid(which="both")

        plt.title("Q function")
        plt.savefig(
            "MazaSimple_EveryVisitMC_Qfunction_episode{}.png".format(NUM_EPISODE),
            dpi=200, bbox_inches="tight")
        plt.show()
        return

    draw_q_function(Q_function)

    print("Finish main()")
    return
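# -----------------------------------------------------------------------------
# A minimal sketch of the every-visit Monte Carlo update assumed by
# MazeEveryVisitMCBrain above. The history format, function name, and the
# constant-alpha update are illustrative assumptions, not the actual class
# internals.
# -----------------------------------------------------------------------------
def every_visit_mc_update(q_function, history, gamma, alpha=0.1):
    """
    q_function : array-like, shape [n_states, n_actions]
    history    : list of (state, action, reward) tuples for one finished episode
    """
    G = 0.0
    # Walk the episode backwards, accumulating the discounted return, and
    # update Q(s, a) at *every* visit of (s, a), not just the first one.
    for (state, action, reward) in reversed(history):
        G = reward + gamma * G
        q_function[state][action] += alpha * (G - q_function[state][action])
    return q_function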
def main(): """ 強化学習の学習環境用の迷路探索問題 ・エージェントの経路選択ロジックはSarsa """ print('Start main()') random.seed(1) np.random.seed(1) #========================== # 学習環境・エージジェント生成 #========================== #-------------------------- # Academy(envの役割)の生成 #-------------------------- academy = MazeAcademy(max_episode=NUM_EPISODE, max_time_step=NUM_TIME_STEP, save_step=25) #-------------------------- # Brainの生成 #-------------------------- # 行動方策のためのパラメータを表形式(行:状態s、列:行動a)で定義 # *行動方策を表形式で実装するために、対応するパラメータも表形式で実装する # 進行方向に壁があって進めない場合は、np.nanで初期化する # s8はゴール状態であるため、パラメータは定義しない # ----------- # | s0 s1 s2| # | -- | # | s3 s4|s5| # | --| # | s6|s7 s8| # ----------- brain_parameters = np.array([ # a0='Up', a1='Right', a2='Down', a3='Left' [np.nan, 1, 1, np.nan], # s0 [np.nan, 1, np.nan, 1], # s1 [np.nan, np.nan, 1, 1], # s2 [1, 1, 1, np.nan], # s3 [np.nan, np.nan, 1, 1], # s4 [1, np.nan, np.nan, np.nan], # s5 [1, np.nan, np.nan, np.nan], # s6 [1, 1, np.nan, np.nan], # s7 ]) brain = MazeSarsaBrain(n_states=AGENT_NUM_STATES, n_actions=AGENT_NUM_ACTIONS, brain_parameters=brain_parameters, epsilon=BRAIN_GREEDY_EPSILON, gamma=BRAIN_GAMMA, learning_rate=BRAIN_LEARNING_RATE) #--------------------------- # Agentの生成 #--------------------------- agent = MazeAgent(brain=brain, gamma=BRAIN_GAMMA, state0=AGENT_INIT_STATE) # AgentのBrainを設定 agent.set_brain(brain) # 学習環境に作成したAgentを追加 academy.add_agent(agent) agent.print('after init()') brain.print('after init()') #============================ # エピソードの実行 #============================ academy.academy_run() agent.print('after simulation') brain.print('after simulation') return
class Environment:
    # Problem-specific constants
    GOAL_BLOCK = "diamond_block"
    WARN_BLOCK = "obsidian"
    SAFE_BLOCK = "cobblestone"

    def __init__(self, maze):
        self.maze = maze
        self.agMaze = [r.replace('P', '.') for r in maze]
        self.mission_xml = self.getMissionXML(maze)
        self.mission = MalmoPython.MissionSpec(self.mission_xml, True)
        self.mission_record = MalmoPython.MissionRecordSpec()
        self.malmo = MalmoPython.AgentHost()
        self.agent = MazeAgent(self)
        self.attempt = 1
        self.goalReached = False

    def __translateCoord(self, c, r):
        return (len(self.maze[0]) - 1 - c, len(self.maze) - 1 - r)

    def __generatePit(self, c, r):
        maze = self.maze

        # Generate pit
        result = '<DrawBlock x="{}" y="5" z="{}" type="lava"/>'.format(c, r)

        # Generate obsidian warning blocks, as long as there isn't another tile
        # already there, and the tile would not be generated outside of the
        # maze's bounds (probably a better way to do this, but meh)
        obsidians = list(filter(
            lambda b: b[0] != 0 and b[1] != 0
                      and b[0] < len(maze[0]) and b[1] < len(maze)
                      and maze[len(maze) - b[1] - 1][len(maze[0]) - b[0] - 1] == ".",
            [(c - 1, r), (c + 1, r), (c, r - 1), (c, r + 1)]))
        for coord in obsidians:
            result = result + '<DrawBlock x="{}" y="{}" z="{}" type="{}"/>'.format(
                coord[0], FLOOR_LEVEL, coord[1], Environment.WARN_BLOCK)
        return result

    def __generateMaze(self):
        # maze grid is symmetrical
        maze = self.maze
        rows = len(maze)
        cols = len(maze[0])

        # Base grid at 0,0
        result = '<DrawCuboid x1="0" y1="{}" z1="0" x2="{}" y2="{}" z2="{}" type="{}"/>'.format(
            FLOOR_LEVEL, cols - 1, FLOOR_LEVEL, rows - 1, Environment.SAFE_BLOCK)

        # Parse special cell contents
        for r, row in enumerate(maze):
            for c, cell in enumerate(row):
                tcoord = self.__translateCoord(c, r)
                if cell == "*":
                    self.playerPos = tcoord
                elif cell == "X":
                    result = result + '<DrawBlock x="{}" y="{}" z="{}" type="{}"/>'.format(
                        tcoord[0], FLOOR_LEVEL + 1, tcoord[1], Environment.SAFE_BLOCK)
                elif cell == "G":
                    result = result + '<DrawBlock x="{}" y="{}" z="{}" type="{}"/>'.format(
                        tcoord[0], FLOOR_LEVEL, tcoord[1], Environment.GOAL_BLOCK)
                elif cell == "P":
                    result = result + self.__generatePit(tcoord[0], tcoord[1])
        return result

    def getMissionXML(self, maze):
        mazeXML = self.__generateMaze()
        return '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
            <Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
                <About>
                    <Summary>Maze Pitfalls!</Summary>
                </About>

                <ServerSection>
                    <ServerInitialConditions>
                        <Time>
                            <StartTime>1000</StartTime>
                            <AllowPassageOfTime>false</AllowPassageOfTime>
                        </Time>
                        <Weather>clear</Weather>
                    </ServerInitialConditions>
                    <ServerHandlers>
                        <FlatWorldGenerator generatorString="3;7,2*3,11;8;village"/>
                        <DrawingDecorator>
                            <DrawCuboid x1="-50" y1="''' + str(FLOOR_LEVEL) + '''" z1="-50" x2="50" y2="''' + str(FLOOR_LEVEL) + '''" z2="50" type="air"/>
                            ''' + mazeXML + '''
                        </DrawingDecorator>
                        <ServerQuitFromTimeUp timeLimitMs="''' + str(MISSION_TIME) + '''"/>
                        <ServerQuitWhenAnyAgentFinishes/>
                    </ServerHandlers>
                </ServerSection>

                <AgentSection mode="Survival">
                    <Name>BlindBot</Name>
                    <AgentStart>
                        <Placement x="''' + str(self.playerPos[0] + 0.5) + '''" y="''' + str(FLOOR_LEVEL + 1.0) + '''" z="''' + str(self.playerPos[1] + 0.5) + '''"/>
                    </AgentStart>
                    <AgentHandlers>
                        <DiscreteMovementCommands/>
                        <AgentQuitFromTouchingBlockType>
                            <Block type="lava"/>
                        </AgentQuitFromTouchingBlockType>
                        <ObservationFromGrid>
                            <Grid name="floor3x3">
                                <min x="-1" y="-1" z="-1"/>
                                <max x="1" y="-1" z="1"/>
                            </Grid>
                        </ObservationFromGrid>
                    </AgentHandlers>
                </AgentSection>
            </Mission>'''

    def goalTest(self, block):
        if block == Environment.GOAL_BLOCK:
            self.goalReached = True
            print("[$] Mission Successful! Deaths: " + str(self.attempt - 1))

    def startMission(self):
        if self.attempt <= MAX_RETRIES:
            print("[~] Starting mission - Attempt #" + str(self.attempt))
            self.malmo.startMission(self.mission, self.mission_record)

            # Wait for the mission to begin
            world_state = self.malmo.getWorldState()
            while not world_state.has_mission_begun:
                sys.stdout.write(".")
                time.sleep(0.1)
                world_state = self.malmo.getWorldState()
            print("[!] Mission started!")

            # Let the agent act once per tick until the mission ends or the
            # goal is reached
            while world_state.is_mission_running and not self.goalReached:
                time.sleep(TICK_LENGTH)
                self.agent.act()
                world_state = self.malmo.getWorldState()

            # If the agent died (e.g. fell into lava), retry the mission
            if not self.goalReached:
                self.agent.die()
                self.attempt += 1
                self.startMission()
        else:
            print("[X] GAME OVER!")
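# -----------------------------------------------------------------------------
# Hypothetical usage sketch for Environment. The maze layout below is
# illustrative only; the cell symbols match what __generateMaze parses:
# '.' = safe floor, '*' = start, 'G' = goal, 'P' = pit, 'X' = raised block.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    maze = ["G..P.",
            ".P...",
            "...P.",
            ".P...",
            "....*"]
    env = Environment(maze)
    env.startMission()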
def main(): """ 強化学習の学習環境用の迷路探索問題 ・エージェントの経路選択ロジックは、等確率な方向から1つを無作為選択 """ print("Start main()") random.seed(8) np.random.seed(8) #=================================== # 学習環境、エージェント生成フェイズ #=================================== #----------------------------------- # Academy の生成 #----------------------------------- academy = MazeAcademy(max_episode=NUM_EPISODE, max_time_step=NUM_TIME_STEP, save_step=25) #----------------------------------- # Brain の生成 #----------------------------------- # 行動方策のためのパラメーターを表形式(行:状態 s、列:行動 a)で定義 # ※行動方策を表形式で実装するために、これに対応するパラメーターも表形式で実装する。 # 進行方向に壁があって進めない様子を表現するために、壁で進めない方向には `np.nan` で初期化する。 # 尚、状態 s8 は、ゴール状態で行動方策がないため、これに対応するパラメーターも定義しないようにする。 brain_parameters = np.array([ # a0="Up", a1="Right", a3="Down", a4="Left" [np.nan, 1, 1, np.nan], # s0 [np.nan, 1, np.nan, 1], # s1 [np.nan, np.nan, 1, 1], # s2 [1, 1, 1, np.nan], # s3 [np.nan, np.nan, 1, 1], # s4 [1, np.nan, np.nan, np.nan], # s5 [1, np.nan, np.nan, np.nan], # s6 [1, 1, np.nan, np.nan], # s7 ]) brain = MazeRamdomBrain(n_states=AGANT_NUM_STATES, n_actions=AGANT_NUM_ACTIONS, brain_parameters=brain_parameters) #----------------------------------- # Agent の生成 #----------------------------------- agent = MazeAgent(brain=brain, gamma=BRAIN_GAMMDA, state0=AGENT_INIT_STATE) # Agent の Brain を設定 agent.set_brain(brain) # 学習環境に作成したエージェントを追加 academy.add_agent(agent) agent.print("after init()") brain.print("after init()") #=================================== # エピソードの実行 #=================================== academy.academy_run() agent.print("after simulation") brain.print("after simulation") #=================================== # 実行結果の描写処理 #=================================== #--------------------------------------------- # 利得の履歴の plot #--------------------------------------------- reward_historys = agent.get_reward_historys() plt.clf() plt.plot( range(0, NUM_EPISODE + 1), reward_historys, label='gamma = {}'.format(BRAIN_GAMMDA), linestyle='-', #linewidth = 2, color='black') plt.title("Reward History") plt.xlim(0, NUM_EPISODE + 1) #plt.ylim( [-0.1, 1.05] ) plt.xlabel("Episode") plt.grid() plt.legend(loc="lower right") plt.tight_layout() plt.savefig("MazaSimple_Ramdom_Reward_episode{}.png".format(NUM_EPISODE), dpi=300, bbox_inches="tight") plt.show() print("Finish main()") return