def take_action_eps_greedy(board: np.ndarray, episode: int, mainQN: QNetwork, gs: GameState) -> Tuple[Winner, int]:
    """Choose the action for step t+1 using the epsilon-greedy policy.

    ``board`` must already be in the network input format (see README).
    Returns the (win judgement, move played) pair produced by the game state.
    """
    # Exploration probability shrinks as episodes accumulate, so play
    # becomes increasingly greedy over the course of training.
    epsilon = 0.001 + 0.9 / (1.0 + episode)
    draw = np.random.uniform(0, 1)
    if draw < epsilon:
        # Explore: play a random move.
        return gs.random_play()
    # Exploit: play the move with the highest predicted Q-value.
    q_values = mainQN.model.predict(board)[0]
    return gs.outputs_to_move_max(q_values)
def learn(model_config_path=None, weight_path=None):
    """Train the Q-network through self-play episodes.

    When ``model_config_path``/``weight_path`` are omitted, fresh main and
    target networks are built; otherwise both are loaded from those paths.
    Experience tuples are pushed into a replay memory, and the model plus
    config are checkpointed under ``results/001_QLearning/`` every
    ``save_interval`` episodes and once more after the final episode
    (unless that episode was just checkpointed).

    Raises:
        FileNotFoundError: if the given model config/weights cannot be loaded.
    """
    config = Config()
    qc = config.Qlearn
    # Rolling window holding the reward of the most recent episodes.
    total_reward_vec = np.zeros(qc.num_consecutive_iterations)
    # Create the Q-networks, the replay memory, and the actor --------------
    if model_config_path is None or weight_path is None:
        mainQN = QNetwork(config)    # main Q-network (chooses actions)
        mainQN.build()
        targetQN = QNetwork(config)  # target Q-network (evaluates values)
        targetQN.build()
    else:
        mainQN = QNetwork(config)
        success_load = mainQN.load(model_config_path, weight_path)
        if not success_load:
            raise FileNotFoundError(
                f"{model_config_path} {weight_path}が読み込めませんでした")
        targetQN = QNetwork(config)
        targetQN.load(model_config_path, weight_path)
    memory = Memory(max_size=qc.memory_size)

    for episode in trange(qc.num_episodes):  # one iteration per episode
        gs = GameState()
        state = gs.random_play()  # the very first move is random
        episode_reward = 0
        # Sync the target network with the main network at episode start.
        targetQN.model.set_weights(mainQN.model.get_weights())

        for t in range(qc.max_number_of_steps):  # loop over pairs of moves
            board = gs.to_inputs()
            # Decide the action at time t (epsilon-greedy).
            state, action = take_action_eps_greedy(board, episode, mainQN, gs)
            # next_state, reward, done, info = env.step(action)
            # verbose ==========
            # if t % 10 == 9:
            #     print(gs)
            # ==================
            if state == Winner.minus:
                # Reward for winning — assumes the learner plays the
                # "minus" side; TODO confirm against GameState.
                reward = qc.reward_win
            else:
                reward = 0
            next_board = gs.to_inputs()
            # board = next_board  # state update (disabled)

            # End-of-episode handling: our own move finished the game.
            if state != Winner.not_ended:
                episode_reward += reward  # accumulate total reward
                memory.add((board, action, reward, next_board))  # store transition
                # Learn/update the Q-network weights from replayed experience.
                if len(memory) > qc.batch_size:  # and not islearned:
                    mainQN.replay(memory, qc.batch_size, qc.gamma, targetQN)
                if qc.DQN_MODE:
                    # Plain DQN: keep target identical to main every step.
                    targetQN.model.set_weights(mainQN.model.get_weights())
                total_reward_vec = np.hstack(
                    (total_reward_vec[1:], episode_reward))  # record reward
                print(
                    '%d/%d: Episode finished after %d time steps / mean %f winner: %s' %
                    (episode + 1, qc.num_episodes, t + 1,
                     total_reward_vec.mean(),
                     'plus' if state == Winner.plus else 'minus'))
                break

            # Opponent replies with a random move.
            state, _ = gs.random_play()
            if state == Winner.plus:
                reward = qc.reward_lose  # the "plus" side (opponent) won
            else:
                reward = 0
            episode_reward += reward  # accumulate total reward
            # NOTE(review): next_board was captured before the opponent's
            # move — verify this is the intended transition to store.
            memory.add((board, action, reward, next_board))
            # Learn/update the Q-network weights from replayed experience.
            if len(memory) > qc.batch_size:  # and not islearned:
                mainQN.replay(memory, qc.batch_size, qc.gamma, targetQN)
            if qc.DQN_MODE:
                targetQN.model.set_weights(mainQN.model.get_weights())
            # End-of-episode handling: the opponent's move finished the game.
            if state != Winner.not_ended:
                total_reward_vec = np.hstack(
                    (total_reward_vec[1:], episode_reward))  # record reward
                print(
                    '%d/%d: Episode finished after %d time steps / mean %f winner: %s' %
                    (episode + 1, qc.num_episodes, t + 1,
                     total_reward_vec.mean(),
                     'plus' if state == Winner.plus else 'minus'))
                break

        # Stop once the mean reward over recent trials is good enough:
        # if total_reward_vec.mean() >= goal_average_reward:
        #     print('Episode %d train agent successfuly!' % episode)
        #     islearned = True

        # Periodic checkpoint every save_interval episodes.
        if episode % qc.save_interval == qc.save_interval - 1:
            d = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
            mainQN.save(f"results/001_QLearning/{d}-mainQN.json",
                        f"results/001_QLearning/{d}-mainQN.h5")
            with open(f"results/001_QLearning/{d}-config.json", 'x') as f:
                json.dump(config._to_dict(), f, indent=4)

    # Final save (skipped when the last episode was just checkpointed).
    if episode % qc.save_interval != qc.save_interval - 1:
        d = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        mainQN.save(f"results/001_QLearning/{d}-mainQN.json",
                    f"results/001_QLearning/{d}-mainQN.h5")
        with open(f"results/001_QLearning/{d}-config.json", 'x') as f:
            json.dump(config._to_dict(), f, indent=4)