Example #1
def take_action_eps_greedy(board: np.ndarray, episode: int, mainQN: QNetwork,
                           gs: GameState) -> Tuple[Winner, int]:
    """t+1での行動を返す
    boardは入力の型(README参照)で与えること
    returnは勝利判定と打った手"""
    # 徐々に最適行動のみをとる、ε-greedy法
    epsilon = 0.001 + 0.9 / (1.0 + episode)

    if epsilon <= np.random.uniform(0, 1):
        retTargetQs = mainQN.model.predict(board)[0]
        s = gs.outputs_to_move_max(retTargetQs)  # choose the action with the largest predicted Q-value

    else:
        s = gs.random_play()  # act randomly

    return s
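For intuition, the exploration rate used above, epsilon = 0.001 + 0.9 / (1.0 + episode), starts near 0.9 and decays toward 0.001, so early episodes explore heavily while later episodes are almost purely greedy. A minimal, self-contained sketch of the decay (plain Python, independent of the repository's classes):

# Exploration rate schedule used in take_action_eps_greedy
for episode in (0, 1, 9, 99, 999):
    epsilon = 0.001 + 0.9 / (1.0 + episode)
    print(f"episode {episode:4d}: epsilon = {epsilon:.4f}")
# episode    0: epsilon = 0.9010
# episode    1: epsilon = 0.4510
# episode    9: epsilon = 0.0910
# episode   99: epsilon = 0.0100
# episode  999: epsilon = 0.0019
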
Example #2
def learn(model_config_path=None, weight_path=None):
    config = Config()
    qc = config.Qlearn

    total_reward_vec = np.zeros(qc.num_consecutive_iterations)  # stores the reward of each recent episode
    # Create the Q-networks, the replay memory and the actor --------------------------------------------------------
    if model_config_path is None or weight_path is None:
        mainQN = QNetwork(config)  # main Q-network (chooses actions)
        mainQN.build()
        targetQN = QNetwork(config)  # target Q-network (computes values)
        targetQN.build()
    else:
        mainQN = QNetwork(config)
        success_load = mainQN.load(model_config_path, weight_path)
        if not success_load:
            raise FileNotFoundError(
                f"Could not load {model_config_path} / {weight_path}")
        targetQN = QNetwork(config)
        targetQN.load(model_config_path, weight_path)
    memory = Memory(max_size=qc.memory_size)

    for episode in trange(qc.num_episodes):  # repeat for the configured number of episodes
        gs = GameState()
        state = gs.random_play()  # the first step is a random move
        episode_reward = 0

        targetQN.model.set_weights(
            mainQN.model.get_weights())  # sync the target network with the main network

        for t in range(qc.max_number_of_steps):  # loop over turns (two plies per iteration: ours and the opponent's)
            board = gs.to_inputs()

            state, action = take_action_eps_greedy(board, episode, mainQN,
                                                   gs)  # decide the action at time t
            # next_state, reward, done, info = env.step(action)   # compute s_{t+1} and r_t resulting from action a_t

            # verbose ==========
            # if t % 10 == 9:
            #     print(gs)
            # ==================

            if state == Winner.minus:
                reward = qc.reward_win  # reward for winning
            else:
                reward = 0

            next_board = gs.to_inputs()

            # board = next_board  # state update
            # handling when the episode has ended
            if state != Winner.not_ended:
                episode_reward += reward  # update the running total reward
                memory.add((board, action, reward, next_board))  # add the experience to memory
                # learn / update the Q-network weights (experience replay)
                if len(memory) > qc.batch_size:  # and not islearned:
                    mainQN.replay(memory, qc.batch_size, qc.gamma, targetQN)
                if qc.DQN_MODE:
                    targetQN.model.set_weights(
                        mainQN.model.get_weights())  # sync the target network with the main network

                total_reward_vec = np.hstack(
                    (total_reward_vec[1:], episode_reward))  # record the reward
                print(
                    '%d/%d: Episode finished after %d time steps / mean %f winner: %s'
                    % (episode + 1, qc.num_episodes, t + 1,
                       total_reward_vec.mean(),
                       'plus' if state == Winner.plus else 'minus'))
                break

            state, _ = gs.random_play()

            if state == Winner.plus:
                reward = qc.reward_lose
            else:
                reward = 0

            episode_reward += reward  # update the running total reward
            memory.add((board, action, reward, next_board))  # add the experience to memory

            # learn / update the Q-network weights (experience replay)
            if len(memory) > qc.batch_size:  # and not islearned:
                mainQN.replay(memory, qc.batch_size, qc.gamma, targetQN)

            if qc.DQN_MODE:
                targetQN.model.set_weights(
                    mainQN.model.get_weights())  # sync the target network with the main network

            # handling when the episode has ended
            if state != Winner.not_ended:
                total_reward_vec = np.hstack(
                    (total_reward_vec[1:], episode_reward))  # record the reward
                print(
                    '%d/%d: Episode finished after %d time steps / mean %f winner: %s'
                    % (episode + 1, qc.num_episodes, t + 1,
                       total_reward_vec.mean(),
                       'plus' if state == Winner.plus else 'minus'))
                break

        # stop early based on the mean reward over the last few episodes
        # if total_reward_vec.mean() >= goal_average_reward:
        #     print('Episode %d train agent successfully!' % episode)
        # islearned = True
        if episode % qc.save_interval == qc.save_interval - 1:
            d = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
            mainQN.save(f"results/001_QLearning/{d}-mainQN.json",
                        f"results/001_QLearning/{d}-mainQN.h5")
            with open(f"results/001_QLearning/{d}-config.json", 'x') as f:
                json.dump(config._to_dict(), f, indent=4)

    # final save (skipped if a save just happened at the end of the last episode)
    if episode % qc.save_interval != qc.save_interval - 1:
        d = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        mainQN.save(f"results/001_QLearning/{d}-mainQN.json",
                    f"results/001_QLearning/{d}-mainQN.h5")
        with open(f"results/001_QLearning/{d}-config.json", 'x') as f:
            json.dump(config._to_dict(), f, indent=4)
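
mainQN.replay(memory, qc.batch_size, qc.gamma, targetQN) and the Memory class are specific to this repository, so their internals are not reproduced here. As a rough, hypothetical illustration only, a standard DQN replay step builds its regression targets from the sampled transitions along these lines (all array names below are made up for the sketch; NumPy only):

import numpy as np

# Hypothetical sampled batch of 3 transitions with 7 possible actions.
gamma = 0.99
rewards       = np.array([1.0, 0.0, -1.0])
done          = np.array([True, False, True])   # did the episode end here?
actions       = np.array([2, 5, 0])             # actions actually taken
q_main        = np.random.rand(3, 7)            # stand-in for mainQN.model.predict(boards)
q_target_next = np.random.rand(3, 7)            # stand-in for targetQN.model.predict(next_boards)

# Standard DQN target: r_t + gamma * max_a' Q_target(s_{t+1}, a'),
# with no bootstrap term once the episode has ended.
targets = q_main.copy()
targets[np.arange(3), actions] = rewards + (~done) * gamma * q_target_next.max(axis=1)
# mainQN would then be fit toward targets on the corresponding boards.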