Example #1
import unittest

import numpy as np

# GameState is a project-local class; its import is omitted here because the
# module path depends on the repository layout.


class TestGameState(unittest.TestCase):
    def setUp(self):
        self.gs = GameState()

    def test_outputs_to_move_max(self):
        outputs = np.linspace(0.0, 1.0, 100)
        self.gs.outputs_to_move_max(outputs)
        self.assertTrue((self.gs.board == np.array([[-1, -1, -2, -1, -1],
                                                    [0, 0, 0, 0, 1],
                                                    [0, 0, 0, 0, 0],
                                                    [0, 0, 0, 0, 0],
                                                    [1, 1, 2, 1, 0]])).all())
        self.gs.outputs_to_move_max(outputs)
        self.assertFalse((self.gs.board == np.array([[-1, -1, -2, -1, -1],
                                                     [0, 0, 0, 0, 1],
                                                     [0, 0, 0, 0, 0],
                                                     [0, 0, 0, 0, 0],
                                                     [1, 1, 2, 1, 0]])).all())

    def test_outputs_to_move_random(self):
        outputs = np.linspace(0.0, 1.0, 100)
        outputs /= np.sum(outputs)
        self.gs.outputs_to_move_random(outputs)

    def test_flip(self):
        self.assertTrue((self.gs.to_inputs() == self.gs.to_inputs(True)).all())
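The normalized linspace prepared in test_outputs_to_move_random forms a valid probability distribution over the network's 100 outputs. The short, self-contained sketch below (it does not use GameState) reproduces that normalization and shows how a move index could then be sampled from it with numpy; treating the outputs as sampling weights is only an assumption about what outputs_to_move_random might do internally.

import numpy as np

# Normalize the raw outputs so they sum to 1 and can serve as probabilities.
outputs = np.linspace(0.0, 1.0, 100)
outputs /= np.sum(outputs)
assert np.isclose(outputs.sum(), 1.0)

# Sample a move index weighted by the normalized outputs (illustrative only).
move_index = np.random.choice(len(outputs), p=outputs)
print(move_index)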
Example #2
    def replay(self, wps, pi_mcts, board_logs, plus_turns, weights,
               batch_size: int, beta: float) -> None:
        inputs = np.zeros((batch_size, 7, 5, 3))
        policy_true = np.zeros((batch_size, 315))
        values_true = np.zeros((batch_size))
        input_weights = np.zeros((batch_size))
        indices = np.random.choice(np.arange(len(wps)),
                                   size=batch_size,
                                   replace=False)
        mini_batch = [(wps[i], pi_mcts[i], board_logs[i], plus_turns[i],
                       weights[i]) for i in indices]

        for i, (winner, pi, board, plus_turn, weight) in enumerate(mini_batch):
            gs = GameState()
            gs.board = board
            inputs[i] = gs.to_inputs(flip=not plus_turn)  # must match the per-sample input shape declared above
            policy_true[i] = pi**beta
            values_true[i] = winner
            input_weights[i] = weight

        # epochs is the number of passes over the training data; verbose=0 suppresses progress output
        self.model.fit(inputs, [policy_true, values_true],
                       sample_weight=input_weights,
                       epochs=1,
                       verbose=0,
                       shuffle=True)
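The core of replay is uniform minibatch sampling without replacement over the stored self-play records, followed by per-sample target construction. The sketch below reproduces just that sampling pattern with dummy data; the array names and sizes (315 policy entries, winners in {-1, 0, 1}) mirror the snippet above, but the data itself is fabricated for illustration.

import numpy as np

# Dummy replay buffer: winners, MCTS policy targets and per-sample weights.
rng = np.random.default_rng(0)
n_samples, batch_size, beta = 128, 32, 1.0
wps = rng.integers(-1, 2, size=n_samples)        # game outcomes in {-1, 0, 1}
pi_mcts = rng.random((n_samples, 315))           # MCTS visit-count policies
weights = np.ones(n_samples)                     # per-sample importance weights

# Sample batch_size distinct indices, then build the training targets.
indices = np.random.choice(np.arange(n_samples), size=batch_size, replace=False)
policy_true = pi_mcts[indices] ** beta           # beta sharpens or flattens the policy
values_true = wps[indices].astype(float)
input_weights = weights[indices]
print(policy_true.shape, values_true.shape, input_weights.shape)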
Example #3
import datetime
import json

import numpy as np
from tqdm import trange

# Config, QNetwork, Memory, GameState, Winner and take_action_eps_greedy are
# project-local names; their imports are omitted because the module paths
# depend on the repository layout.


def learn(model_config_path=None, weight_path=None):
    config = Config()
    qc = config.Qlearn

    total_reward_vec = np.zeros(qc.num_consecutive_iterations)  # stores the reward of each episode
    # Create the Q-networks and the replay memory --------------------------------------------------------
    if model_config_path is None or weight_path is None:
        mainQN = QNetwork(config)  # main Q-network (used for action selection)
        mainQN.build()
        targetQN = QNetwork(config)  # target Q-network (used to compute values)
        targetQN.build()
    else:
        mainQN = QNetwork(config)
        success_load = mainQN.load(model_config_path, weight_path)
        if not success_load:
            raise FileNotFoundError(
                f"Could not load {model_config_path} or {weight_path}")
        targetQN = QNetwork(config)
        targetQN.load(model_config_path, weight_path)
    memory = Memory(max_size=qc.memory_size)

    for episode in trange(qc.num_episodes):  # loop over episodes
        gs = GameState()
        state = gs.random_play()  # take a random action on the first step
        episode_reward = 0

        targetQN.model.set_weights(
            mainQN.model.get_weights())  # sync the target network with the main network

        for t in range(qc.max_number_of_steps):  # loop; each iteration plays two moves (own and opponent's)
            board = gs.to_inputs()

            state, action = take_action_eps_greedy(board, episode, mainQN,
                                                   gs)  # decide the action at time t
            # next_state, reward, done, info = env.step(action)   # compute s_{t+1} and r_{t} by executing action a_t

            # verbose ==========
            # if t % 10 == 9:
            #     print(gs)
            # ==================

            if state == Winner.minus:
                reward = qc.reward_win  # reward for winning
            else:
                reward = 0

            next_board = gs.to_inputs()

            # board = next_board  # state update
            # end-of-episode handling
            if state != Winner.not_ended:
                episode_reward += reward  # update the cumulative reward
                memory.add((board, action, reward, next_board))  # update the replay memory
                # train/update the Q-network weights via experience replay
                if len(memory) > qc.batch_size:  # and not islearned:
                    mainQN.replay(memory, qc.batch_size, qc.gamma, targetQN)
                if qc.DQN_MODE:
                    targetQN.model.set_weights(
                        mainQN.model.get_weights())  # sync the target network with the main network

                total_reward_vec = np.hstack(
                    (total_reward_vec[1:], episode_reward))  # record the reward
                print(
                    '%d/%d: Episode finished after %d time steps / mean %f winner: %s'
                    % (episode + 1, qc.num_episodes, t + 1,
                       total_reward_vec.mean(),
                       'plus' if state == Winner.plus else 'minus'))
                break

            state, _ = gs.random_play()  # opponent plays a random move

            if state == Winner.plus:
                reward = qc.reward_lose
            else:
                reward = 0

            episode_reward += reward  # update the cumulative reward
            memory.add((board, action, reward, next_board))  # update the replay memory

            # train/update the Q-network weights via experience replay
            if len(memory) > qc.batch_size:  # and not islearned:
                mainQN.replay(memory, qc.batch_size, qc.gamma, targetQN)

            if qc.DQN_MODE:
                targetQN.model.set_weights(
                    mainQN.model.get_weights())  # sync the target network with the main network

            # end-of-episode handling
            if state != Winner.not_ended:
                total_reward_vec = np.hstack(
                    (total_reward_vec[1:], episode_reward))  # record the reward
                print(
                    '%d/%d: Episode finished after %d time steps / mean %f winner: %s'
                    % (episode + 1, qc.num_episodes, t + 1,
                       total_reward_vec.mean(),
                       'plus' if state == Winner.plus else 'minus'))
                break

        # Decide termination based on the mean reward over several episodes
        # if total_reward_vec.mean() >= goal_average_reward:
        #     print('Episode %d train agent successfully!' % episode)
        # islearned = True
        if episode % qc.save_interval == qc.save_interval - 1:
            d = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
            mainQN.save(f"results/001_QLearning/{d}-mainQN.json",
                        f"results/001_QLearning/{d}-mainQN.h5")
            with open(f"results/001_QLearning/{d}-config.json", 'x') as f:
                json.dump(config._to_dict(), f, indent=4)

    # Final save (skipped if a save happened just before)
    if episode % qc.save_interval != qc.save_interval - 1:
        d = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        mainQN.save(f"results/001_QLearning/{d}-mainQN.json",
                    f"results/001_QLearning/{d}-mainQN.h5")
        with open(f"results/001_QLearning/{d}-config.json", 'x') as f:
            json.dump(config._to_dict(), f, indent=4)
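A brief usage sketch, assuming learn from the snippet above is importable: call it with no arguments to train a fresh QNetwork, or pass a previously saved config/weight pair to resume. The paths below are placeholders that follow the save pattern used inside learn.

if __name__ == "__main__":
    # Fresh run: builds mainQN and targetQN from Config().
    learn()

    # Resuming from a checkpoint (placeholder timestamp in the file names):
    # learn(model_config_path="results/001_QLearning/<timestamp>-mainQN.json",
    #       weight_path="results/001_QLearning/<timestamp>-mainQN.h5")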