Example #1
0
    def replay(self, wps, pi_mcts, board_logs, plus_turns, weights,
               batch_size: int, beta: float) -> None:
        """Run one training pass of ``self.model`` on a random mini-batch.

        ``batch_size`` distinct positions are drawn (without replacement)
        from the parallel sequences ``wps``, ``pi_mcts``, ``board_logs``,
        ``plus_turns`` and ``weights``; all five are indexed with the same
        sampled indices, which range over ``len(wps)``.

        Args:
            wps: Per-position value targets, fed to the model's second
                output.
            pi_mcts: Per-position policy targets; each entry is raised to
                ``beta`` and stored in a row of length 315.
                NOTE(review): assumes entries are NumPy arrays so that
                ``**`` is elementwise -- confirm against the caller.
            board_logs: Per-position boards; each one is assigned to
                ``GameState.board`` to build the network input.
            plus_turns: Per-position flags; the input is built with
                ``flip=not plus_turn``.
            weights: Per-position weights passed to ``fit`` as
                ``sample_weight``.
            batch_size: Number of positions to sample. ``np.random.choice``
                raises ``ValueError`` if this exceeds ``len(wps)``.
            beta: Exponent applied to every sampled policy target.
        """
        # Pre-allocated batch buffers that are filled row by row below.
        states = np.zeros((batch_size, 7, 5, 3))
        policy_targets = np.zeros((batch_size, 315))
        value_targets = np.zeros(batch_size)
        sample_weights = np.zeros(batch_size)

        # Sample distinct positions, then pull the matching entry out of
        # every parallel column before any network input is built.
        population = np.arange(len(wps))
        picked = np.random.choice(population, size=batch_size, replace=False)
        columns = (wps, pi_mcts, board_logs, plus_turns, weights)
        samples = [tuple(column[k] for column in columns) for k in picked]

        for row, sample in enumerate(samples):
            outcome, policy, position, is_plus_turn, importance = sample
            state = GameState()
            state.board = position
            # to_inputs() must produce data that fits the (7, 5, 3) slot.
            states[row] = state.to_inputs(flip=not is_plus_turn)
            policy_targets[row] = policy**beta
            value_targets[row] = outcome
            sample_weights[row] = importance

        # epochs is the number of passes over the training data;
        # verbose=0 turns off progress output.
        targets = [policy_targets, value_targets]
        self.model.fit(states, targets,
                       sample_weight=sample_weights,
                       epochs=1,
                       verbose=0,
                       shuffle=True)