Code Example #1
    def self_play_one_game(self, game: ConnectNGame) \
            -> List[Tuple[NetGameState, ActionProbs, NDArray[(Any), np.float]]]:
        """

        :param game:
        :return:
            Sequence of (s, pi, z) of a complete game play. The number of list is the game play length.
        """

        states: List[NetGameState] = []
        probs: List[ActionProbs] = []
        current_players: List[np.float] = []

        while not game.game_over:
            move, move_probs = self._get_action(game)
            states.append(convert_game_state(game))
            probs.append(move_probs)
            current_players.append(game.current_player)
            game.move(move)

        # z: +1 for states whose player to move eventually won,
        # -1 where that player lost; ties keep the default 0
        current_player_z = np.zeros(len(current_players))
        current_player_z[np.array(current_players) == game.game_result] = 1.0
        current_player_z[np.array(current_players) == -game.game_result] = -1.0
        self.reset()

        return list(zip(states, probs, current_player_z))
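
Each record pairs the encoded board state s with the MCTS visit-count distribution pi and the eventual game outcome z from the perspective of the player to move, i.e. the (s, pi, z) training target used in AlphaGo Zero-style self-play.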
Code Example #2
    def _playout(self, game: ConnectNGame, node: TreeNode):
        """Run a single playout from the root to the leaf, getting a value at
        the leaf and propagating it back through its parents.
        State is modified in-place, so a copy must be provided.
        """
        while True:
            if node.is_leaf():
                break
            # Greedily select next move.
            action, node = node.select()
            game.move(action)

        # Evaluate the leaf using a network which outputs a list of
        # (action, probability) tuples p and also a score v in [-1, 1]
        # for the current player.
        action_and_probs, leaf_value = self.rollout_policy_value_fn(game)
        # Check for end of game.
        end, winner = game.game_over, game.game_result
        if not end:
            # expand the leaf with the prior probabilities from the rollout policy
            for action, prob in action_and_probs:
                node.expand(action, prob)

        # Evaluate the leaf by simulating the game to the end with the rollout
        # policy; the simulation result replaces the value estimate above.
        player = game.current_player
        result = self._rollout_simulate_to_end(game)
        if result == ConnectNGame.RESULT_TIE:
            leaf_value = float(ConnectNGame.RESULT_TIE)
        else:
            leaf_value = 1.0 if result == player else -1.0

        node.propagate_to_root(leaf_value)
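
The playout above relies on TreeNode's is_leaf(), select(), expand() and propagate_to_root() methods, which are not shown in these listings. The following is a rough, hypothetical sketch (not the project's actual implementation) of such a node using a PUCT-style selection rule: each node keeps a visit count, a running mean value Q and a prior probability P, and select() picks the child maximizing Q plus an exploration bonus proportional to P.

import math
from typing import Dict, Optional, Tuple


class TreeNodeSketch:
    """Hypothetical MCTS tree node; method names mirror the ones used above."""

    def __init__(self, parent: Optional['TreeNodeSketch'], prior: float,
                 c_puct: float = 5.0):
        self.parent = parent
        self.children: Dict[int, 'TreeNodeSketch'] = {}  # action -> child node
        self.visit_num = 0
        self.q = 0.0          # running mean value from the current player's view
        self.prior = prior    # prior probability P(s, a) of reaching this node
        self.c_puct = c_puct

    def is_leaf(self) -> bool:
        return not self.children

    def expand(self, action: int, prob: float) -> 'TreeNodeSketch':
        child = TreeNodeSketch(self, prob, self.c_puct)
        self.children[action] = child
        return child

    def _score(self) -> float:
        # PUCT: exploitation term Q plus an exploration bonus scaled by the prior
        u = (self.c_puct * self.prior *
             math.sqrt(self.parent.visit_num) / (1 + self.visit_num))
        return self.q + u

    def select(self) -> Tuple[int, 'TreeNodeSketch']:
        # pick the (action, child) pair with the highest PUCT score
        return max(self.children.items(), key=lambda item: item[1]._score())

    def propagate_to_root(self, leaf_value: float) -> None:
        # update this node's statistics, then recurse upward with the sign
        # flipped, since parent and child are evaluated by opposing players
        self.visit_num += 1
        self.q += (leaf_value - self.q) / self.visit_num
        if self.parent is not None:
            self.parent.propagate_to_root(-leaf_value)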
Code Example #3
 def _rollout_simulate_to_end(self, game: ConnectNGame) -> GameResult:
     """Use the rollout policy to play until the end of the game,
     returning +1 if the current player wins, -1 if the opponent wins,
     and 0 if it is a tie.
     """
     while True:
         end, result = game.game_over, game.game_result
         if end:
             break
         action_probs = self.rollout_policy_fn(game)
         max_action = max(action_probs, key=itemgetter(1))[0]
         game.move(max_action)
     return result
Code Example #4
def train():
    initial_game = ConnectNGame(board_size=args.board_size, n=args.n_in_row)
    game_records: deque[Tuple[NetGameState, ActionProbs, NDArray[(Any),
                                                                 np.float]]]
    game_records = deque(maxlen=args.buffer_size)

    policy_value_net = PolicyValueNet(args.board_size,
                                      args.board_size,
                                      use_gpu=args.use_cuda)
    alpha_go_zero_player = MCTSAlphaGoZeroPlayer(policy_value_net,
                                                 playout_num=args.playout_num)

    for i in range(args.game_batch_num):
        game = copy.deepcopy(initial_game)
        one_game_records = alpha_go_zero_player.self_play_one_game(game)
        episode_len = len(one_game_records)
        game_records.extend(one_game_records)
        logging.warning(
            f'batch i:{i + 1}, episode_len:{episode_len}, records_total:{len(game_records)}'
        )
        if len(game_records) <= args.batch_size:
            continue
        # training
        training_batch = random.sample(game_records, args.batch_size)
        loss, entropy = update_policy(training_batch, policy_value_net)
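
The gradient step is delegated to update_policy(), which is not included in these listings. As a rough sketch under assumptions, if the network is a PyTorch module returning log move probabilities and a scalar value, one step could minimize the AlphaGo Zero loss l = (z - v)^2 - pi^T log p; the function name update_policy_sketch, the net argument and the externally supplied optimizer below are illustrative, not the project's actual interface.

import numpy as np
import torch
import torch.nn.functional as F


def update_policy_sketch(training_batch, net: torch.nn.Module,
                         optimizer: torch.optim.Optimizer) -> float:
    """Hypothetical single training step on a batch of (s, pi, z) records."""
    states, mcts_probs, winners = zip(*training_batch)
    state_batch = torch.from_numpy(np.array(states)).float()
    probs_batch = torch.from_numpy(np.array(mcts_probs)).float()
    winner_batch = torch.from_numpy(np.array(winners)).float()

    optimizer.zero_grad()
    log_act_probs, value = net(state_batch)  # assumed (log p, v) output
    value_loss = F.mse_loss(value.view(-1), winner_batch)                 # (z - v)^2
    policy_loss = -torch.mean(torch.sum(probs_batch * log_act_probs, 1))  # -pi^T log p
    loss = value_loss + policy_loss
    loss.backward()
    optimizer.step()
    return loss.item()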
Code Example #5
File: battle.py  Project: MyEncyclopedia/ConnectNGym
def minimax_mcts():
    initial_game = ConnectNGame(board_size=4, n=3)
    strategy = PlannedMinimaxStrategy(initial_game)
    strategy.load_state()

    planned_minimax_agent = AIAgent(strategy)
    mcts_rollout_player = MCTSRolloutPlayer(playout_num=1000)
    battle(initial_game, planned_minimax_agent, mcts_rollout_player, n_games=20)
Code Example #6
 def rollout_policy_value_fn(
         self, game: ConnectNGame) -> Tuple[Iterator[MoveWithProb], float]:
     """a function that takes in a state and outputs a list of (action, probability)
     tuples and a score for the state"""
     # return uniform probabilities and 0 score for pure MCTS
     move_list = game.get_avail_pos()
     action_probs = np.ones(len(move_list)) / len(move_list)
     return zip(move_list, action_probs), 0
Code Example #7
    def _get_action(self, game: ConnectNGame) -> Tuple[Pos, ActionProbs]:
        epsilon = 0.25
        avail_pos = game.get_avail_pos()
        move_probs: ActionProbs = np.zeros(game.board_size * game.board_size)
        assert len(avail_pos) > 0

        # the pi defined in AlphaGo Zero paper
        acts, act_probs = self._next_step_play_act_probs(game)
        move_probs[list(acts)] = act_probs
        if self._is_training:
            # add Dirichlet Noise when training in favour of exploration
            p_ = (1 - epsilon) * act_probs + epsilon * np.random.dirichlet(
                0.3 * np.ones(len(act_probs)))
            move = np.random.choice(acts, p=p_)
            assert move in game.get_avail_pos()
        else:
            move = np.random.choice(acts, p=act_probs)

        self.reset()
        return move, move_probs
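
The epsilon = 0.25 above mirrors the root exploration noise in the AlphaGo Zero paper, which mixes the move probabilities as p = (1 - epsilon) * pi + epsilon * Dir(alpha) during self-play; the Dirichlet parameter (0.3 here) is typically chosen relative to the game's branching factor, with the papers using 0.03 for Go and 0.3 for chess.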
Code Example #8
    def action(self, game: ConnectNGame) -> Tuple[GameResult, Pos]:
        game = copy.deepcopy(game)

        player = game.current_player
        best_result = player * -1  # assume opponent win as worst result
        best_move = None
        for move in game.get_avail_pos():
            game.move(move)
            status = game.get_status()
            game.undo()

            result = self.dp_map[status]

            if player == ConnectNGame.PLAYER_A:
                best_result = max(best_result, result)
            else:
                best_result = min(best_result, result)
            # update best_move if any improvement
            best_move = move if best_result == result else best_move
            # print(f'move {move} => {result}')

        # if best_result == game.currentPlayer:
        #     return best_result, move

        return best_result, best_move
Code Example #9
def test_model(policy_value_net):
    initial_game = ConnectNGame(board_size=args.board_size, n=args.n_in_row)
    alphago_zero_player = MCTSAlphaGoZeroPlayer(policy_value_net,
                                                playout_num=args.playout_num)
    mcts_rollout_player = MCTSRolloutPlayer(
        playout_num=args.rollout_playout_num)
    win_ratio = battle(initial_game,
                       alphago_zero_player,
                       mcts_rollout_player,
                       n_games=3)
    logging.warning(f'current self-play win_ratio:{win_ratio:.3f}')
    policy_value_net.save_model('./current_policy.model')
    if win_ratio > args.best_win_ratio:
        logging.warning(f'best policy {win_ratio:.3f}')
        args.best_win_ratio = win_ratio  # remember the new best win ratio
        # update the best_policy
        policy_value_net.save_model('./best_policy.model')
Code Example #10
 def policy_value_fn(
         self, board: ConnectNGame) -> Tuple[Iterator[MoveWithProb], float]:
     """
     input: board
     output: a list of (action, probability) tuples for each available
     action and the score of the board state
     """
     avail_pos_list = board.get_avail_pos()
     game_state = convert_game_state(board)
     current_state = np.ascontiguousarray(
         game_state.reshape(-1, 4, self.board_width, self.board_height))
     if self.use_gpu:
         log_act_probs, value = self.policy_value_net(
             Variable(torch.from_numpy(current_state)).cuda().float())
         pos_probs = np.exp(log_act_probs.data.cpu().numpy().flatten())
     else:
         log_act_probs, value = self.policy_value_net(
             Variable(torch.from_numpy(current_state)).float())
         pos_probs = np.exp(log_act_probs.data.numpy().flatten())
     value = float(value.data[0][0])
     return zip(avail_pos_list, pos_probs), value
Code Example #11
    def _playout(self, game: ConnectNGame):
        """
        From current game status, run a sequence down to a leaf node, either because game ends or unexplored node.
        Get the leaf value of the leaf node, either the actual reward of game or action value returned by policy net.
        And propagate upwards to root node.

        :param game:
        """
        player_id = game.current_player

        node = self._current_root
        while True:
            if node.is_leaf():
                break
            act, node = node.select()
            game.move(act)

        # now game state is a leaf node in the tree, either a terminal node or an unexplored node
        act_and_probs: Iterator[MoveWithProb]
        act_and_probs, leaf_value = self._policy_value_net.policy_value_fn(
            game)

        if not game.game_over:
            # case where encountering an unexplored leaf node, update leaf_value estimated by policy net to root
            for act, prob in act_and_probs:
                game.move(act)
                child_node = node.expand(act, prob)
                game.undo()
        else:
            # case where game ends, update actual leaf_value to root
            if game.game_result == ConnectNGame.RESULT_TIE:
                leaf_value = ConnectNGame.RESULT_TIE
            else:
                leaf_value = 1 if game.game_result == player_id else -1
            leaf_value = float(leaf_value)

        # Update leaf_value and propagate up to root node
        node.propagate_to_root(-leaf_value)
Code Example #12
        # draw horizontal grid lines
        for r in range(self.board_size):
            y = self.start_y + r * self.grid_size
            pygame.draw.line(screen, (0, 0, 0), [self.start_x, y],
                             [self.start_x + self.grid_size * (self.board_size - 1), y], 2)

        # draw vertical grid lines
        for c in range(self.board_size):
            x = self.start_x + c * self.grid_size
            pygame.draw.line(screen, (0, 0, 0), [x, self.start_y],
                             [x, self.start_y + self.grid_size * (self.board_size - 1)], 2)

        for r in range(self.board_size):
            for c in range(self.board_size):
                piece = self.connect_n_game.board[r][c]
                if piece != ConnectNGame.AVAILABLE:
                    if piece == ConnectNGame.PLAYER_A:
                        color = (0, 0, 0)
                    else:
                        color = (255, 255, 255)

                    x = self.start_x + c * self.grid_size
                    y = self.start_y + r * self.grid_size
                    pygame.draw.circle(screen, color, [x, y], self.grid_size // 2)


if __name__ == '__main__':
    connectNGame = ConnectNGame()
    board = PyGameBoard(connectNGame)
    while not board.is_game_over():
        pos = board.next_user_input()
        board.move(pos)


    pygame.quit()
Code Example #13
 def rollout_policy_fn(self, game: ConnectNGame) -> Iterator[MoveWithProb]:
     """a coarse, fast version of policy_fn used in the rollout phase."""
     # rollout randomly
     action_probs = np.random.rand(len(game.get_avail_pos()))
     return zip(game.get_avail_pos(), action_probs)
Code Example #14
                    self.dp_map[game.get_status()] = result, move
                game.undo()
                ret = max(ret, result)
                best_move = move if ret == result else best_move
            self.dp_map[game_status] = ret, best_move
            return ret, best_move
        else:
            ret = math.inf
            for pos in game.get_avail_pos():
                move = pos
                result = game.move(pos)

                if result is None:
                    assert not game.game_over
                    result, opp_move = self.minimax(game.get_status())
                    self.dp_map[game.get_status()] = result, opp_move
                else:
                    self.dp_map[game.get_status()] = result, move
                game.undo()
                ret = min(ret, result)
                best_move = move if ret == result else best_move
            self.dp_map[game_status] = ret, best_move
            return ret, best_move


if __name__ == '__main__':
    tic_tac_toe = ConnectNGame(n=3, board_size=3)
    strategy = CountingMinimaxStrategy()
    strategy.action(tic_tac_toe)
    print(f'Game States Number {len(strategy.dp_map)}')
Code Example #15
 def action(self, game: ConnectNGame) -> Tuple[GameResult, Pos]:
     self.game = copy.deepcopy(game)
     self.dp_map = {}
     result, move = self.minimax(game.get_status())
     return result, move
Code Example #16
        board = [[ConnectNGame.AVAILABLE] * N for _ in range(N)]

        for r in range(N):
            for c in range(N):
                board[c][N - 1 - r] = status[r][c]

        return tuple([tuple(board[i]) for i in range(N)])

    def save_state(self):
        import pickle
        with open(
                f'planned_minimax_{self.game.n}_{self.game.board_size}.pickle',
                'wb') as handle:
            pickle.dump(self.dp_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load_state(self):
        import pickle
        with open(
                f'planned_minimax_{self.game.n}_{self.game.board_size}.pickle',
                'rb') as handle:
            self.dp_map = pickle.load(handle)


if __name__ == '__main__':
    connect_n_game = ConnectNGame(n=3, board_size=4)

    strategy = PlannedMinimaxStrategy(connect_n_game)
    # strategy.save_state()
    strategy.load_state()
    print(strategy.action(connect_n_game))
Code Example #17
    play(env, planned_minimax_agent, planned_minimax_agent)


def play(env: ConnectNGym, agent1: BaseAgent, agent2: BaseAgent, render=True) -> GameResult:
    agents = [agent1, agent2]

    env.reset()
    board = env.pygame_board
    done = False
    agent_id = -1
    while not done:
        agent_id = (agent_id + 1) % 2
        agent = agents[agent_id]
        action = agent.get_action(board)
        _, reward, done, info = env.step(action)
        if render:
            env.render()

        if done:
            print(f'result={reward}')
            return reward


if __name__ == '__main__':
    board = PyGameBoard(connect_n_game=ConnectNGame(board_size=3, n=3))
    env = ConnectNGym(board)
    env.render(True)

    play_ai_vs_ai(env)
    # play_human_vs_ai(env)
Code Example #18
                if result is None:
                    assert not game.game_over
                    self.alpha_beta_stack.append((alpha, beta))
                    result, opp_move = self.alpha_beta_dp(game.get_status())
                    self.alpha_beta_stack.pop()
                game.undo()
                beta = min(beta, result)
                ret = min(ret, result)
                best_move = move if ret == result else best_move
                if alpha >= beta or ret == -1:
                    return ret, move
            return ret, best_move


if __name__ == '__main__':
    tic_tac_toe = ConnectNGame(n=5, board_size=7)
    # strategy = MinimaxDPStrategy(tic_tac_toe)
    strategy = AlphaBetaDPStrategy(tic_tac_toe)
    print(strategy.action())
    sys.exit(1)

    tic_tac_toe = ConnectNGame(N=5, board_size=5)
    # tic_tac_toe.move(0, 0)
    # tic_tac_toe.move(1, 1)
    # tic_tac_toe.move(1, 2)
    # tic_tac_toe.move(1, 0)
    # tic_tac_toe.move(0, 1)
    # tic_tac_toe.drawText()
    # strategy1 = MinimaxDPStrategy(tic_tac_toe)
    # strategy2 = AlphaBetaStrategy(tic_tac_toe)
    # strategy3 = AlphaBetaDPStrategy(tic_tac_toe)