Exemple #1
0
    def test_mcts_from_root_with_equal_priors(self):
        """MCTS from the empty board should visit a center square the most.

        The stub model gives the two edge squares (0 and 3) a slightly
        higher prior than the two center squares (1 and 2), so the search
        has to overcome the priors for the assertion to hold.
        """
        class MockModel:
            def predict(self, board):
                # Near-uniform priors (edges slightly favored) and a
                # near-zero value estimate, whatever board is passed in.
                return np.array([0.26, 0.24, 0.24, 0.26]), 0.0001

        game = Connect2Game()
        args = {'num_simulations': 50}

        model = MockModel()
        mcts = MCTS(game, model, args)
        # The search starts from the empty board.
        canonical_board = [0, 0, 0, 0]
        print("starting")
        root = mcts.run(model,
                        canonical_board,
                        to_play=1,
                        add_exploration_noise=False)

        # the best move is to play at index 1 or 2
        # The edge squares are indices 0 and 3; both must be considered
        # (the comparison previously looked at index 0 twice).
        best_outer_move = max(root.children[0].visit_count,
                              root.children[3].visit_count)
        best_center_move = max(root.children[1].visit_count,
                               root.children[2].visit_count)
        self.assertGreater(best_center_move, best_outer_move)
 def __init__(self,
              policy_value_fn,
              c_puct=5,
              n_playout=2000,
              is_selfplay=0):
     """Build the underlying MCTS search and remember the self-play flag.

     ``policy_value_fn``, ``c_puct`` and ``n_playout`` are forwarded
     positionally to ``MCTS``; ``is_selfplay`` is stored unchanged.
     """
     self.mcts = MCTS(policy_value_fn, c_puct, n_playout)
     self.is_selfplay = is_selfplay
Exemple #3
0
def play_game():
    """Play an interactive console game against the MCTS opponent.

    Each turn reads a 1-based ``row,col`` pair from standard input, applies
    it to the board, then runs ``conf.TRAINING_EPOCHS`` rollouts and lets the
    tree choose the reply. The loop stops as soon as either move yields a
    terminal board, after the winner's name has been written to ``stdout``.

    Raises:
        RuntimeError: If the move check below rejects the entered square.
        ValueError: If the input is not exactly two comma-separated integers.
    """
    def announce_if_over(position):
        # Report the winner and signal the caller to stop on a terminal board.
        if not position.terminal:
            return False
        stdout.write("\nWinner is {}".format(
            conf.PLAYERS_NAME[position.winner]))
        return True

    tree = MCTS()
    board = new_domineering_board()
    board.to_pretty_string()
    while True:
        entered = input("enter row,col: ")
        row, col = map(int, entered.split(","))
        stdout.write('You choose ({}, {})'.format(row, col))
        index = conf.BOARD_Y_SIZE * (row - 1) + (col - 1)

        # NOTE(review): the move is rejected only when the square is occupied
        # AND is_valid_move(index + BOARD_Y_SIZE) is true - confirm this is
        # the intended rule.
        square_taken = board.tup[index] is not None
        if square_taken and board.is_valid_move(index + conf.BOARD_Y_SIZE):
            raise RuntimeError("Invalid move")

        board = board.make_move(index)
        board.to_pretty_string()
        if announce_if_over(board):
            break

        # The tree is trained as the game goes: a batch of rollouts is run
        # from the current position before every reply.
        for _ in range(conf.TRAINING_EPOCHS):
            tree.do_rollout(board)
        board = tree.choose(board)
        board.to_pretty_string()
        if announce_if_over(board):
            break
Exemple #4
0
    def exceute_episode(self):
        """Self-play one game and return its training examples.

        For every move a new ``MCTS`` is run from the canonical board, and
        the root's child visit counts, normalised to sum to one, are recorded
        with the board and the player to move. The game stops once
        ``get_reward_for_player`` returns something other than ``None``.

        Returns:
            A list of ``(board, action_probs, outcome)`` tuples. ``outcome``
            is the final reward, negated for positions recorded by a player
            other than the one to move when the game ended.
        """
        history = []
        current_player = 1
        state = self.game.get_init_board()

        while True:
            canonical_board = self.game.get_canonical_board(
                state, current_player)

            # A fresh search tree is built for every move.
            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            visit_counts = [0] * self.game.get_action_size()
            for move, child in root.children.items():
                visit_counts[move] = child.visit_count
            action_probs = visit_counts / np.sum(visit_counts)
            history.append((canonical_board, current_player, action_probs))

            action = root.select_action(temperature=0)
            state, current_player = self.game.get_next_state(
                state, current_player, action)
            reward = self.game.get_reward_for_player(state, current_player)
            if reward is None:
                continue

            # [Board, actionProbabilities, Reward]
            return [(past_board, past_probs,
                     reward * ((-1)**(past_player != current_player)))
                    for past_board, past_player, past_probs in history]
Exemple #5
0
def play_game():
    """Play an interactive tic-tac-toe game against the MCTS opponent.

    Each turn reads a 1-based ``row,col`` pair from standard input and
    applies it to the 3x3 board. The tree then runs two rollouts, printing
    its children after each one, and chooses the reply. The loop ends as
    soon as either move produces a terminal board.

    Raises:
        RuntimeError: If the chosen square is already occupied.
        ValueError: If the input is not exactly two comma-separated integers.
    """
    tree = MCTS()
    board = new_tic_tac_toe_board()
    print(board.to_pretty_string())
    while True:
        entered = input("enter row,col: ")
        row, col = map(int, entered.split(","))
        index = 3 * (row - 1) + (col - 1)
        if board.tup[index] is not None:
            raise RuntimeError("Invalid move")
        board = board.make_move(index)
        print(board.to_pretty_string())
        if board.terminal:
            break

        # The tree is trained as the game goes: two rollouts per turn, with
        # the current position's successors dumped after each one.
        for _ in range(2):
            tree.do_rollout(board)
            print(tree.children)
            print(len(tree.children[board]))
            for successor in tree.children[board]:
                rendered = successor.to_pretty_string()
                print(colored(rendered, 'green'))

        board = tree.choose(board)
        print(board.to_pretty_string())
        if board.terminal:
            break
Exemple #6
0
def play_game_ocba(budget=1000, optimum=0, n0=5, sigma_0=1):
    """Run ``budget`` rollouts with the 'ocba' policy on a new tree.

    Returns:
        A ``(mcts, tree, next_tree)`` tuple, where ``next_tree`` is what
        ``mcts.choose`` returns for the root after all rollouts.
    """
    settings = {
        'policy': 'ocba',
        'budget': budget,
        'optimum': optimum,
        'n0': n0,
        'sigma_0': sigma_0,
    }
    mcts = MCTS(**settings)
    tree = new_tree()

    for _ in range(budget):
        mcts.do_rollout(tree)

    return (mcts, tree, mcts.choose(tree))
Exemple #7
0
class myPlayer(AdvancePlayer):
    """Player that picks its move by running rollouts on an MCTS tree."""

    def __init__(self, _id):
        super().__init__(_id)
        # Filled in on every SelectMove call.
        self.opponent_id = None
        self.tree = MCTS()

    def SelectMove(self, moves, game_state):
        """Return the last move of the board chosen after 30 rollouts.

        ``moves`` is accepted but not used; the board is rebuilt from
        ``game_state`` on every call.
        """
        # The opponent is the first player whose id differs from ours.
        opponent = next(
            ps for ps in game_state.players if ps.id != self.id)
        self.opponent_id = opponent.id
        self.board = new_azul_board(game_state, self.id, self.opponent_id)

        for _ in range(30):
            self.tree.do_rollout(self.board)

        chosen = self.tree.choose(self.board)
        return chosen.game_state.lastmove
Exemple #8
0
    def test_mcts_finds_best_move_with_equal_priors(self):
        """MCTS should visit square 1 more often than square 0.

        The stub model returns priors of 0.51 and 0.49 for squares 0 and 1
        (zero elsewhere) together with a near-zero value estimate.
        """
        class MockModel:
            def predict(self, board):
                policy = np.array([0.51, 0.49, 0, 0])
                value = 0.0001
                return policy, value

        game = Connect2Game()
        search_args = {'num_simulations': 25}

        model = MockModel()
        search = MCTS(game, model, search_args)
        start_board = [0, 0, -1, 1]
        root = search.run(model, start_board, to_play=1)

        # the better move is to play at index 1
        visits_at_0 = root.children[0].visit_count
        visits_at_1 = root.children[1].visit_count
        self.assertLess(visits_at_0, visits_at_1)
Exemple #9
0
def play_game_uct(budget=1000,
                  exploration_weight=1,
                  optimum=0,
                  n0=2,
                  sigma_0=1):
    """Run ``budget`` rollouts with the 'uct' policy on a new tree.

    ``optimum`` is accepted but is not forwarded to ``MCTS``.

    Returns:
        A ``(mcts, tree, next_tree)`` tuple, where ``next_tree`` is what
        ``mcts.choose`` returns for the root after all rollouts.
    """
    settings = {
        'policy': 'uct',
        'exploration_weight': exploration_weight,
        'budget': budget,
        'n0': n0,
        'sigma_0': sigma_0,
    }
    mcts = MCTS(**settings)
    tree = new_tree()

    for _ in range(budget):
        mcts.do_rollout(tree)

    return (mcts, tree, mcts.choose(tree))
Exemple #10
0
    def exceute_episode(self):
        """Self-play one game and return its training examples.

        While the 1-based step counter is below ``args['tempThreshold']``
        the search adds exploration noise and the action is selected with
        temperature 1; afterwards noise is off and the temperature is 0.
        The game stops once ``get_game_ended`` returns something other than
        ``None``.

        Returns:
            A list of ``(board, action_probs, outcome)`` tuples. ``outcome``
            is the final reward, negated for positions recorded by a player
            other than the one to move when the game ended.
        """
        history = []
        current_player = 1
        episode_step = 0
        state = self.game.get_init_board()

        while True:
            episode_step += 1

            canonical_board = self.game.get_canonical_board(
                state, current_player)

            # 1 during the early, exploratory part of the game, 0 afterwards.
            temperature = int(episode_step < self.args['tempThreshold'])
            use_noise = temperature > 0

            # A fresh search tree is built for every move.
            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model,
                                 canonical_board,
                                 to_play=1,
                                 add_exploration_noise=use_noise)

            visit_counts = [0] * self.game.get_action_size()
            for move, child in root.children.items():
                visit_counts[move] = child.visit_count
            action_probs = visit_counts / np.sum(visit_counts)
            history.append((canonical_board, current_player, action_probs))

            action = root.select_action(temperature)
            state, current_player = self.game.get_next_state(
                state, current_player, action)
            reward = self.game.get_game_ended(state, current_player)
            if reward is None:
                continue

            # [Board, actionProbabilities, Reward]
            return [(past_board, past_probs,
                     reward * ((-1)**(past_player != current_player)))
                    for past_board, past_player, past_probs in history]
Exemple #11
0
    def test_mcts_finds_best_move_with_really_bad_priors(self):
        """MCTS should visit square 1 more often than square 0.

        The stub model returns priors of 0.3 and 0.7 for squares 0 and 1
        (zero elsewhere) together with a near-zero value estimate.
        """
        class MockModel:
            def predict(self, board):
                # starting board is:
                # [0, 0, 1, -1]
                policy = np.array([0.3, 0.7, 0, 0])
                value = 0.0001
                return policy, value

        game = Connect2Game()
        search_args = {'num_simulations': 25}

        model = MockModel()
        search = MCTS(game, model, search_args)
        start_board = [0, 0, 1, -1]
        print("starting")
        root = search.run(model, start_board, to_play=1)

        # the best move is to play at index 1
        visits_at_1 = root.children[1].visit_count
        visits_at_0 = root.children[0].visit_count
        self.assertGreater(visits_at_1, visits_at_0)
    def exceute_episode(self):
        """Self-play one game through ``gogame`` and return training examples.

        For every move a new ``MCTS`` is run from the canonical form of the
        state, and the root's child visit counts (``boardSize**2 + 1``
        slots), normalised to sum to one, are recorded with the board and
        the player to move. The game stops once ``gogame.game_ended``
        reports true.

        Returns:
            A list of ``(board, action_probs, outcome)`` tuples. ``board`` is
            built from planes 0, 1 and 3 of the recorded state, transposed
            and converted to nested lists. ``outcome`` is the final reward,
            negated for positions recorded by a player other than the one to
            move when the game ended.
        """
        history = []
        current_player = 1
        board_size = self.args['boardSize']
        state = gogame.init_state(board_size)

        while True:
            canonical_board = gogame.canonical_form(state)

            # A fresh search tree is built for every move.
            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            visit_counts = [0] * ((self.args['boardSize'] *
                                   self.args['boardSize']) + 1)
            for move, child in root.children.items():
                visit_counts[move] = child.visit_count
            action_probs = visit_counts / np.sum(visit_counts)
            history.append((canonical_board, current_player, action_probs))

            action = root.select_action(temperature=1)
            state = gogame.next_state(state, action, canonical=False)
            current_player = -current_player

            if gogame.game_ended(state):
                reward = gogame.winning(state) * current_player
            else:
                reward = None
            if reward is None:
                continue

            examples = []
            for past_state, past_player, past_probs in history:
                # [Board, actionProbabilities, Reward]
                planes = [past_state[0], past_state[1], past_state[3]]
                tf_board = np.array(planes).transpose().tolist()
                outcome = reward * ((-1)**(past_player != current_player))
                examples.append((tf_board, past_probs, outcome))
            return examples
def mcts_playout(depth, num_iter, num_rollout, exploration_weight):
    """Descend a binary tree greedily with MCTS and return the leaf value.

    A tree of the given ``depth`` is built and its best- and worst-valued
    leaves are printed for reference. Then, until the current root is
    terminal, ``mcts.run`` is called ``num_iter`` times (each with
    ``num_rollout``) and the root is replaced by ``mcts.choose(root)``.

    Returns:
        The ``value`` of the terminal node that was reached.
    """
    root, leaf_nodes_dict = make_binary_tree(depth=depth)
    ranked_leaves = sorted(leaf_nodes_dict.items(),
                           key=lambda entry: entry[1],
                           reverse=True)
    top_leaf = ranked_leaves[0]
    print("Expected (max) leaf node: {}, value: {}".format(
        top_leaf[0], top_leaf[1]))
    bottom_leaf = ranked_leaves[-1]
    print("Expected (min) leaf node: {}, value: {}".format(
        bottom_leaf[0], bottom_leaf[1]))

    mcts = MCTS(exploration_weight=exploration_weight)
    while True:
        # Simulate many times from the current root ...
        for _ in range(num_iter):
            mcts.run(root, num_rollout=num_rollout)
        # ... then step to the child the search prefers.
        root = mcts.choose(root)
        if not root.is_terminal():
            continue
        print("Found optimal (max) leaf node: {}, value: {}".format(
            root, root.value))
        return root.value
Exemple #14
0
    def test_mcts_finds_best_move_with_really_really_bad_priors(self):
        """MCTS should visit square 1 more often than squares 2 and 3.

        The stub model gives squares 1, 2 and 3 the same prior (0.3) and
        square 0 none, together with a near-zero value estimate.
        """
        class MockModel:
            def predict(self, board):
                # starting board is:
                # [-1, 0, 0, 0]
                policy = np.array([0, 0.3, 0.3, 0.3])
                value = 0.0001
                return policy, value

        game = Connect2Game()
        search_args = {'num_simulations': 100}

        model = MockModel()
        search = MCTS(game, model, search_args)
        start_board = [-1, 0, 0, 0]
        root = search.run(model,
                          start_board,
                          to_play=1,
                          add_exploration_noise=False)

        # the best move is to play at index 1
        for other_square in (2, 3):
            self.assertGreater(root.children[1].visit_count,
                               root.children[other_square].visit_count)
Exemple #15
0
class MCTSAI(AI):
    """AI that picks its move by running rollouts on an ``MCTS`` tree."""

    # NOTE: created once when the class body runs, so every MCTSAI instance
    # shares this single tree.
    tree = MCTS()

    def __init__(self, name: str, nRollout: int = 5):
        """Store the number of rollouts per move, then initialise the base AI.

        Args:
            name: Forwarded to the ``AI`` constructor.
            nRollout: How many rollouts ``play`` runs before choosing.
        """
        self.nRollout: int = nRollout
        super().__init__(name)

    def play(self, state: ReversiState):
        """Run ``nRollout`` rollouts from *state* and return the tree's choice."""
        for _ in range(self.nRollout):
            self.tree.do_rollout(state)
        # The unreachable ``to_char`` lambda that used to follow this return
        # has been removed; it could never execute.
        return self.tree.choose(state)
class MCTSPlayer:
    """AI player based on MCTS"""

    def __init__(self,
                 policy_value_fn,
                 c_puct=5,
                 n_playout=2000,
                 is_selfplay=0):
        self.mcts = MCTS(policy_value_fn, c_puct, n_playout)
        self.is_selfplay = is_selfplay

    def reset_player(self):
        """Reset the search tree by passing -1 to ``update_with_move``."""
        self.mcts.update_with_move(-1)

    def get_action(self, board, temp=1e-3, return_prob=0):
        """Choose a move for *board* from the MCTS move probabilities.

        Args:
            board: Position offering ``possible_moves()``.
            temp: Temperature forwarded to ``get_move_probs``.
            return_prob: When truthy, also return the 217-slot probability
                vector filled through ``action_space``.

        Returns:
            The chosen move, or ``(move, move_probs)`` when ``return_prob``
            is truthy. ``None`` (after printing a warning) when the board
            has no possible moves.
        """
        sensible_moves = board.possible_moves()
        # the pi vector returned by MCTS as in the alphaGo Zero paper
        move_probs = np.zeros(217)
        if not sensible_moves:
            print("WARNING ! Game is over.")
            return None

        acts, probs = self.mcts.get_move_probs(board, temp)
        for position, act in enumerate(acts):
            slot = action_space[act]
            move_probs[slot] = probs[position]

        if self.is_selfplay:
            # add Dirichlet Noise for exploration (needed for
            # self-play training)
            noise = np.random.dirichlet(0.3 * np.ones(len(probs)))
            move = np.random.choice(acts, p=0.75 * probs + 0.25 * noise)
            # update the root node and reuse the search tree
            self.mcts.update_with_move(move)
        else:
            # with the default temp=1e-3, it is almost equivalent
            # to choosing the move with the highest prob
            move = np.random.choice(acts, p=probs)
            # reset the root node
            self.mcts.update_with_move(-1)

        if return_prob:
            return move, move_probs
        return move
Exemple #17
0
 def __init__(self, game, model, args):
     """Store the game, model and args, and build an MCTS from them."""
     self.game = game
     self.model = model
     self.args = args
     self.mcts = MCTS(self.game, self.model, self.args)
Exemple #18
0
class Trainer:
    """Generate self-play games with MCTS and fit the model on them."""

    def __init__(self, game, model, args):
        """Store the collaborators and build an initial search tree.

        Args:
            game: Game object used for board setup, transitions and rewards.
            model: Network passed to ``MCTS`` and optimised by ``train``.
            args: Mapping read for 'numIters', 'numEps', 'epochs',
                'batch_size' and 'checkpoint_path'.
        """
        self.game = game
        self.model = model
        self.args = args
        self.mcts = MCTS(self.game, self.model, self.args)

    def exceute_episode(self):
        """Self-play one game and return its training examples.

        For every move a new ``MCTS`` is run from the canonical board, and
        the root's child visit counts, normalised to sum to one, are recorded
        with the board and the player to move. The game stops once
        ``get_reward_for_player`` returns something other than ``None``.

        Returns:
            A list of ``(board, action_probs, outcome)`` tuples. ``outcome``
            is the final reward, negated for positions recorded by a player
            other than the one to move when the game ended.
        """
        train_examples = []
        current_player = 1
        state = self.game.get_init_board()

        while True:
            canonical_board = self.game.get_canonical_board(
                state, current_player)

            # A fresh search tree is built for every move.
            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            action_probs = [0 for _ in range(self.game.get_action_size())]
            for k, v in root.children.items():
                action_probs[k] = v.visit_count

            action_probs = action_probs / np.sum(action_probs)
            train_examples.append(
                (canonical_board, current_player, action_probs))

            action = root.select_action(temperature=0)
            state, current_player = self.game.get_next_state(
                state, current_player, action)
            reward = self.game.get_reward_for_player(state, current_player)

            if reward is not None:
                # [Board, actionProbabilities, Reward]
                return [(hist_state, hist_action_probs,
                         reward *
                         ((-1)**(hist_current_player != current_player)))
                        for hist_state, hist_current_player,
                        hist_action_probs in train_examples]

    def learn(self):
        """Run ``numIters`` rounds of self-play, training and checkpointing.

        Each round plays ``numEps`` episodes, shuffles the pooled examples,
        trains on them and saves a checkpoint named by
        ``args['checkpoint_path']`` in the current directory.
        """
        for i in range(1, self.args['numIters'] + 1):

            print("{}/{}".format(i, self.args['numIters']))

            train_examples = []

            for _ in range(self.args['numEps']):
                train_examples.extend(self.exceute_episode())

            shuffle(train_examples)
            self.train(train_examples)
            filename = self.args['checkpoint_path']
            self.save_checkpoint(folder=".", filename=filename)

    def train(self, examples):
        """Optimise the model on randomly sampled batches of *examples*.

        Every epoch runs ``len(examples) // batch_size`` batches, each drawn
        with replacement, and then prints the running mean losses plus one
        sample prediction. An epoch with no full batch prints a notice
        instead of the statistics.

        Args:
            examples: Sequence of ``(board, action_probs, value)`` tuples.
        """
        optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
        pi_losses = []
        v_losses = []

        # Put the batches on the model's own device rather than assuming
        # CUDA, so training also works on CPU-only hosts.
        device = next(self.model.parameters()).device
        batch_size = self.args['batch_size']
        num_batches = int(len(examples) / batch_size)

        for _ in range(self.args['epochs']):
            self.model.train()

            # Stay None when no batch runs, so the report below can tell.
            out_pi = None
            target_pis = None

            for _ in range(num_batches):
                sample_ids = np.random.randint(len(examples),
                                               size=batch_size)
                boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
                boards = torch.FloatTensor(np.array(boards).astype(np.float64))
                target_pis = torch.FloatTensor(np.array(pis))
                target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                # predict
                boards = boards.contiguous().to(device)
                target_pis = target_pis.contiguous().to(device)
                target_vs = target_vs.contiguous().to(device)

                # compute output
                out_pi, out_v = self.model(boards)
                l_pi = self.loss_pi(target_pis, out_pi)
                l_v = self.loss_v(target_vs, out_v)
                total_loss = l_pi + l_v

                pi_losses.append(float(l_pi))
                v_losses.append(float(l_v))

                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()

            print()
            if out_pi is None:
                # Fewer examples than one batch: there is nothing to report.
                print("No full batch available; skipped training this epoch.")
                continue
            print("Policy Loss", np.mean(pi_losses))
            print("Value Loss", np.mean(v_losses))
            print("Examples:")
            print(out_pi[0].detach())
            print(target_pis[0])

    def loss_pi(self, targets, outputs):
        """Return the batch mean of ``-sum(targets * log(outputs))`` per row.

        NOTE(review): assumes ``outputs`` holds probabilities, not
        log-probabilities - confirm against the model.
        """
        loss = -(targets * torch.log(outputs)).sum(dim=1)
        return loss.mean()

    def loss_v(self, targets, outputs):
        """Return the mean squared error between targets and flat outputs."""
        loss = torch.sum((targets - outputs.view(-1))**2) / targets.size()[0]
        return loss

    def save_checkpoint(self, folder, filename):
        """Save the model's ``state_dict`` to ``folder/filename``.

        The folder, including missing parents, is created when absent.
        """
        # makedirs with exist_ok handles nested paths and avoids the
        # check-then-create race of exists() + mkdir().
        os.makedirs(folder, exist_ok=True)

        filepath = os.path.join(folder, filename)
        torch.save({
            'state_dict': self.model.state_dict(),
        }, filepath)
class Trainer:
    """Generate self-play Go games with MCTS and fit the Keras model on them."""

    def __init__(self, game, model, args):
        self.game = game
        self.model = model
        self.args = args
        self.mcts = MCTS(self.game, self.model, self.args)

    def exceute_episode(self):
        """Self-play one game through ``gogame`` and return training examples.

        For every move a new ``MCTS`` is run from the canonical form of the
        state, and the root's child visit counts (``boardSize**2 + 1``
        slots), normalised to sum to one, are recorded with the board and
        the player to move. The game stops once ``gogame.game_ended``
        reports true.

        Returns:
            A list of ``(board, action_probs, outcome)`` tuples. ``board`` is
            built from planes 0, 1 and 3 of the recorded state, transposed
            and converted to nested lists. ``outcome`` is the final reward,
            negated for positions recorded by a player other than the one to
            move when the game ended.
        """
        history = []
        current_player = 1
        state = gogame.init_state(self.args['boardSize'])

        while True:
            canonical_board = gogame.canonical_form(state)

            # A fresh search tree is built for every move.
            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            visit_counts = [0] * ((self.args['boardSize'] *
                                   self.args['boardSize']) + 1)
            for move, child in root.children.items():
                visit_counts[move] = child.visit_count
            action_probs = visit_counts / np.sum(visit_counts)
            history.append((canonical_board, current_player, action_probs))

            action = root.select_action(temperature=1)
            state = gogame.next_state(state, action, canonical=False)
            current_player = -current_player

            if gogame.game_ended(state):
                reward = gogame.winning(state) * current_player
            else:
                reward = None
            if reward is None:
                continue

            examples = []
            for past_state, past_player, past_probs in history:
                # [Board, actionProbabilities, Reward]
                planes = [past_state[0], past_state[1], past_state[3]]
                tf_board = np.array(planes).transpose().tolist()
                outcome = reward * ((-1)**(past_player != current_player))
                examples.append((tf_board, past_probs, outcome))
            return examples

    def learn(self):
        """Run ``numIters`` rounds of ``numEps`` self-play games plus training."""
        total_iters = self.args['numIters']
        for iteration in range(1, total_iters + 1):

            print("numIters: {}/{}".format(iteration, self.args['numIters']))

            pooled_examples = []

            for episode in range(self.args['numEps']):
                print("numEps: {}/{}".format(episode, self.args['numEps']))
                pooled_examples.extend(self.exceute_episode())

            shuffle(pooled_examples)
            self.train(pooled_examples)

    def train(self, trainD):
        """Fit the Keras model on *trainD* and reload the checkpointed weights.

        Args:
            trainD: Sequence of ``(board, action_probs, value)`` tuples; the
                three fields become the input, the "action_output" target
                and the "Value_output" target respectively.
        """
        # Keep only the weights of the epoch with the lowest validation loss.
        checkpoint = keras.callbacks.ModelCheckpoint(
            self.args['checkpointPath'],
            monitor="val_loss",
            mode="min",
            save_best_only=True,
            verbose=0)

        # train the network
        print("Training network...")

        x = np.array([example[0] for example in trainD])
        policy_targets = np.array([example[1] for example in trainD])
        value_targets = np.array([example[2] for example in trainD])

        self.model.model.fit(x,
                             y={
                                 "action_output": policy_targets,
                                 "Value_output": value_targets
                             },
                             validation_split=0.2,
                             batch_size=self.args['batchSize'],
                             epochs=self.args['epochs'],
                             verbose=1,
                             callbacks=[checkpoint])

        # Reload the weights saved by the checkpoint callback.
        self.model.model.load_weights(self.args['checkpointPath'])
Exemple #20
0
 def __init__(self, _id):
     """Initialise the parent with ``_id`` and create this player's MCTS tree."""
     super().__init__(_id)
     # No opponent is known at construction time.
     self.opponent_id = None
     self.tree = MCTS()