Example #1
    def select_best_move(self, stats, depth, board, color):
        """Select the best move at the end of the Monte Carlo tree search"""

        bestscore = 0
        bestmove = None
        total_n = 0

        for action in SP.available_actions(board):
            next_board = board[:]
            SP.play(next_board, action, color)
            n, w = stats[MCTSRandomPlayer.to_board_id(next_board)]
            if n == 0:
                continue
            total_n += n
            if self.DEBUG:
                print('Move %d score: %d/%d (%0.1f%%)' %
                      (action, w, n, w / n * 100))
            # follow the most-visited move (robust child). WHY???
            if n > bestscore or (n == bestscore and random.random() <= 0.5):
                bestmove = action
                bestscore = n

        assert bestmove is not None

        if self.DEBUG:
            print('Maximum depth: %d, Total simulations: %d on %d' %
                  (depth, total_n, MCTSRandomPlayer.to_board_id(board)))

        return bestmove
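A minimal sketch of how this method might be driven, assuming the `search` method from Example #4 lives on the same class; the wrapper name `decide_move` and its default parameters are illustrative assumptions, not part of the original code.

    def decide_move(self, board, color, simulations=1000, C=1.4):
        # assumed wrapper: run the MCTS to fill the stats table,
        # then pick the most-visited child with select_best_move
        stats, depth = self.search(board, color, simulations, C)
        return self.select_best_move(stats, depth, board, color)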
Example #2
    def negamax(self, state, color, depth=10):
        ''' implement the negamax algorithm
        https://en.wikipedia.org/wiki/Negamax
        '''
        # negamax.counter += 1

        # CHECK LEAF NODE / DO NOT NEED TO CHECK DEPTH == 0 BECAUSE TicTacToe is too small
        # LEAF NODE is checked on play time

        # Transposition Table related work
        # ob = OptimalBoard(state)
        # _id = ob.board_id
        _id = OptimalBoard.board_to_id(state)

        cache = self.tp.get(_id)
        if cache is not None:  # BUG FIX: a cached score can be 0, so test against None explicitly
            # case 1
            # return cache
            # case 2
            return cache[0], random.choice(cache[1])

        # RECURSIVE
        actions = SP.available_actions(state)
        # move ordering improves alpha-beta pruning performance
        random.shuffle(actions)
        best_score = -math.inf
        best_actions = []
        for action in actions:
            next_s = state[:]
            score, done = SP.play(next_s, action, color)
            if not done:
                score, _ = self.negamax(next_s, SP.next(color), depth - 1)
                score = -score  # negamax

            # pick from all best moves
            if score > best_score:
                best_score = score
                best_actions = [action]
            elif score == best_score:
                best_actions.append(action)

        # case 1: choose random value 1 time
        # choosed_result = random.choice(best_scores)
        # tp.put(_id, choosed_result)
        # return choosed_result

        # case 2: choose random value every time
        self.tp.put(_id, (best_score, best_actions))
        return (best_score, random.choice(best_actions))
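This method assumes a transposition table `self.tp` exposing `get` and `put`; a minimal dictionary-backed sketch that satisfies those two calls (the class name SimpleTT is an assumption) could look like this.

class SimpleTT:
    """Minimal transposition table: a plain dict keyed by board id."""

    def __init__(self):
        self._table = {}

    def get(self, key):
        # returns None on a miss, which is why the caller checks `is not None`
        return self._table.get(key)

    def put(self, key, value):
        self._table[key] = value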
Example #3
def find_next(board, color, seq):
    actions = SP.available_actions(board)

    for action in actions:
        new_board = board[:]
        reward, done = SP.play(new_board, action, color)
        if done:
            # print it?
            if reward == 0:
                print(seq + str(action), '=', OB.board_to_id(new_board))
            else:
                print(seq + str(action), MARKER[color],
                      OB.board_to_id(new_board))
        else:
            find_next(new_board, SP.next(color), seq + str(action))
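A possible invocation of the enumerator above, assuming a board encoded as a list of nine cells with 0 marking an empty square and colors 1/2 for the two players; the exact encoding used by SP and OB is not shown here, so treat these values as assumptions.

if __name__ == '__main__':
    empty_board = [0] * 9          # assumed encoding: 0 = empty cell
    find_next(empty_board, 1, '')  # enumerate every game, player 1 moves first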
Example #4
    def search(self, board, start_color, simulations, C):
        ''' implement the Monte Carlo tree search algorithm
        '''
        # CHECK LEAF NODE / DO NOT NEED TO CHECK DEPTH == 0 BECAUSE TicTacToe is too small
        # LEAF NODE is checked on play time

        stats = self._stats
        root = board
        max_depth = 0

        for _ in range(simulations):
            node = root[:]
            states = []

            # select leaf node
            depth = 0
            done = False
            color = start_color
            while not done:
                depth += 1
                action, select = self.select_next_move(stats, node, color, C)

                reward, done = SP.play(node, action, color)
                color = SP.next(color)

                states.append(MCTSRandomPlayer.to_board_id(node))

                if not select:
                    break

            max_depth = max(depth, max_depth)

            # run simulation if not at the end of the game tree
            if not done:
                result = self.simulate(node, start_color)  # TODO: decide how to handle this
            else:
                if reward == 0:
                    result = 0.5
                else:
                    result = 0

            # propagate results
            for state in reversed(states):
                result = 1 - result
                stats[state][0] += 1
                stats[state][1] += result

        return stats, max_depth
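The loop above reads and writes `self._stats[board_id]` as a mutable [visits, wins] pair, so every unseen board id must start at [0, 0]. One plausible initialization (the constructor is not shown, so this is an assumption) uses a defaultdict.

    def __init__(self):
        from collections import defaultdict
        # every unseen board id starts with 0 visits and 0.0 accumulated result
        self._stats = defaultdict(lambda: [0, 0.0])
        self.DEBUG = False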
Example #5
    def simulate(self, board, start_color):
        # random simulator (light playouts)
        node = board[:]
        done = False
        color = start_color
        while not done:
            actions = SP.available_actions(node)

            reward, done = SP.play(node, random.choice(actions), color)
            color = SP.next(color)

        if reward == 0:  # TIE
            return 0.5
        elif color == start_color:
            return 1
        else:
            return 0
Example #6
    def select_next_move(self, stats, board, color, C):
        """Select the next state and consider if it should be expanded (UCT)"""

        bestscore = None
        bestmove = None

        # my_id = MCTSRandomPlayer.to_board_id(board)

        children = []
        for action in SP.available_actions(board):
            # clone and play mode - can be play and rollback mode
            next_board = board[:]
            SP.play(next_board, action, color)
            children.append(
                (action, stats[MCTSRandomPlayer.to_board_id(next_board)]))

        total_n = sum(x[0] for (_, x) in children)

        for child_move, child_stat in children:
            n, w = child_stat
            if n == 0:  # never visited yet, so try it first!
                return child_move, False
            else:  # favor a high win rate (exploitation) and rarely visited nodes (exploration)
                score = (w / n) + C * math.sqrt(2 * math.log(total_n) / n)
                # if my_id == 70645:
                #     print("CHECK IN ", my_id, child_move, w, n, score, bestscore, next_id)
                # if next_id == 119797:
                #     print("JUMP IN ", my_id, child_move, w, n, score, bestscore, next_id)
                if bestscore is None or score > bestscore:
                    bestscore = score
                    bestmove = child_move

        # if my_id == 70645:
        #     print("SELECTED", bestmove, bestscore)

        assert bestmove is not None
        return bestmove, True
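The `score` above is the UCB1 formula, w/n + C * sqrt(2 * ln(N) / n), with N the total visit count over all siblings. A small numeric check with made-up values shows how a rarely visited child can outrank a better-scoring but well-explored one.

import math

# illustrative numbers only: two children of the same node
n1, w1 = 10, 7.0   # visited 10 times, 7.0 accumulated wins
n2, w2 = 2, 1.0    # visited twice, 1.0 accumulated wins
total_n = n1 + n2
C = 1.0

score1 = w1 / n1 + C * math.sqrt(2 * math.log(total_n) / n1)  # ~1.4
score2 = w2 / n2 + C * math.sqrt(2 * math.log(total_n) / n2)  # ~2.1
# the less-visited child scores higher, so exploration wins this round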
Example #7
    def negamax_alpha_beta_pruning(self,
                                   state,
                                   color,
                                   alpha=-math.inf,
                                   beta=math.inf,
                                   depth=10):
        ''' implement the negamax algorithm with alpha-beta pruning
        https://en.wikipedia.org/wiki/Negamax
        '''
        # negamax.counter += 1

        # CHECK LEAF NODE / DO NOT NEED TO CHECK DEPTH == 0 BECAUSE TicTacToe is too small
        # LEAF NODE is checked on play time

        orig_alpha = alpha

        # Transposition Table related work
        # ob = OptimalBoard(state)
        # _id = ob.board_id
        _id = OptimalBoard.board_to_id(state)
        cache = self.tp.get(_id)
        if cache and cache['depth'] >= depth:
            (cached_score, cached_action) = cache['value']
            if cache['flag'] == self.tp.EXACT:
                return (cached_score, cached_action)
            elif cache['flag'] == self.tp.LOWERBOUND:
                alpha = max(alpha, cached_score)
            elif cache['flag'] == self.tp.UPPERBOUND:
                beta = min(beta, cached_score)
            if alpha >= beta:
                return cached_score, cached_action
        # else:
        #     print("MISS", t.seq)

        # RECURSIVE
        actions = SP.available_actions(state)
        # move ordering improves alpha-beta pruning performance
        random.shuffle(actions)
        best_score = -math.inf
        best_move = -1
        for action in actions:
            next_s = state[:]
            score, done = SP.play(next_s, action, color)
            if not done:
                score, _ = self.negamax_alpha_beta_pruning(next_s,
                                                           SP.next(color),
                                                           alpha=-beta,
                                                           beta=-alpha,
                                                           depth=depth - 1)
                score = -score  # negamax

            # keep only one best move (random.shuffle provides the randomness)
            if score > best_score or (score == best_score and random.random() < 0.5):
                best_score = score
                best_move = action

            if alpha < score:
                alpha = score
                # effectively alpha = max(alpha, best_score)
                if alpha > beta:
                    break

        if best_score <= orig_alpha:
            flag = self.tp.UPPERBOUND
        elif best_score >= beta:
            flag = self.tp.LOWERBOUND
        else:
            flag = self.tp.EXACT

        self.tp.put(key=_id,
                    depth=depth,
                    value=(best_score, best_move),
                    flag=flag)

        return (alpha, best_move)
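Unlike Example #2, this variant assumes transposition-table entries that carry a search depth and a bound flag. A minimal dictionary-backed sketch consistent with the get/put calls above (the class name FlaggedTT is an assumption):

class FlaggedTT:
    """Transposition table whose entries record search depth and a bound flag."""

    EXACT, LOWERBOUND, UPPERBOUND = 0, 1, 2

    def __init__(self):
        self._table = {}

    def get(self, key):
        # returns the stored entry dict, or None on a miss
        return self._table.get(key)

    def put(self, key, depth, value, flag):
        # prefer results from deeper searches of the same position
        entry = self._table.get(key)
        if entry is None or entry['depth'] <= depth:
            self._table[key] = {'depth': depth, 'value': value, 'flag': flag}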