Example #1
    def sim_once(self, s0):
        # Descend the search tree from the root on a copy of the start
        # state, following the tree policy until a leaf is reached.
        s = copy.deepcopy(s0)
        node = self._root
        while True:
            legal_states, who, legal_moves = Game.possible_moves(s)
            if len(legal_states) == 0:
                return None, None  # terminal position: nothing to simulate

            if node.is_leaf():
                return node, s
            else:
                move, node = node.select()
                s = self.make_a_move(s, move, who)
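
Every example here unpacks Game.possible_moves into three values: the list of successor states, the player to move, and the flat indices of the legal moves. The class itself is not shown, so the following is only a sketch of that assumed contract, using a hypothetical State dataclass in place of the project's real board type:

    import numpy as np
    from dataclasses import dataclass

    # Hypothetical stand-in for the project's board; only here to pin down
    # the shapes the examples above rely on.
    @dataclass
    class State:
        stones: np.ndarray   # flat board array, 0 = empty, 1/2 = the players
        player: int          # player to move

    def possible_moves(state):
        # Assumed contract: (successor states, player to move, legal move indices).
        legal_moves = np.flatnonzero(state.stones == 0)
        legal_states = []
        for m in legal_moves:
            stones = state.stones.copy()
            stones[m] = state.player
            legal_states.append(State(stones, 3 - state.player))
        return legal_states, state.player, legal_moves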
Example #2
    def _evaluate_rollout(self, state, limit):
        # Play the position out with the fast rollout policy for at most
        # `limit` moves and score the outcome for the player to move.
        winner = 0
        player = None
        for i in range(limit):
            legal_states, p, legal_moves = Game.possible_moves(state)
            if player is None:
                player = p  # remember whose turn it was at the leaf
            if len(legal_states) == 0:
                break

            # Copy the rollout probabilities into a mask that is slightly
            # negative everywhere else, so argmax can only pick a legal move.
            probs = self._rollout(state, legal_moves)
            mask = np.full_like(probs, -0.01)
            mask[:, legal_moves] = probs[:, legal_moves]
            probs = mask

            best_move = np.argmax(probs, 1)[0]

            idx = np.where(legal_moves == best_move)[0]
            assert idx.size == 1
            st1 = legal_states[idx[0]]

            over, winner, last_loc = st1.is_over(state)
            if over:
                break

            state = st1
        else:
            # The loop ran to the move limit without a break.
            print("WARNING: rollout reached move limit")

        if winner == 0:
            return 0  # draw, or the game never finished
        return 1 if winner == player else -1
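
The masking step above is the interesting part: instead of indexing the probabilities directly, the code fills a same-shaped array with a small negative constant and copies in only the legal entries, so np.argmax over the full row can never land on an illegal move. A standalone illustration with made-up numbers:

    import numpy as np

    probs = np.array([[0.1, 0.4, 0.3, 0.2]])   # network output over 4 moves
    legal_moves = np.array([0, 2, 3])          # move 1 is occupied

    mask = np.full_like(probs, -0.01)          # illegal entries stay negative
    mask[:, legal_moves] = probs[:, legal_moves]

    best_move = np.argmax(mask, 1)[0]
    print(best_move)                           # 2: move 1 was masked out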
Example #3
    def _playout(self, state, leaf_depth):
        node = self._root

        # Selection/expansion: descend for at most `leaf_depth` plies,
        # expanding a leaf with the policy network's move priors.
        for i in range(leaf_depth):
            legal_states, _, legal_moves = Game.possible_moves(state)
            if len(legal_states) == 0:
                break
            if node.is_leaf():
                action_probs = self._policy(state)
                if len(action_probs) == 0:
                    break
                node.expand(action_probs)

            best_move, node = node.select()
            idx = np.where(legal_moves == best_move)[0]
            if idx.size == 0:
                # Diagnostics: the selected move is not legal in this state.
                print('depth:', i, idx)
                print('best move:', best_move)
                p = node.parent
                for a, s1 in p.children.items():
                    print('  ', a, s1.get_value())

            assert idx.size == 1
            state = legal_states[idx[0]]

        # Evaluation: blend the value network (v) with a rollout result (z).
        v = self._value(state) if self._lmbda < 1 else 0
        z = self._evaluate_rollout(state, self._rollout_limit) if self._lmbda > 0 else 0
        leaf_value = (1 - self._lmbda) * v + self._lmbda * z

        # Backup: propagate the leaf value along the visited path.
        node.update_recursive(leaf_value, self._c_puct)
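
The leaf evaluation follows the mixed backup from the original AlphaGo paper: the value-network estimate v and the rollout outcome z are combined as (1 - lambda) * v + lambda * z, so lambda = 0 trusts the value network alone and lambda = 1 trusts rollouts alone; the guards on self._lmbda just skip whichever term would be multiplied by zero. A toy check of the endpoints:

    def leaf_value(v, z, lmbda):
        # Mixed evaluation: lmbda = 0 -> value net only, lmbda = 1 -> rollout only.
        return (1 - lmbda) * v + lmbda * z

    print(leaf_value(0.5, -1, 0.0))   # 0.5   (value network only)
    print(leaf_value(0.5, -1, 1.0))   # -1.0  (rollout only)
    print(leaf_value(0.5, -1, 0.5))   # -0.25 (even blend)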
Example #4
    def sim(self, board):
        # Play one simulated game greedily, then retrain the evaluator
        # on every (state, move) pair along the visited path.
        visited_path = []
        state = board
        winner = Board.STONE_EMPTY
        for _ in range(self.max_moves):
            moves, player, _ = Game.possible_moves(state)
            state_new, state_new_val = self.get_best(state, moves, player)
            visited_path.append((player, state, state_new, state_new_val))
            over, winner, _ = state_new.is_over(state)
            if over:
                break
            state = state_new

        self.total_sim += 1

        # Turn the network's predicted (win, play) rates back into counts,
        # credit this simulation, and train on the updated targets.
        ds = SupervisedDataSet(self.features_num, 2)
        for player, state, new, val in visited_path:
            plays = val[1] * self.total_sim + 1
            wins = val[0] * self.total_sim
            if player == winner:
                wins += 1
            ds.addSample(self.get_input_values(state, new, player), (wins, plays))
        self.trainer.trainOnDataset(ds)
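
The training targets reconstruct running counts from the network's previous outputs: if the net predicts rates of the form (wins / total_sim, plays / total_sim), then multiplying by the new total_sim recovers approximate counts, which are incremented for the simulation just played. That interpretation is an assumption here, since get_best and the network head are not shown; under it, the update is:

    def updated_targets(win_rate, play_rate, total_sim, player_won):
        # Recover approximate counts from the predicted rates, then credit
        # this simulation with one play (and one win if the player won).
        plays = play_rate * total_sim + 1
        wins = win_rate * total_sim + (1 if player_won else 0)
        return wins, plays

    print(updated_targets(0.5, 1.0, 10, True))   # (6.0, 11.0)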
Example #5
    def _policy_fn(self, board):
        # Return (move, probability) pairs restricted to the legal moves.
        _, _, legal_moves = Game.possible_moves(board)
        state, _ = self.get_input_values(board.stones)
        probs = self.brain.get_move_probs(state)
        probs = probs[0, legal_moves]
        return list(zip(legal_moves, probs))
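
The list of (move, probability) pairs returned here is the shape that node.expand(action_probs) consumes in the _playout example above. A minimal illustration with a made-up probability row:

    import numpy as np

    legal_moves = np.array([3, 7, 11])
    probs = np.array([[0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.5,
                       0.0, 0.0, 0.0, 0.3]])   # stand-in network output

    action_probs = list(zip(legal_moves.tolist(), probs[0, legal_moves].tolist()))
    print(action_probs)   # [(3, 0.2), (7, 0.5), (11, 0.3)]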