Example #1
    def __init__(self,
                 config: Config,
                 model,
                 play_config=None,
                 enable_resign=True):
        """

        :param config:
        :param reversi_zero.agent.model.ReversiModel model:
        """
        self.config = config
        self.model = model
        self.play_config = play_config or self.config.play
        self.enable_resign = enable_resign
        self.api = ReversiModelAPI(self.config, self.model)

        # key=(own, enemy, action)
        self.var_n = defaultdict(lambda: np.zeros((64, )))
        self.var_w = defaultdict(lambda: np.zeros((64, )))
        self.var_q = defaultdict(lambda: np.zeros((64, )))
        self.var_p = defaultdict(lambda: np.zeros((64, )))
        self.expanded = set()
        self.now_expanding = set()
        self.prediction_queue = Queue(self.play_config.prediction_queue_size)
        self.sem = asyncio.Semaphore(self.play_config.parallel_search_num)

        self.moves = []
        self.loop = asyncio.get_event_loop()
        self.running_simulation_num = 0

        self.thinking_history = {}  # for fun
        self.resigned = False
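
A note on the statistics tables above: each defaultdict maps a board-state key to a 64-element array indexed by action, so N/W/Q/P are effectively tables over (state, action). CounterKey itself is not part of this snippet; judging from its use in Example #4 (CounterKey(env.board.black, env.board.white, env.next_player.value)), it is presumably a small named tuple, as in the hypothetical sketch below.

# Hypothetical sketch (not from the snippet): how the state key and the
# per-state statistics arrays fit together.
from collections import defaultdict, namedtuple

import numpy as np

CounterKey = namedtuple("CounterKey", ["black", "white", "next_player"])  # assumed shape of the key

var_n = defaultdict(lambda: np.zeros((64,)))  # visit counts N(s, a)
key = CounterKey(black=0x0000000810000000, white=0x0000001008000000, next_player=1)  # arbitrary example values
var_n[key][19] += 1                           # one visit to action 19 in state `key`
print(var_n[key].sum())                       # -> 1.0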
Example #2
    def __init__(self,
                 config: Config,
                 model,
                 play_config=None,
                 enable_resign=True,
                 mtcs_info=None):
        """

        :param config:
        :param reversi_zero.agent.model.ReversiModel model:
        :param MCTSInfo mtcs_info:
        """
        self.config = config
        self.model = model
        self.play_config = play_config or self.config.play
        self.enable_resign = enable_resign
        self.api = ReversiModelAPI(self.config, self.model)

        # key=(own, enemy, action)
        mtcs_info = mtcs_info or self.create_mtcs_info()
        self.var_n, self.var_w, self.var_p = mtcs_info

        self.expanded = set()
        self.now_expanding = set()
        self.prediction_queue = Queue(self.play_config.prediction_queue_size)
        self.sem = asyncio.Semaphore(self.play_config.parallel_search_num)

        self.moves = []
        self.loop = asyncio.get_event_loop()
        self.running_simulation_num = 0
        self.callback_in_mtcs = None

        self.thinking_history = {}  # for fun
        self.resigned = False
        self.requested_stop_thinking = False
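
The main difference from Example #1 is that the search statistics can be shared between player instances through mtcs_info (note that var_q is no longer kept as a table here). create_mtcs_info is not included in the snippet; given the three-way unpacking above, a minimal sketch might look like this (hypothetical, for illustration only):

# Hypothetical sketch of create_mtcs_info(), consistent with
# "self.var_n, self.var_w, self.var_p = mtcs_info" above.
from collections import defaultdict

import numpy as np

def create_mtcs_info():
    # one 64-slot array per board state: N (visit counts), W (total value), P (priors)
    return (defaultdict(lambda: np.zeros((64,))),
            defaultdict(lambda: np.zeros((64,))),
            defaultdict(lambda: np.zeros((64,))))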
Example #3
    def __init__(self, make_sim_env_fn, config=None, play_config=None, model=None):
        self.make_sim_env_fn = make_sim_env_fn
        self.config = config
        self.play_config = play_config or self.config.play
        self.root_node = Node(self.play_config.c_puct)
        self.model = model
        self.prediction_queue = Queue(self.play_config.prediction_queue_size)
        self.api = ReversiModelAPI(self.config, self.model)

        self.loop = asyncio.get_event_loop()
        self.running_simulation_num = 0
        self.expanding_nodes = set()
        self.locker = asyncio.Lock()
Example #4
class ReversiPlayer:
    def __init__(self,
                 config: Config,
                 model,
                 play_config=None,
                 enable_resign=True):
        """

        :param config:
        :param reversi_zero.agent.model.ReversiModel model:
        """
        self.config = config
        self.model = model
        self.play_config = play_config or self.config.play
        self.enable_resign = enable_resign
        self.api = ReversiModelAPI(self.config, self.model)

        # key=(own, enemy, action)
        self.var_n = defaultdict(lambda: np.zeros((64, )))
        self.var_w = defaultdict(lambda: np.zeros((64, )))
        self.var_q = defaultdict(lambda: np.zeros((64, )))
        self.var_p = defaultdict(lambda: np.zeros((64, )))
        self.expanded = set()
        self.now_expanding = set()
        self.prediction_queue = Queue(self.play_config.prediction_queue_size)
        self.sem = asyncio.Semaphore(self.play_config.parallel_search_num)

        self.moves = []
        self.loop = asyncio.get_event_loop()
        self.running_simulation_num = 0

        self.thinking_history = {}  # for fun
        self.resigned = False

    def action(self, own, enemy):
        """

        :param own: BitBoard
        :param enemy:  BitBoard
        :return: action: move pos=0 ~ 63 (0=top left, 7 top right, 63 bottom right)
        """
        env = ReversiEnv().update(own, enemy, Player.black)
        key = self.counter_key(env)

        for tl in range(self.play_config.thinking_loop):
            if tl > 0 and self.play_config.logging_thinking:
                logger.debug(
                    f"continue thinking: policy move=({action % 8}, {action // 8}), "
                    f"value move=({action_by_value % 8}, {action_by_value // 8})"
                )
            self.search_moves(own, enemy)
            policy = self.calc_policy(own, enemy)
            action = int(np.random.choice(range(64), p=policy))
            action_by_value = int(
                np.argmax(self.var_q[key] + (self.var_n[key] > 0) * 100))
            if action == action_by_value or env.turn < self.play_config.change_tau_turn or env.turn <= 1:
                break

        # this is for play_gui, not necessary when training.
        next_key = self.get_next_key(own, enemy, action)
        self.thinking_history[(own, enemy)] = HistoryItem(
            action, policy, list(self.var_q[key]), list(self.var_n[key]),
            list(self.var_q[next_key]), list(self.var_n[next_key]))

        if (self.play_config.resign_threshold is not None and
                np.max(self.var_q[key] - (self.var_n[key] == 0) * 10) <= self.play_config.resign_threshold):
            self.resigned = True
            if self.enable_resign:
                return None  # means resign

        saved_policy = self.calc_policy_by_tau_1(
            key) if self.config.play_data.save_policy_of_tau_1 else policy
        self.add_data_to_move_buffer_with_8_symmetries(own, enemy,
                                                       saved_policy)
        return action

    def add_data_to_move_buffer_with_8_symmetries(self, own, enemy, policy):
        for flip in [False, True]:
            for rot_right in range(4):
                own_saved, enemy_saved = own, enemy
                policy_saved = policy.reshape((8, 8))
                if flip:
                    own_saved = flip_vertical(own_saved)
                    enemy_saved = flip_vertical(enemy_saved)
                    policy_saved = np.flipud(policy_saved)
                if rot_right:
                    for _ in range(rot_right):
                        own_saved = rotate90(own_saved)
                        enemy_saved = rotate90(enemy_saved)
                    policy_saved = np.rot90(policy_saved, k=-rot_right)
                self.moves.append([(own_saved, enemy_saved),
                                   list(policy_saved.reshape((64, )))])

    def get_next_key(self, own, enemy, action):
        env = ReversiEnv().update(own, enemy, Player.black)
        env.step(action)
        return self.counter_key(env)

    def ask_thought_about(self, own, enemy) -> HistoryItem:
        return self.thinking_history.get((own, enemy))

    def search_moves(self, own, enemy):
        loop = self.loop
        self.running_simulation_num = 0

        coroutine_list = []
        for it in range(self.play_config.simulation_num_per_move):
            cor = self.start_search_my_move(own, enemy)
            coroutine_list.append(cor)

        coroutine_list.append(self.prediction_worker())
        loop.run_until_complete(asyncio.gather(*coroutine_list))

    async def start_search_my_move(self, own, enemy):
        self.running_simulation_num += 1
        with await self.sem:  # reduce parallel search number
            env = ReversiEnv().update(own, enemy, Player.black)
            leaf_v = await self.search_my_move(env, is_root_node=True)
            self.running_simulation_num -= 1
            return leaf_v

    async def search_my_move(self, env: ReversiEnv, is_root_node=False):
        """

        Q, V is value for this Player(always black).
        P is value for the player of next_player (black or white)
        :param env:
        :param is_root_node:
        :return:
        """
        if env.done:
            if env.winner == Winner.black:
                return 1
            elif env.winner == Winner.white:
                return -1
            else:
                return 0

        key = self.counter_key(env)

        while key in self.now_expanding:
            await asyncio.sleep(self.config.play.wait_for_expanding_sleep_sec)

        # is leaf?
        if key not in self.expanded:  # reach leaf node
            leaf_v = await self.expand_and_evaluate(env)
            if env.next_player == Player.black:
                return leaf_v  # Value for black
            else:
                return -leaf_v  # Value for white == -Value for black

        action_t = self.select_action_q_and_u(env, is_root_node)
        _, _ = env.step(action_t)

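        # Virtual loss: while this simulation is still in flight, temporarily count the chosen
        # move as a visit with a loss, so concurrent simulations are discouraged from taking
        # the same path. It is reverted in the update below once leaf_v comes back.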
        virtual_loss = self.config.play.virtual_loss
        virtual_loss_for_w = virtual_loss if env.next_player == Player.black else -virtual_loss
        self.var_n[key][action_t] += virtual_loss
        self.var_w[key][action_t] -= virtual_loss_for_w
        self.var_q[key][action_t] = self.var_w[key][action_t] / self.var_n[key][action_t]
        leaf_v = await self.search_my_move(env)  # next move

        # on returning search path
        # update: N, W, Q
        n = self.var_n[key][action_t] = self.var_n[key][action_t] - virtual_loss + 1
        w = self.var_w[key][action_t] = self.var_w[key][action_t] + virtual_loss_for_w + leaf_v
        self.var_q[key][action_t] = w / n
        return leaf_v

    async def expand_and_evaluate(self, env):
        """expand new leaf

        update var_p, return leaf_v

        :param ReversiEnv env:
        :return: leaf_v
        """

        key = self.counter_key(env)
        self.now_expanding.add(key)

        black, white = env.board.black, env.board.white

        # (di(p), v) = fθ(di(sL))
        # rotation and flip. flip -> rot.
        is_flip_vertical = random() < 0.5
        rotate_right_num = int(random() * 4)
        if is_flip_vertical:
            black, white = flip_vertical(black), flip_vertical(white)
        for i in range(rotate_right_num):
            # rotate90: rotate the bitboard RIGHT once
            black, white = rotate90(black), rotate90(white)

        black_ary = bit_to_array(black, 64).reshape((8, 8))
        white_ary = bit_to_array(white, 64).reshape((8, 8))
        state = [
            black_ary, white_ary
        ] if env.next_player == Player.black else [white_ary, black_ary]
        future = await self.predict(np.array(state))  # type: Future
        await future
        leaf_p, leaf_v = future.result()

        # reverse rotate and flip about leaf_p
        if rotate_right_num > 0 or is_flip_vertical:  # reverse rotation and flip. rot -> flip.
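            # The inverse is applied in the reverse order of the forward transform above
            # (flip then rotate on the bitboards, so un-rotate then un-flip on the policy);
            # np.rot90 with positive k rotates LEFT, undoing the k RIGHT rotations of rotate90.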
            leaf_p = leaf_p.reshape((8, 8))
            if rotate_right_num > 0:
                leaf_p = np.rot90(
                    leaf_p,
                    k=rotate_right_num)  # rot90: rotate matrix LEFT k times
            if is_flip_vertical:
                leaf_p = np.flipud(leaf_p)
            leaf_p = leaf_p.reshape((64, ))

        self.var_p[key] = leaf_p  # P is value for next_player (black or white)
        self.expanded.add(key)
        self.now_expanding.remove(key)
        return float(leaf_v)

    async def prediction_worker(self):
        """For better performance, queueing prediction requests and predict together in this worker.

        speed up about 45sec -> 15sec for example.
        :return:
        """
        q = self.prediction_queue
        margin = 10  # avoid finishing before the other searches have started.
        while self.running_simulation_num > 0 or margin > 0:
            if q.empty():
                if margin > 0:
                    margin -= 1
                await asyncio.sleep(
                    self.config.play.prediction_worker_sleep_sec)
                continue
            item_list = [q.get_nowait()
                         for _ in range(q.qsize())]  # type: list[QueueItem]
            # logger.debug(f"predicting {len(item_list)} items")
            data = np.array([x.state for x in item_list])
            policy_ary, value_ary = self.api.predict(data)
            for p, v, item in zip(policy_ary, value_ary, item_list):
                item.future.set_result((p, v))

    async def predict(self, x):
        future = self.loop.create_future()
        item = QueueItem(x, future)
        await self.prediction_queue.put(item)
        return future

    def finish_game(self, z):
        """

        :param z: win=1, lose=-1, draw=0
        :return:
        """
        for move in self.moves:  # add this game winner result to all past moves.
            move += [z]

    def calc_policy(self, own, enemy):
        """calc π(a|s0)

        :param own:
        :param enemy:
        :return:
        """
        pc = self.play_config
        env = ReversiEnv().update(own, enemy, Player.black)
        key = self.counter_key(env)
        if env.turn < pc.change_tau_turn:
            return self.calc_policy_by_tau_1(key)
        else:
            action = np.argmax(self.var_n[key])  # tau = 0
            ret = np.zeros(64)
            ret[action] = 1
            return ret

    def calc_policy_by_tau_1(self, key):
        return self.var_n[key] / np.sum(self.var_n[key])  # tau = 1

    @staticmethod
    def counter_key(env: ReversiEnv):
        return CounterKey(env.board.black, env.board.white,
                          env.next_player.value)

    def select_action_q_and_u(self, env, is_root_node):
        key = self.counter_key(env)
        if env.next_player == Player.black:
            legal_moves = find_correct_moves(key.black, key.white)
        else:
            legal_moves = find_correct_moves(key.white, key.black)
        # noinspection PyUnresolvedReferences
        xx_ = np.sqrt(np.sum(self.var_n[key]))  # sqrt of sum(N(s, b) for all b)
        xx_ = max(xx_, 1)  # avoid u_=0 if N is all 0
        p_ = self.var_p[key]

        if is_root_node:  # at the root, mix in exploration noise: (1 - eps) * p + eps * Dir(alpha)
            p_ = (1 - self.play_config.noise_eps) * p_ + \
                 self.play_config.noise_eps * np.random.dirichlet([self.play_config.dirichlet_alpha] * 64)

        # re-normalize in legal moves
        p_ = p_ * bit_to_array(legal_moves, 64)
        if np.sum(p_) > 0:
            p_ = p_ / np.sum(p_)

        u_ = self.play_config.c_puct * p_ * xx_ / (1 + self.var_n[key])
        if env.next_player == Player.black:
            v_ = (self.var_q[key] + u_ + 1000) * bit_to_array(legal_moves, 64)
        else:
            # When it is the opponent's turn to move, flip the sign of the Q-value.
            v_ = (-self.var_q[key] + u_ + 1000) * bit_to_array(legal_moves, 64)

        # noinspection PyTypeChecker
        action_t = int(np.argmax(v_))
        return action_t
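
select_action_q_and_u above implements the PUCT selection rule: U(s, a) = c_puct * P(s, a) * sqrt(sum_b N(s, b)) / (1 + N(s, a)), and the search descends into the legal move maximizing Q + U (with Q negated on the opponent's turn and illegal moves masked out). Below is a small self-contained illustration with made-up numbers; all names are hypothetical and not part of the snippet.

# Illustrative PUCT computation with made-up statistics (hypothetical values).
import numpy as np

c_puct = 1.0
n = np.array([10.0, 2.0, 0.0, 0.0])  # N(s, a): visit counts of four candidate moves
w = np.array([6.0, 1.5, 0.0, 0.0])   # W(s, a): accumulated leaf values
p = np.array([0.4, 0.3, 0.2, 0.1])   # P(s, a): network priors
q = np.divide(w, n, out=np.zeros_like(w), where=n > 0)  # Q(s, a) = W / N, 0 when unvisited

u = c_puct * p * np.sqrt(max(np.sum(n), 1)) / (1 + n)
print(int(np.argmax(q + u)))  # index of the move the search would descend into (here: 1)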
Example #5
class GameTree(object):
    def __init__(self, make_sim_env_fn, config=None, play_config=None, model=None):
        self.make_sim_env_fn = make_sim_env_fn
        self.config = config
        self.play_config = play_config or self.config.play
        self.root_node = Node(self.play_config.c_puct)
        self.model = model
        self.prediction_queue = Queue(self.play_config.prediction_queue_size)
        self.api = ReversiModelAPI(self.config, self.model)

        self.loop = asyncio.get_event_loop()
        self.running_simulation_num = 0
        self.expanding_nodes = set()
        self.locker = asyncio.Lock()

    def expand_root(self, root_env, dir_noise=None):
        ps, vs = self.api.predict(np.asarray(root_env.observation), np.asarray(root_env.legal_moves))
        self.root_node.expand_and_evaluate(ps, vs, root_env.legal_moves)
        if dir_noise:
            self.root_node.add_dirichlet_noise(self.play_config.noise_eps, self.play_config.dirichlet_alpha)

    def mcts_and_play(self, tau):
        self.mcts()
        return self.play(tau)

    def keep_only_subtree(self, action):
        root_node = self.root_node.child_by_value(action)
        if root_node is None:
            root_node = self.root_node
        assert root_node is not None, f'root node has {len(self.root_node.children)} children, action={action}'
        self.root_node = root_node

    def mcts(self):
        self.running_simulation_num = self.play_config.simulation_num_per_move
        coroutine_list = []
        start_time = time.time()
        for it in range(self.play_config.simulation_num_per_move):
            coroutine_list.append(self.simulate())
        coroutine_list.append(self.prediction_worker())
        self.loop.run_until_complete(asyncio.gather(*coroutine_list))
        # print('search mcts in time %.2f' % (time.time() - start_time))

    async def simulate(self):
        leaf_v = await self.simulate_internal()
        await self.prediction_queue.put(None)
        return leaf_v

    async def simulate_internal(self):
        assert self.root_node.expanded

        virtual_loss = self.config.play.virtual_loss
        env = self.make_sim_env_fn()
        cur_node = self.root_node
        while True:
            next_node = await self.select_next_or_expand(env, cur_node)
            if next_node is None:
                # cur_node is expanded leaf node
                leaf_node = cur_node
                v = leaf_node.v
                break
            env.step(next_node.value)
            if env.done:
                leaf_node = next_node
                v = 1 if env.last_player_wins else -1 if env.last_player_loses else 0
                v = -v
                leaf_node.v = float(v)
                break
            # select next node
            cur_node = next_node
        # backup
        cur_node = leaf_node
        while cur_node is not self.root_node:
            v = -v  # important: reverse v
            parent = cur_node.parent
            with await parent.locker:
                parent.backup(v, virtual_loss, cur_node.sibling_index)

            cur_node = parent
        return -v  # v for root node

    async def select_next_or_expand(self, env, node):
        with await node.locker:
            if node.expanded:
                # select node
                if node.passed:
                    return node.children[0]
                ci = random.choice(node.best_children_indices)
                next_node = node.children[ci]
                virtual_loss = self.config.play.virtual_loss
                node.add_virtual_loss(virtual_loss, next_node.sibling_index)
                return next_node
            # expand node
            ob, legal_moves, rotate_flip_op = np.asarray(env.observation), np.asarray(env.legal_moves), None
            env_legal_moves = legal_moves
            if env.rotate_flip_op_count > 0:
                rotate_flip_op = random.randint(0, env.rotate_flip_op_count - 1)
                ob = env.rotate_flip_ob(ob, rotate_flip_op)
                legal_moves = env.rotate_flip_pi(legal_moves, rotate_flip_op)

            future = await self.predict(ob, legal_moves)
            await future
            p, v = future.result()

            if rotate_flip_op is not None:
                p = env.counter_rotate_flip_pi(p, rotate_flip_op)

            node.expand_and_evaluate(p, v, env_legal_moves)
            return None

    async def prediction_worker(self):
        q = self.prediction_queue
        while self.running_simulation_num > 0:
            item_list = []
            item = await q.get()
            if item is None:
                self.running_simulation_num -= 1
            else:
                item_list.append(item)
            while not q.empty():
                try:
                    item = q.get_nowait()
                    if item is None:
                        self.running_simulation_num -= 1
                        continue
                    item_list.append(item)
                except QueueEmpty:
                    break
            if len(item_list) == 0:
                continue
            start_time = time.time()
            data = np.array([x.state for x in item_list])
            legal_moves = np.array([x.legal_moves for x in item_list])
            policy_ary, value_ary = self.api.predict(data, legal_moves)  # policy_ary: [n, 64], value_ary: [n, 1]
            for p, v, item in zip(policy_ary, value_ary, item_list):
                item.future.set_result((p, v))
            # print('prediction worker process %d stats in time %.2f' % (len(item_list), time.time() - start_time))

    async def predict(self, x, legal_moves):
        future = self.loop.create_future()
        item = QueueItem(self, x, legal_moves, future)
        await self.prediction_queue.put(item)
        return future

    # Illegal actions have full_N == 0, so they are never played.
    def play(self, tau):
        if self.root_node.passed:
            pi = np.zeros([self.root_node._full_n_size])
            act = 64
        else:
            N = self.root_node.full_N
            if abs(tau-1) < 1e-10:
                pi = N / np.sum(N)
                act = np.random.choice(range(len(pi)), p=pi)
                assert pi[act] > 0
            else:
                assert abs(tau) < 1e-10, f'tau={tau}(expected to be either 0 or 1 only)'
                act = random.choice(np.argwhere(abs(N - np.amax(N)) < 1e-10).flatten().tolist())
                pi = np.zeros([len(N)])
                pi[act] = 1

        # The paper says AGZ resigns if both the root value and the best child value are below the threshold.
        # TODO: should the check use v, Q, or Q+U?
        root_v = self.root_node.v
        # A child's v is the opponent's winning rate, so it must be negated.
        # Note that root_node.children only covers legal actions.
        children_v = [-child.v for child in self.root_node.children]
        if len(children_v) > 0:
            best_child_v = np.max(children_v)
        else:
            best_child_v = root_v  # trick: since this is only used for the resign check, falling back to root_v is fine.
        values_of_resign_check = (root_v, best_child_v)

        return int(act), pi, values_of_resign_check
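
play(tau) above (like calc_policy in Example #4) follows the AlphaZero temperature rule: with tau = 1 the move probabilities are pi(a) = N(a) / sum_b N(b) and the move is sampled from pi, while with tau = 0 all probability mass goes to a maximally visited move (the snippet breaks ties among equally visited moves at random). A minimal illustration with made-up visit counts; the names below are hypothetical.

# Illustrative temperature handling with made-up visit counts (hypothetical values).
import numpy as np

N = np.array([30.0, 10.0, 0.0, 60.0])  # root visit counts (full_N in the snippet)

# tau = 1: sample a move proportionally to visit counts
pi_tau1 = N / np.sum(N)
act_tau1 = int(np.random.choice(len(N), p=pi_tau1))

# tau = 0: deterministic, all probability mass on the most visited move
pi_tau0 = np.zeros(len(N))
pi_tau0[int(np.argmax(N))] = 1.0
act_tau0 = int(np.argmax(N))

print(act_tau1, act_tau0)  # act_tau0 is always 3 here; act_tau1 is sampled at random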