Example #1
0
 async def start_search_my_move(self, board):
     """Run one search simulation from ``board`` as the root position.

     The number of in-flight simulations is tracked in
     ``self.running_simulation_num``; the counter is decremented even when
     the search raises, so a failed simulation cannot leave it stuck.

     :param board: board state passed to ``Connect4Env().update``.
     :return: the value returned by ``search_my_move`` for the root node.
     """
     self.running_simulation_num += 1
     try:
         # ``async with`` replaces the ``with await lock`` form, which was
         # removed in Python 3.9. Assumes self.sem is an asyncio
         # synchronization primitive (e.g. Semaphore) -- TODO confirm.
         async with self.sem:  # reduce parallel search number
             env = Connect4Env().update(board)
             return await self.search_my_move(env, is_root_node=True)
     finally:
         self.running_simulation_num -= 1
    def action(self, board):
        """Search from ``board`` and return the chosen action index.

        The search / sampling round is repeated up to
        ``play_config.thinking_loop`` times. A round ends the loop as soon as
        the action sampled from the policy equals the best action by value,
        or while the game is still before ``change_tau_turn``.

        Side effects: stores a ``HistoryItem`` in ``self.thinking_history``
        and appends ``[observation, policy]`` to ``self.moves``.

        :param board: board state passed to ``Connect4Env().update``.
        :return: the selected action as a plain ``int``.
        """
        env = Connect4Env().update(board)
        key = self.counter_key(env)

        for round_idx in range(self.play_config.thinking_loop):
            # From the second round on, report the previous round's choices.
            if round_idx > 0 and self.play_config.logging_thinking:
                logger.debug(
                    f"continue thinking: policy move=({action % 8}, {action // 8}), "
                    f"value move=({action_by_value % 8}, {action_by_value // 8})"
                )

            self.search_moves(board)
            policy = self.calc_policy(board)

            # Sample an action index using the policy as the weights.
            sampled = np.random.choice(range(self.labels_n), p=policy)
            action = int(sampled)

            # Highest-Q action; the +100 bonus ranks visited actions first.
            q_values = self.var_q[key]
            visited_bonus = (self.var_n[key] > 0) * 100
            action_by_value = int(np.argmax(q_values + visited_bonus))

            if action == action_by_value:
                break
            if env.turn < self.play_config.change_tau_turn:
                break

        # this is for play_gui, not necessary when training.
        q_snapshot = list(self.var_q[key])
        n_snapshot = list(self.var_n[key])
        self.thinking_history[env.observation] = HistoryItem(
            action, policy, q_snapshot, n_snapshot)

        self.moves.append([env.observation, list(policy)])
        return action
Example #3
0
def start(config: Config):
    """Play interactive games between a human and the AI, forever.

    For each game the human's colour is drawn at random, moves are requested
    from whichever side is to play, the board is rendered after every move,
    and the result is printed once the game is over.

    :param config: project configuration; ``config.play`` is updated by
        ``PlayWithHumanConfig`` before the model is built.
    """
    PlayWithHumanConfig().update_play_config(config.play)
    connect4_model = PlayWithHuman(config)

    while True:
        env = Connect4Env().reset()
        human_is_black = random() < 0.5
        connect4_model.start_game(human_is_black)

        while not env.done:
            # The human moves exactly when the side to play is the human's.
            black_to_move = env.player_turn() == Player.black
            if black_to_move == human_is_black:
                action = connect4_model.move_by_human(env)
                print("You move to: " + str(action + 1))
            else:
                action = connect4_model.move_by_ai(env)
                print("IA moves to: " + str(action + 1))
            env.step(action)
            env.render()

        print("\nEnd of the game.")
        print("Game result:")
        if env.winner == Winner.white:
            outcome = "X wins"
        elif env.winner == Winner.black:
            outcome = "O wins"
        else:
            outcome = "Game was a draw"
        print(outcome)
    def play_game(self, best_model, ng_model):
        """Play one evaluation game between the best and next-generation models.

        Colours are assigned at random. Both players are built with
        ``self.config.eval.play_config``.

        :param best_model: model wrapped as the current best player.
        :param ng_model: model wrapped as the next-generation challenger.
        :return: ``(ng_win, best_is_white)`` where ``ng_win`` is 1 if the
            challenger won, 0 if the best model won, and ``None`` otherwise.
        """
        env = Connect4Env().reset()

        eval_play_config = self.config.eval.play_config
        best_player = Connect4Player(
            self.config, best_model, play_config=eval_play_config)
        ng_player = Connect4Player(
            self.config, ng_model, play_config=eval_play_config)

        best_is_white = random() < 0.5
        if best_is_white:
            black, white = ng_player, best_player
        else:
            black, white = best_player, ng_player

        env.reset()
        while not env.done:
            mover = black if env.player_turn() == Player.black else white
            env.step(mover.action(env.board))

        # Translate the winning colour into "did the challenger win?".
        if env.winner == Winner.white:
            ng_win = 0 if best_is_white else 1
        elif env.winner == Winner.black:
            ng_win = 1 if best_is_white else 0
        else:
            ng_win = None
        return ng_win, best_is_white
    async def search_my_move(self, env: Connect4Env, is_root_node=False):
        """Run one recursive tree-search step from ``env`` and back up its value.

        Q, V are values for this player (always white).
        P is the value for the player who moves next (black or white).

        Flow:
          * finished game: return 1 / -1 / 0 for a white win / black win /
            anything else;
          * position not yet expanded: expand and evaluate it, returning the
            value with its sign flipped when it is not white's turn;
          * otherwise: select an action, apply it to ``env`` (``env`` is
            mutated by ``env.step``), recurse, then update the N, W and Q
            entries of the chosen action.

        :param env: current game state; advanced in place on non-leaf nodes.
        :param is_root_node: forwarded to ``select_action_q_and_u``; the
            recursive call leaves it at the default ``False``.
        :return: the value obtained at the end of the search path, as
            returned by the terminal check, the leaf evaluation, or the
            recursive call.
        """
        # Terminal position: the result itself is the value.
        if env.done:
            if env.winner == Winner.white:
                return 1
            elif env.winner == Winner.black:
                return -1
            else:
                return 0

        key = self.counter_key(env)

        # Wait while this key is listed in now_expanding (presumably set by
        # another in-flight simulation that is expanding the same node).
        while key in self.now_expanding:
            await asyncio.sleep(self.config.play.wait_for_expanding_sleep_sec)

        # is leaf?
        if key not in self.expanded:  # reach leaf node
            leaf_v = await self.expand_and_evaluate(env)
            if env.player_turn() == Player.white:
                return leaf_v  # Value for white
            else:
                # NOTE(review): presumably leaf_v is from the side to move's
                # view, so value for white == -(value for black) -- confirm.
                return -leaf_v  # Value for white == -Value for black

        action_t = self.select_action_q_and_u(env, is_root_node)
        _, _ = env.step(action_t)

        # Before exploring further, apply virtual_loss to lower the chance
        # that the current path is explored again.
        virtual_loss = self.config.play.virtual_loss
        self.var_n[key][action_t] += virtual_loss
        self.var_w[key][action_t] -= virtual_loss
        leaf_v = await self.search_my_move(env)  # next move

        # on returning search path
        # update: N, W, Q, U
        # Undo the virtual loss, then count one real visit and add leaf_v.
        n = self.var_n[key][
            action_t] = self.var_n[key][action_t] - virtual_loss + 1
        w = self.var_w[key][
            action_t] = self.var_w[key][action_t] + virtual_loss + leaf_v
        self.var_q[key][action_t] = w / n  # back up Q(s,a)
        return leaf_v
Example #6
0
 def calc_policy(self, board):
     """Compute the move policy pi(a|s0) for ``board`` from visit counts.

     Before ``play_config.change_tau_turn`` the policy is the visit counts
     divided by their sum (tau = 1). From that turn on it is a one-hot
     vector on the most visited action (tau = 0).

     :param board: board state passed to ``Connect4Env().update``.
     :return: the policy vector; the one-hot form has length
         ``self.labels_n``.
     """
     pc = self.play_config
     env = Connect4Env().update(board)
     visit_counts = self.var_n[self.counter_key(env)]

     if env.turn < pc.change_tau_turn:
         # tau = 1
         return visit_counts / np.sum(visit_counts)

     # tau = 0
     one_hot = np.zeros(self.labels_n)
     one_hot[np.argmax(visit_counts)] = 1
     return one_hot
    def convert_to_training_data(data):
        """Convert buffered self-play records into training arrays.

        Each record's state is reshaped to a 6x7 board and loaded into a
        ``Connect4Env``. Its two colour planes are then ordered so that the
        side to move comes first.

        :param data: iterable of ``(state, policy, z)`` records; format is
            SelfPlayWorker.buffer.
        :return: ``(states, policies, zs)`` as three numpy arrays.
        """
        states, policies, outcomes = [], [], []

        for raw_state, policy, z in data:
            grid = np.reshape(list(raw_state), (6, 7))
            env = Connect4Env().update(grid)

            black_ary, white_ary = env.black_and_white_plane()
            # Plane of the player to move goes first.
            if env.player_turn() == Player.black:
                planes = [black_ary, white_ary]
            else:
                planes = [white_ary, black_ary]

            states.append(planes)
            policies.append(policy)
            outcomes.append(z)

        return np.array(states), np.array(policies), np.array(outcomes)
Example #8
0
def start(config: Config):
    """Configure the TensorFlow session and run the self-play worker.

    :param config: project configuration handed to ``SelfPlayWorker``.
    :return: whatever ``SelfPlayWorker.start()`` returns.
    """
    tf_util.set_session_config(per_process_gpu_memory_fraction=0.2)
    env = Connect4Env()
    worker = SelfPlayWorker(config, env=env)
    return worker.start()