Code example #1
File: LocalLBRWorker.py    Project: zxpower/PokerRL
class LocalLBRWorker:
    """
    Slave to EvalLBRMaster. Does the LBR computation as described in https://arxiv.org/abs/1612.07547
    """
    def __init__(self, t_prof, chief_handle, eval_agent_cls):
        assert t_prof.n_seats == 2

        self.t_prof = t_prof
        self.lbr_args = t_prof.module_args["lbr"]
        self._eval_env_bldr = _util.get_env_builder_lbr(t_prof=t_prof)
        self.check_to_round = self.lbr_args.lbr_check_to_round

        self.chief_handle = chief_handle

        self.agent = _AgentWrapper(t_prof=t_prof,
                                   lbr_args=self.lbr_args,
                                   eval_agent_cls=eval_agent_cls)

        # This env can have different raise sizes than the agent's env! This has to be accounted for when updating the envs after the opponent acts
        self._env = None
        self.agent_range = PokerRange(env_bldr=self._eval_env_bldr)

        assert self.check_to_round is None or (
            self.check_to_round in self._eval_env_bldr.rules.ALL_ROUNDS_LIST)

    def run(self, agent_seat_id, n_iterations, mode, stack_size):
        """ returns an estimate of a lower bound of the exploitablity of the agent """

        self.agent.set_mode(mode=mode)
        self.agent.to_stack_size(stack_size)
        self.agent_range.reset()

        self._env = self._eval_env_bldr.get_new_env(is_evaluating=True,
                                                    stack_size=stack_size)

        if not self.agent.can_compute_mode():
            return None

        if self._eval_env_bldr.env_cls.IS_FIXED_LIMIT_GAME:
            return self._run_limit(agent_seat_id=agent_seat_id,
                                   n_iterations=n_iterations)
        else:
            return self._run_no_limit(agent_seat_id=agent_seat_id,
                                      n_iterations=n_iterations)

    def update_weights(self, weights_for_eval_agent):
        self.agent.update_weights(weights_for_eval_agent)

    def _reset_episode(self):
        ret = self._env.reset()
        self.agent.reset(deck_state_dict=self._env.cards_state_dict())
        self.agent_range.reset()
        return ret

    def _run_limit(self, agent_seat_id, n_iterations):
        total_lbr_winnings = np.empty(shape=n_iterations, dtype=np.float32)
        lbr_seat_id = 1 - agent_seat_id

        for iteration_id in range(n_iterations):
            if iteration_id % 50 == 0:
                print("LBR hand: ", iteration_id)

            # """""""""""""""""
            # Reset
            # """""""""""""""""
            env_obs, reward, terminal, info = self._reset_episode()

            lbr_hand = self._env.get_hole_cards_of_player(p_id=lbr_seat_id)
            self.agent_range.set_cards_to_zero_prob(cards_2d=lbr_hand)

            # """""""""""""""""
            # Play Episode
            # """""""""""""""""
            while not terminal:
                p_id_acting = self._env.current_player.seat_id

                if self.t_prof.DEBUGGING:
                    assert p_id_acting == self.agent.cpu_agent._internal_env_wrapper.env.current_player.seat_id

                if p_id_acting == lbr_seat_id:
                    # optional feature: check the first N rounds 100% as LBR
                    if (self.check_to_round is not None) and (
                            self._env.current_round < self.check_to_round):
                        action_int = Poker.CHECK_CALL

                    else:
                        _rollout_mngr = _LBRRolloutManager(
                            t_prof=self.t_prof,
                            env_bldr=self._eval_env_bldr,
                            env=self._env,
                            lbr_hand_2d=lbr_hand)

                        # per-action EV estimates; illegal actions keep -1.0, FOLD is set to 0,
                        # and legal actions get their estimated EV below
                        _utility = np.full(shape=3,
                                           fill_value=-1.0,
                                           dtype=np.float32)

                        # ev(s, lbr_a=fold)
                        _utility[Poker.FOLD] = 0.0

                        # ev(s, lbr_a=check_call)
                        _wp = _rollout_mngr.get_lbr_checkdown_equity(
                            agent_range=self.agent_range)  # if check/called down
                        _asked = (self._env.seats[agent_seat_id].current_bet
                                  - self._env.seats[lbr_seat_id].current_bet)
                        _pot_before_action = self._env.get_all_winnable_money()
                        _utility[Poker.CHECK_CALL] = _wp * _pot_before_action - (1 - _wp) * _asked

                        # prepare for raise simulation
                        if Poker.BET_RAISE in self._env.get_legal_actions():
                            _saved_env_state = self._env.state_dict()
                            _saved_agent_env_state = self.agent.env_state_dict()
                            _saved_agent_range_state = self.agent_range.state_dict()

                            # compute ev for raise
                            # _________________________________ simulate LBR play r ____________________________________
                            self._env.step(action=Poker.BET_RAISE)
                            _pot_after_raise = self._env.get_all_winnable_money()

                            self.agent.notify_of_action(
                                p_id_acted=lbr_seat_id,
                                action_he_did=Poker.BET_RAISE)

                            # what agent would do after LBR raises. DOESN'T STEP INTERNAL ENV!
                            _, a_probs_each_hand = self.agent.get_action(
                                step_env=False, need_probs=True)

                            # _______________________________ simulate agent reaction __________________________________
                            # p(agent_fold)
                            _fold_prob = np.sum(
                                self.agent_range.range *
                                a_probs_each_hand[:, Poker.FOLD])

                            # p(not agent_fold | hand)
                            _p_not_fold_per_hand = (
                                1 - a_probs_each_hand[:, Poker.FOLD])

                            # agent_range after not folding
                            self.agent_range.mul_and_norm(_p_not_fold_per_hand)

                            # p(lbr_win | lbr play r -> agent play not fold)
                            _wp_now = _rollout_mngr.get_lbr_checkdown_equity(
                                agent_range=self.agent_range)

                            # ev(state, lbr_a=r)
                            _chips_lbr_puts_in_pot = _pot_after_raise - _pot_before_action
                            _ev_if_fold = _pot_before_action
                            _ev_if_not_fold = (_wp_now * _pot_after_raise) - (
                                (1 - _wp_now) * _chips_lbr_puts_in_pot)
                            _utility[Poker.BET_RAISE] = \
                                _fold_prob * _ev_if_fold + (1 - _fold_prob) * _ev_if_not_fold

                            # ________________________________________ reset ___________________________________________
                            self.agent_range.load_state_dict(
                                _saved_agent_range_state)
                            self._env.load_state_dict(_saved_env_state)
                            self.agent.load_env_state_dict(
                                _saved_agent_env_state)

                        # select action with highest approximated EV
                        action_int = np.argmax(_utility)

                    # ________________________________________ notify agent ____________________________________________
                    self.agent.notify_of_action(p_id_acted=lbr_seat_id,
                                                action_he_did=action_int)

                else:  # agent has to act
                    action_int, a_probs_each_hand = self.agent.get_action(
                        step_env=True, need_probs=True)
                    self.agent_range.update_after_action(
                        action=action_int,
                        all_a_probs_for_all_hands=a_probs_each_hand)

                # _____________________________________________ step ___________________________________________________
                old_game_round = self._env.current_round

                env_obs, reward, terminal, info = self._env.step(
                    action=action_int)

                if self._env.current_round != old_game_round:
                    self.agent_range.update_after_new_round(
                        new_round=self._env.current_round,
                        board_now_2d=self._env.board)

            total_lbr_winnings[iteration_id] = \
                reward[lbr_seat_id] * self._env.REWARD_SCALAR * self._env.EV_NORMALIZER

        return total_lbr_winnings

    def _run_no_limit(self, agent_seat_id, n_iterations):
        total_lbr_winnings = np.empty(shape=n_iterations, dtype=np.float32)
        lbr_seat_id = 1 - agent_seat_id
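        # In no-limit, the LBR chooses among FOLD, CHECK_CALL, and one discrete raise action per
        # configured pot-fraction bet size (raise actions occupy indices 2, ..., 2 + n_lbr_bets - 1).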
        n_lbr_bets = len(self._env.bet_sizes_list_as_frac_of_pot)

        for iteration_id in range(n_iterations):
            if iteration_id % 50 == 0:
                print("LBR hand: ", iteration_id)

            # """""""""""""""""
            # Reset
            # """""""""""""""""
            env_obs, reward, done, info = self._reset_episode()

            lbr_hand = self._env.get_hole_cards_of_player(p_id=lbr_seat_id)
            self.agent_range.set_cards_to_zero_prob(cards_2d=lbr_hand)

            # """""""""""""""""
            # Play Episode
            # """""""""""""""""
            while not done:
                p_id_acting = self._env.current_player.seat_id

                if self.t_prof.DEBUGGING:
                    assert p_id_acting == self.agent.cpu_agent._internal_env_wrapper.env.current_player.seat_id

                if p_id_acting == lbr_seat_id:

                    # optional feature: check the first N rounds 100% as LBR
                    if (self.check_to_round is not None) and (
                            self._env.current_round < self.check_to_round):
                        action_int = Poker.CHECK_CALL

                    else:
                        _rollout_mngr = _LBRRolloutManager(
                            t_prof=self.t_prof,
                            env_bldr=self._eval_env_bldr,
                            env=self._env,
                            lbr_hand_2d=lbr_hand)

                        # per-action EV estimates; index 0 = FOLD, 1 = CHECK_CALL, 2.. = raise sizes.
                        # Illegal actions keep -1.0, FOLD is set to 0, legal actions get their estimated EV below
                        _utility = np.full(shape=2 + n_lbr_bets,
                                           fill_value=-1.0,
                                           dtype=np.float32)

                        # ev(s, lbr_a=fold)
                        _utility[Poker.FOLD] = 0.0

                        # ev(s, lbr_a=check_call)
                        _wp = _rollout_mngr.get_lbr_checkdown_equity(
                            agent_range=self.agent_range)
                        _asked = (self._env.seats[agent_seat_id].current_bet
                                  - self._env.seats[lbr_seat_id].current_bet)
                        _pot_before_action = self._env.get_all_winnable_money()
                        _utility[Poker.CHECK_CALL] = _wp * _pot_before_action - (1 - _wp) * _asked

                        # prepare for raise simulation
                        _saved_env_state = self._env.state_dict()
                        _saved_agent_env_state = self.agent.env_state_dict()
                        _saved_agent_range_state = self.agent_range.state_dict()
                        _legal_raises = self._env.get_legal_actions()
                        for a in [Poker.FOLD, Poker.CHECK_CALL]:
                            if a in _legal_raises:
                                _legal_raises.remove(a)

                        # compute ev for all raise sizes LBR can choose from
                        for r in _legal_raises:
                            raise_frac = self._env.bet_sizes_list_as_frac_of_pot[r - 2]

                            # _________________________________ simulate LBR play r ____________________________________
                            self._env.step(action=r)
                            _pot_after_raise = self._env.get_all_winnable_money()

                            self.agent.notify_of_raise_frac_action(
                                p_id_acted=lbr_seat_id, frac=raise_frac)

                            if self.t_prof.DEBUGGING:
                                assert agent_seat_id == self.agent.cpu_agent._internal_env_wrapper.env.current_player.seat_id

                            # what agent would do after LBR raises. DOESN'T STEP INTERNAL ENV!
                            a_probs_each_hand = self.agent.get_a_probs_for_each_hand()

                            # _______________________________ simulate agent reaction __________________________________
                            # p(agent_fold)
                            _fold_prob = np.sum(
                                self.agent_range.range *
                                a_probs_each_hand[:, Poker.FOLD])

                            # p(not agent_fold | hand)
                            _p_not_fold_per_hand = (
                                1 - a_probs_each_hand[:, Poker.FOLD])

                            # agent_range after not folding
                            self.agent_range.mul_and_norm(_p_not_fold_per_hand)

                            # p(lbr_win | lbr play r -> agent play not fold)
                            _wp_now = _rollout_mngr.get_lbr_checkdown_equity(
                                agent_range=self.agent_range)

                            # ev(state, lbr_a=r)
                            _chips_lbr_puts_in_pot = _pot_after_raise - _pot_before_action
                            _ev_if_fold = _pot_before_action
                            _ev_if_not_fold = (_wp_now * _pot_after_raise) - (
                                (1 - _wp_now) * _chips_lbr_puts_in_pot)
                            _utility[r] = _fold_prob * _ev_if_fold + (
                                1 - _fold_prob) * _ev_if_not_fold

                            # ________________________________________ reset ___________________________________________
                            self.agent_range.load_state_dict(
                                _saved_agent_range_state)
                            self._env.load_state_dict(_saved_env_state)
                            self.agent.load_env_state_dict(
                                _saved_agent_env_state)

                        # select action with highest approximated EV
                        action_int = np.argmax(_utility)

                    # ________________________________________ notify agent ____________________________________________
                    if action_int >= 2:
                        raise_frac = self._env.bet_sizes_list_as_frac_of_pot[action_int - 2]
                        self.agent.notify_of_raise_frac_action(
                            p_id_acted=lbr_seat_id, frac=raise_frac)
                    else:
                        self.agent.notify_of_action(p_id_acted=lbr_seat_id,
                                                    action_he_did=action_int)

                else:  # agent has to act
                    if self.t_prof.DEBUGGING:
                        assert p_id_acting == self.agent.cpu_agent._internal_env_wrapper.env.current_player.seat_id

                    action_int, a_probs_each_hand = self.agent.get_action(
                        step_env=True, need_probs=True)

                    self.agent_range.update_after_action(
                        action=action_int,
                        all_a_probs_for_all_hands=a_probs_each_hand)
                    if action_int >= 2:
                        # querying what the bet size is in the agent's env_args (these might differ from LBR's!).
                        raise_frac = \
                            self.agent.cpu_agent.env_bldr.env_args.bet_sizes_list_as_frac_of_pot[action_int - 2]

                # _____________________________________________ step ___________________________________________________
                old_game_round = self._env.current_round

                if action_int >= 2:  # step with fraction because agent and LBR have different raise sizes
                    env_obs, reward, done, info = self._env.step_raise_pot_frac(
                        pot_frac=raise_frac)
                else:
                    env_obs, reward, done, info = self._env.step(
                        action=action_int)

                if self._env.current_round != old_game_round:
                    self.agent_range.update_after_new_round(
                        new_round=self._env.current_round,
                        board_now_2d=self._env.board)

            total_lbr_winnings[iteration_id] = \
                reward[lbr_seat_id] * self._env.REWARD_SCALAR * self._env.EV_NORMALIZER

        return total_lbr_winnings
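The per-action EV formulas used above are easy to sanity-check by hand. The standalone sketch below recomputes them with invented numbers (every quantity is hypothetical, not taken from PokerRL): calling wins the current pot with probability wp and otherwise loses the amount still owed, while raising either wins the pot immediately when the agent folds, or is resolved against the agent's non-folding range in the larger pot, minus the chips the LBR just committed. The indices 0 = FOLD, 1 = CHECK_CALL, 2 = BET_RAISE match how _utility is indexed in the code above.

import numpy as np

# Hypothetical situation; every number below is invented for illustration only.
wp = 0.40                # P(LBR wins) if the hand is checked/called down from here
pot_before = 100.0       # chips winnable before the LBR acts
asked = 20.0             # chips the LBR would still have to call
pot_after_raise = 180.0  # chips winnable after the LBR raises
fold_prob = 0.30         # P(agent folds to the raise), averaged over the agent's range
wp_after_raise = 0.35    # P(LBR wins) against the agent's range given that it did not fold

utility = np.full(shape=3, fill_value=-1.0, dtype=np.float32)

utility[0] = 0.0                                  # ev(fold): give up, win/lose nothing further
utility[1] = wp * pot_before - (1 - wp) * asked   # ev(check/call)

chips_lbr_puts_in = pot_after_raise - pot_before
ev_if_fold = pot_before                           # agent folds -> LBR takes the pot as it stands
ev_if_not_fold = wp_after_raise * pot_after_raise - (1 - wp_after_raise) * chips_lbr_puts_in
utility[2] = fold_prob * ev_if_fold + (1 - fold_prob) * ev_if_not_fold   # ev(raise)

print(utility)                   # approximately [ 0.  28.  37.7]
print(int(np.argmax(utility)))   # 2 -> the raise has the highest approximated EV here

In the limit game these three entries are compared directly; in the no-limit game the vector simply grows by one entry per configured raise size, each filled with the same raise formula.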
Code example #2
File: test_rangeManager.py    Project: zxpower/PokerRL
    def test_save_load(self):
        env_bldr = get_leduc_env_bldr()
        range_ = PokerRange(env_bldr=env_bldr)
        range_.load_state_dict(range_.state_dict())
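Code example #2 only verifies that PokerRange survives a state_dict / load_state_dict round-trip. That round-trip is exactly what the raise simulation in LocalLBRWorker leans on: it snapshots the range, reweights it by the agent's per-hand non-fold probabilities, reads the resulting equity, and then restores the snapshot before the real action is taken. The following is a minimal sketch of that pattern; get_leduc_env_bldr is the helper used in the test module above, and the import path and the random non-fold probabilities are assumptions made purely for illustration.

import numpy as np

from PokerRL.game.PokerRange import PokerRange  # import path assumed from the PokerRL project layout

env_bldr = get_leduc_env_bldr()   # helper from test_rangeManager.py (assumed importable here)
agent_range = PokerRange(env_bldr=env_bldr)
agent_range.reset()

saved = agent_range.state_dict()  # snapshot before the what-if simulation

# hypothetical per-hand P(agent does not fold); in LocalLBRWorker this comes from the agent's policy
p_not_fold_per_hand = np.random.uniform(low=0.5, high=1.0,
                                        size=agent_range.range.shape).astype(np.float32)
agent_range.mul_and_norm(p_not_fold_per_hand)   # condition the range on "agent did not fold"

# ... query checkdown equities against the reweighted range here ...

agent_range.load_state_dict(saved)  # undo the what-if; the actual episode continues from the snapshot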