import numpy as np

from PokerRL.game.Poker import Poker
from PokerRL.game.PokerRange import PokerRange

# _util, _AgentWrapper, and _LBRRolloutManager are internal helpers of the LBR evaluation package;
# their imports are omitted in this excerpt.


class LocalLBRWorker:
    """
    Slave to EvalLBRMaster. Does the LBR computation as described in https://arxiv.org/abs/1612.07547
    """

    def __init__(self, t_prof, chief_handle, eval_agent_cls):
        assert t_prof.n_seats == 2
        self.t_prof = t_prof
        self.lbr_args = t_prof.module_args["lbr"]
        self._eval_env_bldr = _util.get_env_builder_lbr(t_prof=t_prof)
        self.check_to_round = self.lbr_args.lbr_check_to_round

        self.chief_handle = chief_handle

        self.agent = _AgentWrapper(t_prof=t_prof, lbr_args=self.lbr_args, eval_agent_cls=eval_agent_cls)

        # The LBR env can have different raise sizes than the agent's env! This needs to be considered
        # when updating the envs after the opponent acts.
        self._env = None
        self.agent_range = PokerRange(env_bldr=self._eval_env_bldr)

        assert self.check_to_round is None or (self.check_to_round in self._eval_env_bldr.rules.ALL_ROUNDS_LIST)

    def run(self, agent_seat_id, n_iterations, mode, stack_size):
        """ Returns an estimate of a lower bound on the exploitability of the agent. """
        self.agent.set_mode(mode=mode)
        self.agent.to_stack_size(stack_size)
        self.agent_range.reset()
        self._env = self._eval_env_bldr.get_new_env(is_evaluating=True, stack_size=stack_size)

        if not self.agent.can_compute_mode():
            return None

        if self._eval_env_bldr.env_cls.IS_FIXED_LIMIT_GAME:
            return self._run_limit(agent_seat_id=agent_seat_id, n_iterations=n_iterations)
        else:
            return self._run_no_limit(agent_seat_id=agent_seat_id, n_iterations=n_iterations)

    def update_weights(self, weights_for_eval_agent):
        self.agent.update_weights(weights_for_eval_agent)

    def _reset_episode(self):
        ret = self._env.reset()
        self.agent.reset(deck_state_dict=self._env.cards_state_dict())
        self.agent_range.reset()
        return ret

    def _run_limit(self, agent_seat_id, n_iterations):
        total_lbr_winnings = np.empty(shape=n_iterations, dtype=np.float32)
        lbr_seat_id = 1 - agent_seat_id

        for iteration_id in range(n_iterations):
            if iteration_id % 50 == 0:
                print("LBR hand: ", iteration_id)

            # """""""""""""""""
            # Reset
            # """""""""""""""""
            env_obs, reward, terminal, info = self._reset_episode()

            lbr_hand = self._env.get_hole_cards_of_player(p_id=lbr_seat_id)
            self.agent_range.set_cards_to_zero_prob(cards_2d=lbr_hand)

            # """""""""""""""""
            # Play Episode
            # """""""""""""""""
            while not terminal:
                p_id_acting = self._env.current_player.seat_id

                if self.t_prof.DEBUGGING:
                    assert p_id_acting == self.agent.cpu_agent._internal_env_wrapper.env.current_player.seat_id

                if p_id_acting == lbr_seat_id:
                    # optional feature: check the first N rounds 100% as LBR
                    if (self.check_to_round is not None) and (self._env.current_round < self.check_to_round):
                        action_int = Poker.CHECK_CALL

                    else:
                        _rollout_mngr = _LBRRolloutManager(t_prof=self.t_prof, env_bldr=self._eval_env_bldr,
                                                           env=self._env, lbr_hand_2d=lbr_hand)

                        # illegal: -1, fold: 0, all other: any float
                        _utility = np.full(shape=3, fill_value=-1.0, dtype=np.float32)

                        # ev(s, lbr_a=fold)
                        _utility[Poker.FOLD] = 0.0

                        # ev(s, lbr_a=check_call)
                        _wp = _rollout_mngr.get_lbr_checkdown_equity(agent_range=self.agent_range)  # if check/called down
                        _asked = self._env.seats[agent_seat_id].current_bet - self._env.seats[lbr_seat_id].current_bet
                        _pot_before_action = self._env.get_all_winnable_money()
                        _utility[Poker.CHECK_CALL] = _wp * _pot_before_action - (1 - _wp) * _asked

                        # prepare for raise simulation
                        if Poker.BET_RAISE in self._env.get_legal_actions():
                            _saved_env_state = self._env.state_dict()
                            _saved_agent_env_state = self.agent.env_state_dict()
                            _saved_agent_range_state = self.agent_range.state_dict()

                            # compute ev for raise
                            # _________________________________ simulate LBR play r ____________________________________
                            self._env.step(action=Poker.BET_RAISE)
                            _pot_after_raise = self._env.get_all_winnable_money()

                            self.agent.notify_of_action(p_id_acted=lbr_seat_id, action_he_did=Poker.BET_RAISE)

                            # what the agent would do after LBR raises. DOESN'T STEP THE INTERNAL ENV!
                            _, a_probs_each_hand = self.agent.get_action(step_env=False, need_probs=True)

                            # _______________________________ simulate agent reaction __________________________________
                            # p(agent_fold)
                            _fold_prob = np.sum(self.agent_range.range * a_probs_each_hand[:, Poker.FOLD])

                            # p(not agent_fold | hand)
                            _p_not_fold_per_hand = (1 - a_probs_each_hand[:, Poker.FOLD])

                            # agent_range after not folding
                            self.agent_range.mul_and_norm(_p_not_fold_per_hand)

                            # p(lbr_win | lbr play r -> agent play not fold)
                            _wp_now = _rollout_mngr.get_lbr_checkdown_equity(agent_range=self.agent_range)

                            # ev(state, lbr_a=r)
                            _chips_lbr_puts_in_pot = _pot_after_raise - _pot_before_action
                            _ev_if_fold = _pot_before_action
                            _ev_if_not_fold = (_wp_now * _pot_after_raise) - ((1 - _wp_now) * _chips_lbr_puts_in_pot)
                            _utility[Poker.BET_RAISE] = _fold_prob * _ev_if_fold + (1 - _fold_prob) * _ev_if_not_fold

                            # ________________________________________ reset ___________________________________________
                            self.agent_range.load_state_dict(_saved_agent_range_state)
                            self._env.load_state_dict(_saved_env_state)
                            self.agent.load_env_state_dict(_saved_agent_env_state)

                        # select action with highest approximated EV
                        action_int = np.argmax(_utility)

                    # ________________________________________ notify agent ____________________________________________
                    self.agent.notify_of_action(p_id_acted=lbr_seat_id, action_he_did=action_int)

                else:  # agent has to act
                    action_int, a_probs_each_hand = self.agent.get_action(step_env=True, need_probs=True)
                    self.agent_range.update_after_action(action=action_int,
                                                         all_a_probs_for_all_hands=a_probs_each_hand)

                # _____________________________________________ step ___________________________________________________
                old_game_round = self._env.current_round

                env_obs, reward, terminal, info = self._env.step(action=action_int)

                if self._env.current_round != old_game_round:
                    self.agent_range.update_after_new_round(new_round=self._env.current_round,
                                                            board_now_2d=self._env.board)

            total_lbr_winnings[iteration_id] = reward[lbr_seat_id] * self._env.REWARD_SCALAR * self._env.EV_NORMALIZER

        return total_lbr_winnings

    def _run_no_limit(self, agent_seat_id, n_iterations):
        total_lbr_winnings = np.empty(shape=n_iterations, dtype=np.float32)
        lbr_seat_id = 1 - agent_seat_id
        n_lbr_bets = len(self._env.bet_sizes_list_as_frac_of_pot)

        for iteration_id in range(n_iterations):
            if iteration_id % 50 == 0:
                print("LBR hand: ", iteration_id)

            # """""""""""""""""
            # Reset
            # """""""""""""""""
            env_obs, reward, done, info = self._reset_episode()

            lbr_hand = self._env.get_hole_cards_of_player(p_id=lbr_seat_id)
            self.agent_range.set_cards_to_zero_prob(cards_2d=lbr_hand)

            # """""""""""""""""
            # Play Episode
            # """""""""""""""""
            while not done:
                p_id_acting = self._env.current_player.seat_id

                if self.t_prof.DEBUGGING:
                    assert p_id_acting == self.agent.cpu_agent._internal_env_wrapper.env.current_player.seat_id

                if p_id_acting == lbr_seat_id:
                    # optional feature: check the first N rounds 100% as LBR
                    if (self.check_to_round is not None) and (self._env.current_round < self.check_to_round):
                        action_int = Poker.CHECK_CALL

                    else:
                        _rollout_mngr = _LBRRolloutManager(t_prof=self.t_prof, env_bldr=self._eval_env_bldr,
                                                           env=self._env, lbr_hand_2d=lbr_hand)

                        # illegal: -1, fold: 0, all other: any float
                        _utility = np.full(shape=2 + n_lbr_bets, fill_value=-1.0, dtype=np.float32)

                        # ev(s, lbr_a=fold)
                        _utility[Poker.FOLD] = 0.0

                        # ev(s, lbr_a=check_call)
                        _wp = _rollout_mngr.get_lbr_checkdown_equity(agent_range=self.agent_range)
                        _asked = self._env.seats[agent_seat_id].current_bet - self._env.seats[lbr_seat_id].current_bet
                        _pot_before_action = self._env.get_all_winnable_money()
                        _utility[Poker.CHECK_CALL] = _wp * _pot_before_action - (1 - _wp) * _asked

                        # prepare for raise simulation
                        _saved_env_state = self._env.state_dict()
                        _saved_agent_env_state = self.agent.env_state_dict()
                        _saved_agent_range_state = self.agent_range.state_dict()

                        _legal_raises = self._env.get_legal_actions()
                        for a in [Poker.FOLD, Poker.CHECK_CALL]:
                            if a in _legal_raises:
                                _legal_raises.remove(a)

                        # compute ev for all raise sizes LBR can choose from
                        for r in _legal_raises:
                            raise_frac = self._env.bet_sizes_list_as_frac_of_pot[r - 2]

                            # _________________________________ simulate LBR play r ____________________________________
                            self._env.step(action=r)
                            _pot_after_raise = self._env.get_all_winnable_money()

                            self.agent.notify_of_raise_frac_action(p_id_acted=lbr_seat_id, frac=raise_frac)

                            if self.t_prof.DEBUGGING:
                                assert agent_seat_id == self.agent.cpu_agent._internal_env_wrapper.env.current_player.seat_id

                            # what the agent would do after LBR raises. DOESN'T STEP THE INTERNAL ENV!
                            a_probs_each_hand = self.agent.get_a_probs_for_each_hand()

                            # _______________________________ simulate agent reaction __________________________________
                            # p(agent_fold)
                            _fold_prob = np.sum(self.agent_range.range * a_probs_each_hand[:, Poker.FOLD])

                            # p(not agent_fold | hand)
                            _p_not_fold_per_hand = (1 - a_probs_each_hand[:, Poker.FOLD])

                            # agent_range after not folding
                            self.agent_range.mul_and_norm(_p_not_fold_per_hand)

                            # p(lbr_win | lbr play r -> agent play not fold)
                            _wp_now = _rollout_mngr.get_lbr_checkdown_equity(agent_range=self.agent_range)

                            # ev(state, lbr_a=r)
                            _chips_lbr_puts_in_pot = _pot_after_raise - _pot_before_action
                            _ev_if_fold = _pot_before_action
                            _ev_if_not_fold = (_wp_now * _pot_after_raise) - ((1 - _wp_now) * _chips_lbr_puts_in_pot)
                            _utility[r] = _fold_prob * _ev_if_fold + (1 - _fold_prob) * _ev_if_not_fold

                            # ________________________________________ reset ___________________________________________
                            self.agent_range.load_state_dict(_saved_agent_range_state)
                            self._env.load_state_dict(_saved_env_state)
                            self.agent.load_env_state_dict(_saved_agent_env_state)

                        # select action with highest approximated EV
                        action_int = np.argmax(_utility)

                    # ________________________________________ notify agent ____________________________________________
                    if action_int >= 2:
                        raise_frac = self._env.bet_sizes_list_as_frac_of_pot[action_int - 2]
                        self.agent.notify_of_raise_frac_action(p_id_acted=lbr_seat_id, frac=raise_frac)
                    else:
                        self.agent.notify_of_action(p_id_acted=lbr_seat_id, action_he_did=action_int)

                else:  # agent has to act
                    if self.t_prof.DEBUGGING:
                        assert p_id_acting == self.agent.cpu_agent._internal_env_wrapper.env.current_player.seat_id

                    action_int, a_probs_each_hand = self.agent.get_action(step_env=True, need_probs=True)
                    self.agent_range.update_after_action(action=action_int,
                                                         all_a_probs_for_all_hands=a_probs_each_hand)

                    if action_int >= 2:
                        # querying what the bet size is in the agent's env_args (these might differ from LBR's!)
                        raise_frac = \
                            self.agent.cpu_agent.env_bldr.env_args.bet_sizes_list_as_frac_of_pot[action_int - 2]

                # _____________________________________________ step ___________________________________________________
                old_game_round = self._env.current_round

                if action_int >= 2:  # step with fraction because agent and LBR have different raise sizes
                    env_obs, reward, done, info = self._env.step_raise_pot_frac(pot_frac=raise_frac)
                else:
                    env_obs, reward, done, info = self._env.step(action=action_int)

                if self._env.current_round != old_game_round:
                    self.agent_range.update_after_new_round(new_round=self._env.current_round,
                                                            board_now_2d=self._env.board)

            total_lbr_winnings[iteration_id] = reward[lbr_seat_id] * self._env.REWARD_SCALAR * self._env.EV_NORMALIZER

        return total_lbr_winnings
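

# ------------------------------------------------------------------------------------------------
# A minimal, self-contained numerical sketch (not part of LocalLBRWorker) of the EV formulas LBR
# uses above to pick its action. All numbers are hypothetical stand-ins: `wp` plays the role of the
# checkdown equity returned by _LBRRolloutManager.get_lbr_checkdown_equity(), and `fold_prob` the
# range-weighted probability that the agent folds to an LBR raise.
# ------------------------------------------------------------------------------------------------
def _example_lbr_action_evs():
    pot_before_action = 100.0  # chips currently winnable (pot before LBR acts)
    asked = 20.0               # chips LBR would still have to put in to call
    wp = 0.45                  # P(LBR wins | hand is checked/called down from here)

    # ev(s, lbr_a=fold): folding wins nothing beyond what is already invested
    ev_fold = 0.0

    # ev(s, lbr_a=check_call): win the pot with prob wp, otherwise lose the amount called
    ev_call = wp * pot_before_action - (1.0 - wp) * asked

    # ev(s, lbr_a=raise): the agent folds with prob fold_prob; otherwise the hand is assumed to be
    # checked down against the agent's post-fold range
    pot_after_raise = 180.0
    chips_lbr_puts_in_pot = pot_after_raise - pot_before_action
    fold_prob = 0.30
    wp_not_fold = 0.40  # checkdown equity vs. the agent's range after removing its folding hands
    ev_if_fold = pot_before_action
    ev_if_not_fold = wp_not_fold * pot_after_raise - (1.0 - wp_not_fold) * chips_lbr_puts_in_pot
    ev_raise = fold_prob * ev_if_fold + (1.0 - fold_prob) * ev_if_not_fold

    # LBR plays the action with the highest approximated EV, just like np.argmax(_utility) above
    return ev_fold, ev_call, ev_raise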
def test_save_load(self):
    # smoke test: a PokerRange state_dict can be round-tripped through load_state_dict without error
    env_bldr = get_leduc_env_bldr()
    range_ = PokerRange(env_bldr=env_bldr)
    range_.load_state_dict(range_.state_dict())
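
# A possible follow-up check (a sketch, not part of the original test): verify that a save/load
# round trip leaves the range distribution unchanged. It assumes the same imports as
# test_save_load above, plus numpy as np, and that PokerRange exposes its per-hand probabilities
# via the `.range` attribute used in LocalLBRWorker.
def test_save_load_roundtrip_preserves_range(self):
    env_bldr = get_leduc_env_bldr()
    range_ = PokerRange(env_bldr=env_bldr)
    before = np.copy(range_.range)
    range_.load_state_dict(range_.state_dict())
    np.testing.assert_allclose(range_.range, before)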