import numpy as np

# Imports assumed from PokerRL's package layout.
from PokerRL.game._.tree.PublicTree import PublicTree
from PokerRL.game.games import DiscretizedNLLeduc
from PokerRL.game.wrappers import HistoryEnvBuilder


def _get_nl_leduc_tree(env_args=None):
    if env_args is None:
        env_args = DiscretizedNLLeduc.ARGS_CLS(n_seats=2,
                                               starting_stack_sizes_list=[1000, 1000],
                                               bet_sizes_list_as_frac_of_pot=[1.0])
    env_bldr = HistoryEnvBuilder(env_cls=DiscretizedNLLeduc, env_args=env_args)

    _tree = PublicTree(
        env_bldr=env_bldr,
        stack_size=env_args.starting_stack_sizes_list,
        stop_at_street=None,
    )
    _tree.build_tree()

    # Fill every seat's strategy with uniform-random action probabilities.
    for p in range(env_bldr.N_SEATS):
        _tree.fill_uniform_random()
    _tree.compute_ev()
    _tree.export_to_file()

    print("Tree with stack size", _tree.stack_size, "has", _tree.n_nodes,
          "nodes out of which", _tree.n_nonterm, "are non-terminal.")
    print(np.mean(_tree.root.exploitability) * env_bldr.env_cls.EV_NORMALIZER)

    return _tree
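# Usage sketch for the helper above: build the tree with its default 1000-chip
# stacks and read back the root exploitability that fill_uniform_random() +
# compute_ev() produced. The printed values are per-seat best-response winnings
# against the uniform-random strategy.
if __name__ == '__main__':
    tree = _get_nl_leduc_tree()
    print(tree.root.exploitability)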
def __init__(self, t_prof, chief_handle, eval_agent_cls):
    super().__init__(t_prof=t_prof,
                     eval_env_bldr=rl_util.get_env_builder(t_prof=t_prof),
                     chief_handle=chief_handle,
                     eval_type="BR")
    self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
    assert self._env_bldr.N_SEATS == 2

    self._eval_agent = eval_agent_cls(t_prof=t_prof)

    self._game_trees = [
        PublicTree(env_bldr=self._env_bldr,
                   stack_size=stack_size,
                   stop_at_street=None,
                   put_out_new_round_after_limit=True,
                   is_debugging=self._t_prof.DEBUGGING)
        for stack_size in self._t_prof.eval_stack_sizes
    ]

    for gt in self._game_trees:
        gt.build_tree()
        print("Tree with stack size", gt.stack_size, "has", gt.n_nodes,
              "nodes out of which", gt.n_nonterm, "are non-terminal.")
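# Sketch (hypothetical; not this class's actual evaluation routine) of how the
# per-stack trees built above are typically consumed: fix the agent's policy in
# each tree, back up EVs, and read the per-seat best-response values at the
# root. fill_with_agent_policy(), compute_ev(), and root.exploitability are
# used exactly this way elsewhere in this codebase.
def _run_br_eval_sketch(game_trees, eval_agent, env_bldr):
    for gt in game_trees:
        gt.fill_with_agent_policy(agent=eval_agent)  # write the agent's strategy into every node
        gt.compute_ev()  # compute best-response EVs bottom-up
        # Mean over seats, scaled to the game's win metric:
        print(np.mean(gt.root.exploitability) * env_bldr.env_cls.EV_NORMALIZER)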
def _evaluate_avg_strats(self):
    expl_totals = []
    for t_idx in range(len(self._trees)):
        METRIC = self._env_bldrs[t_idx].env_cls.WIN_METRIC
        eval_tree = PublicTree(
            env_bldr=self._env_bldrs[t_idx],
            stack_size=self._env_args[t_idx].starting_stack_sizes_list,
            stop_at_street=None,
            is_debugging=False,
        )
        eval_tree.build_tree()

        def _fill(_node_eval, _node_train):
            if _node_eval.p_id_acting_next != eval_tree.CHANCE_ID and (not _node_eval.is_terminal):
                _node_eval.strategy = np.copy(_node_train.data["avg_strat"])
                assert np.allclose(np.sum(_node_eval.strategy, axis=1), 1, atol=0.0001)

            for c_eval, c_train in zip(_node_eval.children, _node_train.children):
                _fill(_node_eval=c_eval, _node_train=c_train)

        # Initializes node data structures; the strategy itself is overwritten below.
        eval_tree.fill_uniform_random()

        # Copy the average strategy from the training tree into the eval tree.
        _fill(_node_eval=eval_tree.root, _node_train=self._trees[t_idx].root)
        eval_tree.update_reach_probs()

        # Compute EVs.
        eval_tree.compute_ev()

        eval_tree.export_to_file(name=self._name + "_Avg_" + str(self._iter_counter))

        # Log.
        expl_p = [
            float(eval_tree.root.exploitability[p]) * self._env_bldrs[t_idx].env_cls.EV_NORMALIZER
            for p in range(eval_tree.n_seats)
        ]
        expl_total = sum(expl_p) / eval_tree.n_seats
        expl_totals.append(expl_total)

        self._chief_handle.add_scalar(self._exps_avg_total[t_idx],
                                      "Evaluation/" + METRIC,
                                      self._iter_counter,
                                      expl_total)

    expl_total_averaged = sum(expl_totals) / float(len(expl_totals))
    self._chief_handle.add_scalar(self._exp_all_averaged_avg_total,
                                  "Evaluation/" + METRIC,
                                  self._iter_counter,
                                  expl_total_averaged)
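# Toy illustration (made-up numbers) of the aggregation above: per-seat
# best-response values are averaged within a stack size, then across stack sizes.
_expl_p = [12.4, 9.8]  # normalized BR winnings vs. each seat, one stack size
_expl_total = sum(_expl_p) / len(_expl_p)  # 11.1 -> logged per stack size
_expl_totals = [_expl_total, 14.3]  # one entry per evaluated stack size
_expl_total_averaged = sum(_expl_totals) / len(_expl_totals)  # 12.7 -> logged once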
def __init__(self,
             name,
             chief_handle,
             game_cls,
             agent_bet_set,
             algo_name,
             starting_stack_sizes=None,
             ):
    """
    Args:
        name (str): Under this name all logs, data, and checkpoints will appear.

        chief_handle (ChiefBase): Reference to the Chief worker.

        game_cls (PokerEnv subclass): The game class (not an instance) to train in.

        agent_bet_set (iterable): Choosing a bet set from bet_sets.py is recommended. If solving a
            Limit poker game, this value is ignored but must still be passed; any list of floats
            (e.g. [0.0]) will do.

        starting_stack_sizes (list of ints): For each stack size in this list, a CFR strategy will
            be computed. Results are logged individually and averaged (uniformly weighted). If
            None, the game's default stack size is used.
    """
    self._name = name
    self._n_seats = 2

    self._chief_handle = chief_handle

    if starting_stack_sizes is None:
        self._starting_stack_sizes = [game_cls.DEFAULT_STACK_SIZE]
    else:
        self._starting_stack_sizes = copy.deepcopy(starting_stack_sizes)
    self._game_cls_str = game_cls.__name__

    self._env_args = [
        game_cls.ARGS_CLS(n_seats=self._n_seats,
                          starting_stack_sizes_list=[start_chips for _ in range(self._n_seats)],
                          bet_sizes_list_as_frac_of_pot=agent_bet_set,
                          )
        for start_chips in self._starting_stack_sizes
    ]
    self._env_bldrs = [
        HistoryEnvBuilder(env_cls=get_env_cls_from_str(self._game_cls_str),
                          env_args=self._env_args[s])
        for s in range(len(self._starting_stack_sizes))
    ]

    self._trees = [
        PublicTree(env_bldr=self._env_bldrs[idx],
                   stack_size=self._env_args[idx].starting_stack_sizes_list,
                   stop_at_street=None)
        for idx in range(len(self._env_bldrs))
    ]

    for tree in self._trees:
        tree.build_tree()
        print("Tree with stack size", tree.stack_size, "has", tree.n_nodes,
              "nodes out of which", tree.n_nonterm, "are non-terminal.")

    self._algo_name = algo_name

    self._exps_curr_total = [
        self._chief_handle.create_experiment(
            self._name + "_Curr_S" + str(self._starting_stack_sizes[s]) + "_total_" + self._algo_name)
        for s in range(len(self._starting_stack_sizes))
    ]

    self._exps_avg_total = [
        self._chief_handle.create_experiment(
            self._name + "_Avg_total_S" + str(self._starting_stack_sizes[s]) + "_" + self._algo_name)
        for s in range(len(self._starting_stack_sizes))
    ]

    self._exp_all_averaged_curr_total = self._chief_handle.create_experiment(
        self._name + "_Curr_total_averaged_" + self._algo_name)

    self._exp_all_averaged_avg_total = self._chief_handle.create_experiment(
        self._name + "_Avg_total_averaged_" + self._algo_name)

    self._iter_counter = None
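# Hypothetical construction example for the trainer this __init__ belongs to
# (called CFRTrainer here purely for illustration; the actual class name is not
# shown in this section). `chief` must be a ChiefBase instance from the
# surrounding worker infrastructure, and StandardLeduc is assumed importable
# from PokerRL.game.games.
def _example_trainer_setup(chief):
    from PokerRL.game.games import StandardLeduc

    return CFRTrainer(name="leduc_cfr_run",
                      chief_handle=chief,
                      game_cls=StandardLeduc,
                      agent_bet_set=[1.0],  # ignored for Limit games, but required
                      algo_name="CFR",
                      starting_stack_sizes=None)  # None -> the game's default stack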
class EvalAgentTree(_EvalAgentBase):

    def __init__(self, t_prof, br_agent, mode=None, device=None):
        super().__init__(t_prof=t_prof, mode=mode, device=device)
        self.tree = PublicTree(
            env_bldr=rl_util.get_env_builder(t_prof=t_prof),
            stack_size=t_prof.eval_stack_sizes[0],
            stop_at_street=None,
            put_out_new_round_after_limit=True,
            is_debugging=t_prof.DEBUGGING,
        )
        self.tree.build_tree()

        self.br_agent = br_agent  # agent to play best response against
        self.solve_br()

        self.modes = ["EVAL", "BR", "BAYESIAN"]
        self.mode = mode if mode else "EVAL"  # default to EVAL
        if self.mode == "BAYESIAN":
            self._fill_tree_w_prior()

    def _fill_tree_w_prior(self, prior=1):
        """ Initializes every node's pseudocount table with `prior`. """

        def fill(node):
            # Shape is hardcoded for Leduc: 6 possible private hands x 3 actions.
            node.data = np.full((6, 3), float(prior))
            for child in node.children:
                fill(child)

        fill(self.tree.root)

    def can_compute_mode(self):
        """ All modes are always computable (i.e. not dependent on iteration etc.). """
        return True

    def _find_node_by_env(self, action_history):
        """ Walks from the root to the tree node matching the given action history. """
        node = self.tree.root
        i = 0
        while i < len(action_history):
            if isinstance(node.children[0], PlayerActionNode):
                # Next node is a player action.
                action = action_history[i][0]
                assert node.p_id_acting_next == action_history[i][2]
                node = node.children[node.allowed_actions.index(action)]
                i += 1
            else:
                # Chance node: descend along the branch of the dealt board card.
                assert node.children[0].action == "CHANCE"
                card = self._internal_env_wrapper.env.board
                node = node.children[self._card_to_idx(card)]
                assert self._card_to_idx(node.env_state['board_2d']) == self._card_to_idx(card)

        if not isinstance(node.children[0], PlayerActionNode):
            # The action history ends right before a chance event; descend one more level.
            assert node.children[0].action == "CHANCE"
            card = self._internal_env_wrapper.env.board
            node = node.children[self._card_to_idx(card)]
            assert self._card_to_idx(node.env_state['board_2d']) == self._card_to_idx(card)
        return node

    def _card_to_idx(self, card):
        # Maps a board card given as [[rank, suit]] to a child index (2 suits per rank).
        return card[0][0] * 2 + card[0][1]

    def solve_br(self):
        self.tree.fill_with_agent_policy(agent=self.br_agent)
        self.tree.compute_ev()

    def get_action(self, step_env=True, need_probs=False):
        """ !! BEFORE CALLING, NOTIFY THE EVALAGENT OF THE PAST ACTIONS / ACTION SEQUENCE !! """
        node = self._find_node_by_env(self._internal_env_wrapper._action_history_list)
        p_id_acting = self._internal_env_wrapper.env.current_player.seat_id
        range_idx = self._internal_env_wrapper.env.get_range_idx(p_id=p_id_acting)
        a_probs_all_hands = None

        if self.mode == "BR":
            # Deterministically pick the child with the highest best-response EV.
            action = None
            best_ev = -1e10
            for idx, potential_action in enumerate(node.allowed_actions):
                if node.children[idx].ev[p_id_acting, range_idx] > best_ev:
                    action = potential_action
                    best_ev = node.children[idx].ev[p_id_acting, range_idx]
        elif self.mode == "EVAL":
            # Sample from the strategy stored in the tree.
            a_probs = node.strategy[range_idx, :]
            action = np.random.choice(node.allowed_actions, p=a_probs)
        elif self.mode == "BAYESIAN":
            # Sample proportionally to the accumulated pseudocounts.
            pseudocounts = node.data[range_idx, node.allowed_actions]
            a_probs = pseudocounts / sum(pseudocounts)
            action = np.random.choice(node.allowed_actions, p=a_probs)

        if step_env:
            self._internal_env_wrapper.step(action=action)
        assert a_probs_all_hands is None
        return action, a_probs_all_hands

    def get_a_probs_for_each_hand(self):
        """ !! BEFORE CALLING, NOTIFY THE EVALAGENT OF THE PAST ACTIONS / ACTION SEQUENCE !! """
        node = self._find_node_by_env(self._internal_env_wrapper._action_history_list)
        if self.mode == "BAYESIAN":
            # Copy so we don't zero out the pseudocounts stored in the tree.
            x = np.copy(node.data)
            mask = np.ones(x.shape, dtype=bool)
            mask[:, node.allowed_actions] = False
            x[mask] = 0
            return x / x.sum(axis=1)[:, np.newaxis]
        # Other modes are not implemented here.

    def get_mode(self):
        if self.mode == "BR":
            return "BESTRESPONSE"
        elif self.mode == "EVAL":
            return "COPYCAT"
        elif self.mode == "BAYESIAN":
            return "BAYESIAN"