def get_action(self, step_env=True, need_probs=False):
    """ !! BEFORE CALLING, NOTIFY EVALAGENT OF THE PAST ACTIONS / ACTION SEQUENCE !! """
    p_id_acting = self._internal_env_wrapper.env.current_player.seat_id
    range_idx = self._internal_env_wrapper.env.get_range_idx(p_id=p_id_acting)

    if self._mode == self.EVAL_MODE_AVG:
        if need_probs:  # only compute the full per-hand distribution if really necessary
            a_probs_all_hands = self.get_a_probs_for_each_hand()
            a_probs = a_probs_all_hands[range_idx]
        else:
            a_probs_all_hands = None  # not needed
            a_probs = self.policies[p_id_acting].get_a_probs(
                pub_obses=[self._internal_env_wrapper.get_current_obs(use_canonical=self.use_canonical,
                                                                      p_id=p_id_acting)],
                range_idxs=np.array([range_idx], dtype=np.int32),
                legal_actions_lists=[self._internal_env_wrapper.env.get_legal_actions()]
            )[0]

        # Sample an action from the (legal-action-masked) probability vector.
        action = np.random.choice(np.arange(self.env_bldr.N_ACTIONS), p=a_probs)

        if step_env:
            # Make an INTERNAL step to keep up with the game state.
            self._internal_env_wrapper.step(action=action)

        return action, a_probs_all_hands

    else:
        raise UnknownModeError(self._mode)
def get_action(self, step_env=True, need_probs=False):
    """ !! BEFORE CALLING, NOTIFY EVALAGENT OF THE PAST ACTIONS / ACTION SEQUENCE !! """
    p_id_acting = self._internal_env_wrapper.env.current_player.seat_id
    range_idx = self._internal_env_wrapper.env.get_range_idx(p_id=p_id_acting)

    # """"""""""""""""""""""""""""
    # Deep CFR
    # """"""""""""""""""""""""""""
    if self._mode == self.EVAL_MODE_AVRG_NET:
        if need_probs:  # only compute the full per-hand distribution if really necessary
            a_probs_all_hands = self.get_a_probs_for_each_hand()
            a_probs = a_probs_all_hands[range_idx]
        else:
            a_probs_all_hands = None  # not needed
            a_probs = self.avrg_net_policies[p_id_acting].get_a_probs(
                pub_obses=[self._internal_env_wrapper.get_current_obs()],
                range_idxs=np.array([range_idx], dtype=np.int32),
                legal_actions_lists=[self._internal_env_wrapper.env.get_legal_actions()]
            )[0]

        action = np.random.choice(np.arange(self.env_bldr.N_ACTIONS), p=a_probs)

        if step_env:
            self._internal_env_wrapper.step(action=action)

        return action, a_probs_all_hands

    # """"""""""""""""""""""""""""
    # SD-CFR
    # """"""""""""""""""""""""""""
    elif self._mode == self.EVAL_MODE_SINGLE:
        if need_probs:
            a_probs_all_hands = self.get_a_probs_for_each_hand()
        else:
            a_probs_all_hands = None  # not needed

        legal_actions_list = self._internal_env_wrapper.env.get_legal_actions()

        if self._episode_net_idxs[p_id_acting] is None:
            # Iteration 0: no strategy network was sampled for this episode -> uniform over legal actions.
            action = legal_actions_list[np.random.randint(len(legal_actions_list))]
        else:
            # Iteration > 0: query the strategy network sampled for this episode.
            action = self._strategy_buffers[p_id_acting].get(self._episode_net_idxs[p_id_acting]).get_action(
                pub_obses=[self._internal_env_wrapper.get_current_obs()],
                range_idxs=[range_idx],
                legal_actions_lists=[legal_actions_list],
            )[0].item()

        if step_env:
            # Add to the action history BEFORE modifying the env state.
            self._add_history_entry(p_id_acting=p_id_acting, action_hes_gonna_do=action)

            # Make an INTERNAL step to keep up with the game state.
            self._internal_env_wrapper.step(action=action)

        return action, a_probs_all_hands

    else:
        raise UnknownModeError(self._mode)
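# --------------------------------------------------------------------------
# Illustrative sketch, NOT part of the original class: one way a caller could
# honor the "notify before calling" contract from the docstrings above when
# two EvalAgents play against each other. The method name `notify_of_action`
# and its keyword arguments are assumptions made for this example only.
# --------------------------------------------------------------------------
def _example_head_to_head_step(env_wrapper, agents):
    """ Plays a single action in a hypothetical two-agent match. """
    p_id_acting = env_wrapper.env.current_player.seat_id

    # The acting agent advances its own internal env inside get_action(...).
    action, _ = agents[p_id_acting].get_action(step_env=True, need_probs=False)

    # Every OTHER agent must be told what happened so that its internal env
    # wrapper stays in sync with the shared game state (assumed method name).
    for p_id, agent in enumerate(agents):
        if p_id != p_id_acting:
            agent.notify_of_action(p_id_acted=p_id_acting, action_he_did=action)

    # Finally, apply the chosen action to the shared environment.
    return env_wrapper.step(action=action)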
def get_a_probs_for_each_hand(self):
    """ !! BEFORE CALLING, NOTIFY EVALAGENT OF THE PAST ACTIONS / ACTION SEQUENCE !! """
    p_id_acting = self._internal_env_wrapper.env.current_player.seat_id

    if self._mode == self.EVAL_MODE_AVG:
        return self.policies[p_id_acting].get_a_probs_for_each_hand(
            pub_obs=self._internal_env_wrapper.get_current_obs(),
            legal_actions_list=self._internal_env_wrapper.env.get_legal_actions())
    else:
        raise UnknownModeError(self._mode)
def get_a_probs(self):
    pub_obs = self._internal_env_wrapper.get_current_obs()
    legal_actions_list = self._internal_env_wrapper.env.get_legal_actions()
    p_id_acting = self._internal_env_wrapper.env.current_player.seat_id
    range_idx = self._internal_env_wrapper.env.get_range_idx(p_id=p_id_acting)

    # """"""""""""""""""""""""""""
    # Deep CFR
    # """"""""""""""""""""""""""""
    if self._mode == self.EVAL_MODE_AVRG_NET:
        return self.avrg_net_policies[p_id_acting].get_a_probs(
            pub_obses=[pub_obs],
            range_idxs=np.array([range_idx], dtype=np.int32),
            legal_actions_lists=[legal_actions_list]
        )[0]

    # """"""""""""""""""""""""""""
    # SD-CFR
    # """"""""""""""""""""""""""""
    elif self._mode == self.EVAL_MODE_SINGLE:
        if self._strategy_buffers[p_id_acting].size == 0:
            unif_rand_legal = np.full(
                shape=self.env_bldr.N_ACTIONS,
                fill_value=1.0 / len(legal_actions_list)
            ) * rl_util.get_legal_action_mask_np(n_actions=self.env_bldr.N_ACTIONS,
                                                 legal_actions_list=legal_actions_list,
                                                 dtype=np.float32)
            return unif_rand_legal

        else:
            # """"""""""""""""""""""
            # Weighted by Iteration
            # """"""""""""""""""""""
            # Dim: [model_idx, action_p]
            a_probs_each_model = np.array([
                weight * strat.get_a_probs(pub_obses=[pub_obs],
                                           range_idxs=[range_idx],
                                           legal_actions_lists=[legal_actions_list])[0]
                for strat, weight in self._strategy_buffers[p_id_acting].get_strats_and_weights()
            ])

            # """"""""""""""""""""""
            # Weighted by Reach
            # """"""""""""""""""""""
            a_probs_each_model *= np.expand_dims(
                self._get_reach_for_each_model(p_id_acting=p_id_acting, range_idx=range_idx),
                axis=1)

            # """"""""""""""""""""""
            # Normalize
            # """"""""""""""""""""""
            # Dim: [action_p]
            a_probs = np.sum(a_probs_each_model, axis=0)

            # Dim: []
            a_probs_sum = np.sum(a_probs)

            # Dim: [action_p]
            return a_probs / a_probs_sum

    else:
        raise UnknownModeError(self._mode)
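# --------------------------------------------------------------------------
# Illustrative sketch, standalone and with made-up numbers: the averaging the
# SD-CFR branch above performs. Each stored iteration strategy is weighted by
# its iteration weight and by the reach probability of the current infoset
# under that strategy; the weighted sum is then renormalized. Only numpy is
# assumed; all names and values here are dummies for illustration.
# --------------------------------------------------------------------------
def _example_sdcfr_weighted_average():
    import numpy as np

    # Dummy per-iteration action distributions at one infoset.
    # Dim: [model_idx, action_p]
    a_probs_each_model = np.array([[0.7, 0.3, 0.0],
                                   [0.2, 0.5, 0.3],
                                   [0.1, 0.1, 0.8]])

    iter_weights = np.array([1.0, 2.0, 3.0])  # e.g. linear weighting by iteration
    reaches = np.array([0.9, 0.4, 0.05])      # reach of this infoset under each strategy

    # Weight each model's distribution, sum over models, then normalize.
    weighted = a_probs_each_model * np.expand_dims(iter_weights * reaches, axis=1)
    a_probs = np.sum(weighted, axis=0)
    return a_probs / np.sum(a_probs)          # Dim: [action_p]; sums to 1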
def get_a_probs_for_each_hand(self):
    """ !! BEFORE CALLING, NOTIFY EVALAGENT OF THE PAST ACTIONS / ACTION SEQUENCE !! """
    pub_obs = self._internal_env_wrapper.get_current_obs()
    legal_actions_list = self._internal_env_wrapper.env.get_legal_actions()
    p_id_acting = self._internal_env_wrapper.env.current_player.seat_id

    # """"""""""""""""""""""""""""
    # Deep CFR
    # """"""""""""""""""""""""""""
    if self._mode == self.EVAL_MODE_AVRG_NET:
        return self.avrg_net_policies[p_id_acting].get_a_probs_for_each_hand(
            pub_obs=pub_obs, legal_actions_list=legal_actions_list)

    # """"""""""""""""""""""""""""
    # SD-CFR
    # """"""""""""""""""""""""""""
    elif self._mode == self.EVAL_MODE_SINGLE:
        # Uniform distribution over the legal actions; used as a fallback.
        unif_rand_legal = np.full(
            shape=self.env_bldr.N_ACTIONS,
            fill_value=1.0 / len(legal_actions_list)
        ) * rl_util.get_legal_action_mask_np(n_actions=self.env_bldr.N_ACTIONS,
                                             legal_actions_list=legal_actions_list,
                                             dtype=np.float32)

        n_models = self._strategy_buffers[p_id_acting].size
        if n_models == 0:
            # No strategy networks stored yet -> uniform with every hand.
            return np.repeat(np.expand_dims(unif_rand_legal, axis=0),
                             repeats=self.env_bldr.rules.RANGE_SIZE, axis=0)
        else:
            # Dim: [model_idx, range_idx]
            reaches = self._get_reach_for_each_model_each_hand(p_id_acting=p_id_acting)

            # """"""""""""""""""""""
            # Compute the strategy
            # only for infosets with
            # reach > 0. All others
            # stay uniform.
            # """"""""""""""""""""""
            contrib_each_model = np.zeros(
                shape=(n_models, self.env_bldr.rules.RANGE_SIZE, self.env_bldr.N_ACTIONS),
                dtype=NP_FLOAT_TYPE)

            for m_i, (strat, weight) in enumerate(self._strategy_buffers[p_id_acting].get_strats_and_weights()):
                range_idxs = np.nonzero(reaches[m_i])[0]
                if range_idxs.shape[0] > 0:
                    a_probs_m = strat.get_a_probs_for_each_hand_in_list(
                        pub_obs=pub_obs,
                        range_idxs=range_idxs,
                        legal_actions_list=legal_actions_list)
                    contrib_each_model[m_i, range_idxs] = a_probs_m * weight

            # Dim: [range_idx, action_p]
            a_probs = np.sum(contrib_each_model * np.expand_dims(reaches, axis=2), axis=0).astype(NP_FLOAT_TYPE)

            # Dim: [range_idx, 1]
            a_probs_sum = np.expand_dims(np.sum(a_probs, axis=1), axis=1)

            # Hands that no stored strategy reaches fall back to the uniform distribution.
            # Dim: [range_idx, action_p]
            with np.errstate(divide='ignore', invalid='ignore'):
                return np.where(a_probs_sum == 0,
                                np.repeat(np.expand_dims(unif_rand_legal, axis=0),
                                          repeats=self._internal_env_wrapper.env.RANGE_SIZE, axis=0),
                                a_probs / a_probs_sum)

    else:
        raise UnknownModeError(self._mode)
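# --------------------------------------------------------------------------
# Illustrative sketch, standalone and with made-up numbers: the zero-reach
# fallback at the end of the SD-CFR branch above. Hands whose reach-weighted
# probability mass sums to zero get the uniform distribution over legal
# actions; all other hands are normalized as usual. Only numpy is assumed.
# --------------------------------------------------------------------------
def _example_zero_reach_fallback():
    import numpy as np

    n_actions = 3
    legal_actions_list = [0, 2]  # pretend action 1 is illegal here

    # Uniform distribution over legal actions only.
    mask = np.zeros(n_actions, dtype=np.float32)
    mask[legal_actions_list] = 1.0
    unif_rand_legal = mask / len(legal_actions_list)

    # Unnormalized, reach-weighted action mass per hand. Dim: [range_idx, action_p]
    a_probs = np.array([[0.6, 0.0, 0.2],   # reachable hand   -> normalize
                        [0.0, 0.0, 0.0]])  # unreachable hand -> uniform fallback
    a_probs_sum = np.sum(a_probs, axis=1, keepdims=True)

    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(a_probs_sum == 0,
                        np.expand_dims(unif_rand_legal, axis=0),
                        a_probs / a_probs_sum)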