def test_cpp_and_python_implementations_are_identical(self, game_name):
  game = pyspiel.load_game(game_name)

  python_policy = policy.UniformRandomPolicy(game)
  pyspiel_policy = pyspiel.UniformRandomPolicy(game)

  all_states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.information_state_string())

  for current_player in range(game.num_players()):
    python_br = best_response.BestResponsePolicy(game, current_player,
                                                 python_policy)
    cpp_br = pyspiel.TabularBestResponse(
        game, current_player, pyspiel_policy).get_best_response_policy()

    for state in all_states.values():
      if state.current_player() != current_player:
        continue

      # TODO(b/141737795): Decide what to do about this.
      self.assertEqual(
          python_br.action_probabilities(state), {
              a: prob
              for a, prob in cpp_br.action_probabilities(state).items()
              if prob != 0
          })
def __init__(self,
             best_response_backend='cpp',
             game=None,
             all_states=None,
             state_to_information_state=None,
             **kwargs):
  """Init function for the BestResponseOracle.

  Args:
    best_response_backend: A string (either 'cpp' or 'py'), specifying the
      best response backend to use (C++ or python, respectively). The cpp
      backend should generally be preferred, as it is significantly faster.
    game: The game on which the optimization process takes place.
    all_states: The result of calling get_all_states.get_all_states. Cached
      for improved performance.
    state_to_information_state: A dict mapping str(state) to
      state.information_state for every state in the game. Cached for
      improved performance.
    **kwargs: kwargs passed on to the parent Oracle class.
  """
  super(BestResponseOracle, self).__init__(**kwargs)
  self.best_response_backend = best_response_backend
  if self.best_response_backend == 'cpp':
    # Compute all_states and state_to_information_state only once in the
    # program, as caching them speeds up TabularBestResponse tremendously.
    self.all_states, self.state_to_information_state = (
        utils.compute_states_and_info_states_if_none(
            game, all_states, state_to_information_state))

    policy = openspiel_policy.UniformRandomPolicy(game)

    policy_to_dict = policy_utils.policy_to_dict(
        policy, game, self.all_states, self.state_to_information_state)

    # pylint: disable=g-complex-comprehension
    # Cache a TabularBestResponse per player, due to their costly
    # construction.
    # TODO(b/140426861): Use a single best-responder once the code supports
    # multiple player ids.
    self.best_response_processors = [
        pyspiel.TabularBestResponse(game, best_responder_id, policy_to_dict)
        for best_responder_id in range(game.num_players())
    ]
    self.best_responders = [
        best_response.CPPBestResponsePolicy(
            game, i_player, policy, self.all_states,
            self.state_to_information_state,
            self.best_response_processors[i_player])
        for i_player in range(game.num_players())
    ]
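# A minimal usage sketch for the oracle above, not part of the original
# module. It assumes the OpenSpiel psro_v2 layout; "kuhn_poker" is only an
# illustrative game choice, and `_example_build_best_response_oracle` is a
# hypothetical helper.
def _example_build_best_response_oracle():
  game = pyspiel.load_game("kuhn_poker")  # Illustrative game (assumption).
  # With the 'cpp' backend, all states and one TabularBestResponse per
  # player are computed and cached at construction time, per __init__ above.
  return BestResponseOracle(best_response_backend='cpp', game=game)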
def __init__(self,
             game,
             best_responder_id,
             policy,
             all_states=None,
             state_to_information_state=None,
             best_response_processor=None,
             cut_threshold=0.0):
  """Constructor.

  Args:
    game: The game to analyze.
    best_responder_id: The player id of the best-responder.
    policy: A policy.Policy object representing the joint policy, taking a
      state and returning a list of (action, probability) pairs. This could
      be aggr_policy, for instance.
    all_states: The result of calling get_all_states.get_all_states. Cached
      for improved performance.
    state_to_information_state: A dict mapping state.history_str to
      state.information_state for every state in the game. Cached for
      improved performance.
    best_response_processor: A TabularBestResponse object, used for
      processing the best response actions.
    cut_threshold: The probability to cut when calculating the value.
      Increasing this value will trade off accuracy for speed.
  """
  (self.all_states, self.state_to_information_state) = (
      compute_states_and_info_states_if_none(game, all_states,
                                             state_to_information_state))

  policy_to_dict = policy_utils.policy_to_dict(
      policy, game, self.all_states, self.state_to_information_state)

  # pylint: disable=g-complex-comprehension
  # Cache TabularBestResponse, due to its costly construction.
  # TODO(b/140426861): Use a single best-responder once the code supports
  # multiple player ids.
  if not best_response_processor:
    best_response_processor = pyspiel.TabularBestResponse(
        game, best_responder_id, policy_to_dict)

  self._policy = policy
  self.game = game
  self.best_responder_id = best_responder_id
  self.tabular_best_response_map = (
      best_response_processor.get_best_response_actions())
  self._cut_threshold = cut_threshold
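# Hypothetical usage sketch, not from the original module: computing a best
# response to a uniform random joint policy. It assumes this constructor
# belongs to CPPBestResponsePolicy (as referenced by the oracle above) and
# that `openspiel_policy` aliases open_spiel.python.policy.
def _example_cpp_best_response(game):
  joint_policy = openspiel_policy.UniformRandomPolicy(game)
  # Player 0 best-responds to the joint policy; the cached
  # tabular_best_response_map then maps infostate strings to BR actions.
  br = CPPBestResponsePolicy(game, best_responder_id=0, policy=joint_policy)
  return br.tabular_best_response_map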
def __call__(self, player, player_policy, info_states):
  """Computes action values per state for the player.

  Args:
    player: The id of the player (0 <= player < game.num_players()).
    player_policy: A `policy.Policy` object.
    info_states: A list of info state strings.

  Returns:
    A `_CalculatorReturn` namedtuple. See its docstring for the
    documentation.
  """
  self.player = player
  opponent = 1 - player

  def best_response_policy(state):
    infostate = state.information_state_string(opponent)
    action = best_response_actions[infostate]
    return [(action, 1.0)]

  # If the policy is a TabularPolicy, we can directly copy the infostate
  # strings & values from the class. This is significantly faster than
  # having to create the infostate strings.
  if isinstance(player_policy, policy.TabularPolicy):
    tabular_policy = {
        key: _tuples_from_policy(player_policy.policy_for_key(key))
        for key in player_policy.state_lookup
    }
  # Otherwise, we have to calculate all the infostate strings every time.
  # This is ~2x slower.
  else:
    # We cache these as they are expensive to compute & do not change.
    if self._all_states is None:
      self._all_states = get_all_states.get_all_states(
          self.game,
          depth_limit=-1,
          include_terminals=False,
          include_chance_states=False)
      self._state_to_information_state = {
          state: self._all_states[state].information_state_string()
          for state in self._all_states
      }
    tabular_policy = policy_utils.policy_to_dict(
        player_policy, self.game, self._all_states,
        self._state_to_information_state)

  # When constructed, TabularBestResponse does a lot of work; we can save
  # that work by caching it.
  if self._best_responder[player] is None:
    self._best_responder[player] = pyspiel.TabularBestResponse(
        self.game, opponent, tabular_policy)
  else:
    self._best_responder[player].set_policy(tabular_policy)

  # Computing the value at the root calculates best responses everywhere.
  history = str(self.game.new_initial_state())
  best_response_value = self._best_responder[player].value(history)
  best_response_actions = self._best_responder[
      player].get_best_response_actions()

  # Compute action values.
  self.action_values = collections.defaultdict(
      lambda: collections.defaultdict(lambda: np.zeros(2)))
  self.info_state_prob = collections.defaultdict(float)
  self.info_state_player_prob = collections.defaultdict(float)
  self.info_state_cf_prob = collections.defaultdict(float)
  self.info_state_chance_prob = collections.defaultdict(float)
  self.get_action_values(
      self.game.new_initial_state(), {
          player: player_policy,
          opponent: policy.PolicyFromCallable(self.game,
                                              best_response_policy),
      })

  # Collect normalized action values for each information state.
  rv = []
  cfrp = []
  player_reach_probs_vs_br = []
  for info_state in info_states:
    key = (player, info_state)
    av = self.action_values[key]
    norm_prob = self.info_state_prob[key]
    rv.append([(av[a][player] / norm_prob) if
               (a in av and norm_prob > 0) else 0
               for a in range(self.num_actions)])
    cfrp.append(self.info_state_cf_prob[key])
    player_reach_probs_vs_br.append(self.info_state_player_prob[key])

  # Return values.
  return _CalculatorReturn(
      exploitability=best_response_value,
      values_vs_br=rv,
      counterfactual_reach_probs_vs_br=cfrp,
      player_reach_probs_vs_br=player_reach_probs_vs_br)
def __call__(self, player, player_policy, info_states):
  """Computes action values per state for the player.

  Args:
    player: The id of the player (0 <= player < game.num_players()). This
      player will play `player_policy`, while the opponent will play a best
      response.
    player_policy: A `policy.Policy` object.
    info_states: A list of info state strings.

  Returns:
    A `_CalculatorReturn` namedtuple. See its docstring for the
    documentation.
  """
  self.player = player
  opponent = 1 - player

  def best_response_policy(state):
    infostate = state.information_state_string(opponent)
    action = best_response_actions[infostate]
    return [(action, 1.0)]

  # If the policy is a TabularPolicy, we can directly copy the infostate
  # strings & values from the class. This is significantly faster than
  # having to create the infostate strings.
  if isinstance(player_policy, policy.TabularPolicy):
    tabular_policy = {
        key: _tuples_from_policy(player_policy.policy_for_key(key))
        for key in player_policy.state_lookup
    }
  # Otherwise, we have to calculate all the infostate strings every time.
  # This is ~2x slower.
  else:
    # We cache these as they are expensive to compute & do not change.
    if self._all_states is None:
      self._all_states = get_all_states.get_all_states(
          self.game,
          depth_limit=-1,
          include_terminals=False,
          include_chance_states=False)
      self._state_to_information_state = {
          state: self._all_states[state].information_state_string()
          for state in self._all_states
      }
    tabular_policy = policy_utils.policy_to_dict(
        player_policy, self.game, self._all_states,
        self._state_to_information_state)

  # When constructed, TabularBestResponse does a lot of work; we can save
  # that work by caching it.
  if self._best_responder[player] is None:
    self._best_responder[player] = pyspiel.TabularBestResponse(
        self.game, opponent, tabular_policy)
  else:
    self._best_responder[player].set_policy(tabular_policy)

  # Computing the value at the root calculates best responses everywhere.
  history = str(self.game.new_initial_state())
  best_response_value = self._best_responder[player].value(history)
  best_response_actions = self._best_responder[
      player].get_best_response_actions()

  # Compute action values.
  self._action_value_calculator.compute_all_states_action_values({
      player: player_policy,
      opponent: policy.PolicyFromCallable(self.game, best_response_policy),
  })
  obj = self._action_value_calculator._get_tabular_statistics(  # pylint: disable=protected-access
      ((player, s) for s in info_states))

  # Return values.
  return _CalculatorReturn(
      exploitability=best_response_value,
      values_vs_br=obj.action_values,
      counterfactual_reach_probs_vs_br=obj.counterfactual_reach_probs,
      player_reach_probs_vs_br=obj.player_reach_probs)
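# Hedged end-to-end sketch of calling the method above; not part of the
# original code. It assumes the surrounding class is the Calculator from
# action_value_vs_best_response.py (constructed with just the game) and
# that Kuhn poker exposes infostate strings "0", "1", "2" for player 0's
# first decision; both are assumptions for illustration.
def _example_values_vs_best_response():
  game = pyspiel.load_game("kuhn_poker")
  calculator = Calculator(game)
  player_policy = policy.TabularPolicy(game)  # Uniform random by default.
  returns = calculator(
      player=0, player_policy=player_policy, info_states=["0", "1", "2"])
  # returns.exploitability is the opponent's best-response value at the
  # root; values_vs_br holds normalized per-action values per infostate.
  return returns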