def test_cpp_and_python_implementations_are_identical(self, game_name):
  """Checks the Python best response agrees with the C++ tabular one."""
  game = pyspiel.load_game(game_name)
  python_policy = policy.UniformRandomPolicy(game)
  pyspiel_policy = pyspiel.UniformRandomPolicy(game)

  all_states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.information_state_string())

  for current_player in range(game.num_players()):
    python_br = best_response.BestResponsePolicy(game, current_player,
                                                 python_policy)
    cpp_br = pyspiel.TabularBestResponse(
        game, current_player, pyspiel_policy).get_best_response_policy()

    for state in all_states.values():
      if state.current_player() != current_player:
        continue
      # TODO(b/141737795): Decide what to do about this.
      # Drop zero-probability actions from the C++ result so both policies
      # are compared over the same support.
      cpp_probs = {
          action: prob
          for action, prob in cpp_br.action_probabilities(state).items()
          if prob != 0
      }
      self.assertEqual(python_br.action_probabilities(state), cpp_probs)
def best_response(game, policy, player_id):
  """Returns information about the specified player's best response.

  Given a game and one policy per player, computes the best unilateral
  strategy for a single player: the value improvement that player would
  obtain, the action to take in each information state, and the value of
  each state under that unilateral deviation.

  Args:
    game: An open_spiel game, e.g. kuhn_poker
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.
    player_id: The integer id of a player in the game for whom the best
      response will be computed.

  Returns:
    A dictionary of values, with keys:
      best_response_action: The best unilateral strategy for `player_id` as a
        map from infostatekey to action_id.
      best_response_state_value: The value obtained for `player_id` when
        unilaterally switching strategy, for each state.
      best_response_value: The value obtained for `player_id` when
        unilaterally switching strategy.
      info_sets: A dict of info sets, mapping info state key to a list of
        `(state, counterfactual_reach_prob)` pairs.
      nash_conv: `best_response_value - on_policy_value`
      on_policy_value: The value for `player_id` when all players follow the
        policy
      on_policy_values: The value for each player when all players follow the
        policy
  """
  root_state = game.new_initial_state()
  br = pyspiel_best_response.BestResponsePolicy(game, player_id, policy,
                                                root_state)
  on_policy_values = _state_values(root_state, game.num_players(), policy)
  best_response_value = br.value(root_state)

  # The value computation only visits reachable states; fill in best-response
  # actions for any remaining information sets so the returned map is total.
  for infostate in set(br.infosets) - set(br.cache_best_response_action):
    br.best_response_action(infostate)

  return {
      "best_response_action": br.cache_best_response_action,
      "best_response_state_value": br.cache_value,
      "best_response_value": best_response_value,
      "info_sets": br.infosets,
      "nash_conv": best_response_value - on_policy_values[player_id],
      "on_policy_value": on_policy_values[player_id],
      "on_policy_values": on_policy_values,
  }
def test_cpp_and_python_value_are_identical(self, game_name, num_players):
  """Root best-response values from the Python and C++ backends match."""
  game = pyspiel.load_game(game_name, {"players": num_players})
  test_policy = policy.TabularPolicy(game)
  root_state = game.new_initial_state()
  for i_player in range(num_players):
    py_backend = best_response.BestResponsePolicy(game, i_player, test_policy)
    cpp_backend = best_response.CPPBestResponsePolicy(game, i_player,
                                                      test_policy)
    self.assertTrue(
        np.allclose(py_backend.value(root_state),
                    cpp_backend.value(root_state)))
def test_best_response_is_a_policy(self):
  """Best response on Kuhn poker produces the expected per-infostate actions."""
  game = pyspiel.load_game("kuhn_poker")
  test_policy = policy.UniformRandomPolicy(game)
  br = best_response.BestResponsePolicy(game, policy=test_policy, player_id=0)
  expected_policy = {
      "0": 1,  # Bet in case opponent folds when winning
      "1": 1,  # Bet in case opponent folds when winning
      "2": 0,  # Both equally good (we return the lowest action)
      # Some of these will never happen under the best-response policy,
      # but we have computed best-response actions anyway.
      "0pb": 0,  # Fold - we're losing
      "1pb": 1,  # Call - we're 50-50
      "2pb": 1,  # Call - we've won
  }
  actual_policy = {
      infostate: br.best_response_action(infostate)
      for infostate in expected_policy
  }
  self.assertEqual(expected_policy, actual_policy)
def test_best_response_tic_tac_toe_value_is_consistent(self):
  """BR value at the root equals the evaluated value of playing BR vs pi."""
  # This test was failing because of use of str(state) in the best response,
  # which is imperfect recall. We now use state.history_str() throughout.
  # Chose a policy at random; not the uniform random policy.
  game = pyspiel.load_game("tic_tac_toe")
  pi = policy.TabularPolicy(game)
  rng = np.random.RandomState(1234)
  # Fill with random values, zero out illegal actions, then renormalize each
  # row so every information state holds a valid distribution.
  pi.action_probability_array[:] = rng.rand(*pi.legal_actions_mask.shape)
  pi.action_probability_array *= pi.legal_actions_mask
  pi.action_probability_array /= np.sum(
      pi.action_probability_array, axis=1, keepdims=True)

  # Compute a best response and verify the best response value is consistent.
  br = best_response.BestResponsePolicy(game, 1, pi)
  evaluated = expected_game_score.policy_value(game.new_initial_state(),
                                               [pi, br])[1]
  self.assertAlmostEqual(evaluated, br.value(game.new_initial_state()))
def nash_conv(game, policy, return_only_nash_conv=True, use_cpp_br=False):
  r"""Returns a measure of closeness to Nash for a policy in the game.

  See https://arxiv.org/pdf/1711.00832.pdf for the NashConv definition.

  Args:
    game: An open_spiel game, e.g. kuhn_poker
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.
    return_only_nash_conv: Whether to only return the NashConv value, or a
      namedtuple containing additional statistics. Prefer using `False`, as
      we hope to change the default to that value.
    use_cpp_br: if True, compute the best response in c++

  Returns:
    Returns a object with the following attributes:
      - player_improvements: A `[num_players]` numpy array of the improvement
        for players (i.e. value_player_p_versus_BR - value_player_p).
      - nash_conv: The sum over all players of the improvements in value that
        each player could obtain by unilaterally changing their strategy,
        i.e. sum(player_improvements).
  """
  root_state = game.new_initial_state()
  # Select the best-response backend once; both classes share the same
  # constructor and `value` interface.
  if use_cpp_br:
    br_cls = pyspiel_best_response.CPPBestResponsePolicy
  else:
    br_cls = pyspiel_best_response.BestResponsePolicy
  best_response_values = np.array([
      br_cls(game, best_responder, policy).value(root_state)
      for best_responder in range(game.num_players())
  ])
  on_policy_values = _state_values(root_state, game.num_players(), policy)
  player_improvements = best_response_values - on_policy_values
  nash_conv_ = sum(player_improvements)
  if return_only_nash_conv:
    return nash_conv_
  return _NashConvReturn(
      nash_conv=nash_conv_, player_improvements=player_improvements)
def test_best_response_prisoner_dilemma_simultaneous_game(self):
  """Test best response computation for simultaneous game."""
  game = pyspiel.load_game(
      "python_iterated_prisoners_dilemma(max_game_length=5)")
  test_policy = policy.UniformRandomPolicy(game)
  br = best_response.BestResponsePolicy(game, policy=test_policy, player_id=0)

  # Best policy is always to defect; we verify this for a handful of states
  for infostate in ("us:CCCC op:CCCC", "us:DDDD op:CCCC", "us:CDCD op:DCDC",
                    "us:CCCC op:DDDD"):
    self.assertEqual(br.best_response_action(infostate), 1)

  # Expected value per turn = 5.5 (avg of 1 and 10)
  # Expected game length = sum(0.875**i for i in range(5)) = 3.896728515625
  # Game value = 5.5 * 3.896728515625 = 21.4320068359375
  self.assertAlmostEqual(br.value(game.new_initial_state()), 21.4320068359375)
def test_cpp_and_python_best_response_are_identical(self, game_name,
                                                    num_players):
  """Per-state action probabilities match between Python and C++ backends."""
  game = pyspiel.load_game(game_name, {"players": num_players})
  test_policy = policy.TabularPolicy(game)
  for i_player in range(num_players):
    py_backend = best_response.BestResponsePolicy(game, i_player, test_policy)
    cpp_backend = best_response.CPPBestResponsePolicy(game, i_player,
                                                      test_policy)
    for state in cpp_backend.all_states.values():
      if state.current_player() != i_player:
        continue
      py_dict = py_backend.action_probabilities(state)
      cpp_dict = cpp_backend.action_probabilities(state)
      # We do check like this, because the actions associated to a 0. prob
      # do not necessarily appear
      for action, prob in py_dict.items():
        self.assertEqual(prob, cpp_dict.get(action, 0.))
      for action, prob in cpp_dict.items():
        self.assertEqual(prob, py_dict.get(action, 0.))
def test_best_response_oshi_zumo_simultaneous_game(self):
  """Test best response computation for simultaneous game."""
  game = pyspiel.load_game("oshi_zumo(horizon=5,coins=5)")
  test_policy = policy.UniformRandomPolicy(game)
  br = best_response.BestResponsePolicy(game, policy=test_policy, player_id=0)
  expected_policy = {
      "0, 0, 0, 3, 0, 2": 1,
      "0, 0, 1, 4, 3, 1": 0,
      "0, 0, 4, 1, 0, 2, 0, 2": 1,
      "0, 1, 1, 0, 1, 4": 1,
      "0, 1, 4, 1, 0, 0, 0, 1": 1,
      "0, 2, 2, 2, 3, 0, 0, 0": 0,
      "0, 5, 0, 0, 0, 0, 3, 0": 1,
  }
  actual_policy = {
      infostate: br.best_response_action(infostate)
      for infostate in expected_policy
  }
  self.assertEqual(expected_policy, actual_policy)
  self.assertAlmostEqual(br.value(game.new_initial_state()), 0.856471051954)
def exploitability(game, policy):
  """Returns the exploitability of the policy in the game.

  This is implemented only for 2 players constant-sum games, and is equivalent
  to NashConv / num_players in that case. Prefer using `nash_conv`.

  Args:
    game: An open_spiel game, e.g. kuhn_poker
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.

  Returns:
    The value that this policy achieves when playing against the worst-case
    non-cheating opponent, averaged across both starting positions. It has a
    minimum of zero (assuming the supplied policy is non-cheating) and this
    bound is achievable in a 2p game.

  Raises:
    ValueError if the game is not a two-player constant-sum turn-based game.
  """
  # Validate the game class: exactly two players, sequential moves, and a
  # constant (or zero) utility sum.
  if game.num_players() != 2:
    raise ValueError("Game must be a 2-player game")
  game_info = game.get_type()
  if game_info.dynamics != pyspiel.GameType.Dynamics.SEQUENTIAL:
    raise ValueError("The game must be turn-based, not {}".format(
        game_info.dynamics))
  if game_info.utility not in (pyspiel.GameType.Utility.ZERO_SUM,
                               pyspiel.GameType.Utility.CONSTANT_SUM):
    raise ValueError(
        "The game must be constant- or zero-sum, not {}".format(
            game_info.utility))

  root_state = game.new_initial_state()
  total_br_value = 0
  for best_responder in range(game.num_players()):
    total_br_value += pyspiel_best_response.BestResponsePolicy(
        game, best_responder, policy, root_state).value(root_state)
  nash_conv_value = total_br_value - game.utility_sum()
  return nash_conv_value / game.num_players()
def __call__(self,
             game,
             training_parameters,
             strategy_sampler=utils.sample_strategy,
             using_joint_strategies=False,
             **oracle_specific_execution_kwargs):
  """Call method for oracle, returns best responses for training_parameters.

  Args:
    game: The game on which the optimization process takes place.
    training_parameters: List of list of dicts: one list per player, one dict
      per selected agent in the pool for each player, each dictionary
      containing the following fields:
      - policy: the policy from which to start training.
      - total_policies: A list of all policy.Policy strategies used for
        training, including the one for the current player. Either
        marginalized or joint strategies are accepted.
      - current_player: Integer representing the current player.
      - probabilities_of_playing_policies: A list of arrays representing, per
        player, the probabilities of playing each policy in total_policies
        for the same player.
    strategy_sampler: Callable that samples strategies from `total_policies`
      using `probabilities_of_playing_policies`. It only samples one joint
      "action" for all players. Implemented to be able to take into account
      joint probabilities of action.
    using_joint_strategies: Whether the meta-strategies sent are joint (True)
      or marginalized.
    **oracle_specific_execution_kwargs: Other set of arguments, for
      compatibility purposes. Can for example represent whether to Rectify
      Training or not.

  Returns:
    A list of list of OpenSpiel Policy objects representing the expected best
    response, following the same structure as training_parameters.
  """
  new_policies = []
  for player_parameters in training_parameters:
    responses_for_player = []
    for params in player_parameters:
      current_player = params['current_player']
      total_policies = params['total_policies']
      probabilities_of_playing_policies = params[
          'probabilities_of_playing_policies']

      # Collapse the pool of opponent policies into a single aggregate policy,
      # either from a joint distribution or from per-player marginals.
      if using_joint_strategies:
        aggr_policy = utils.aggregate_joint_policies(
            game, utils.marginal_to_joint(total_policies),
            probabilities_of_playing_policies.reshape(-1))
      else:
        aggr_policy = utils.aggregate_policies(
            game, total_policies, probabilities_of_playing_policies)

      # This takes as input an aggregate policy, and computes a best response
      # for current_player at the applicable information states by recursing
      # through the game tree. At information states involving other players
      # or chance, the aggr_policy is used to compute the expected value, such
      # that a best response for current_player can be computed.
      if self.best_response_backend == 'py':
        best_resp = best_response.BestResponsePolicy(game, current_player,
                                                     aggr_policy)
      else:
        self.best_response_processors[current_player].set_policy(
            policy_utils.policy_to_dict(aggr_policy, game, self.all_states,
                                        self.state_to_information_state))
        self.best_responders[current_player] = (
            best_response.CPPBestResponsePolicy(
                game, current_player, aggr_policy, self.all_states,
                self.state_to_information_state,
                self.best_response_processors[current_player]))
        best_resp = self.best_responders[current_player]
      responses_for_player.append(best_resp)
    new_policies.append(responses_for_player)
  return new_policies