    def test_cpp_and_python_value_are_identical(self, game_name, num_players):
        game = pyspiel.load_game(game_name, {"players": num_players})
        test_policy = policy.TabularPolicy(game)
        root_state = game.new_initial_state()
        for i_player in range(num_players):
            best_resp_py_backend = best_response.BestResponsePolicy(
                game, i_player, test_policy)
            best_resp_cpp_backend = best_response.CPPBestResponsePolicy(
                game, i_player, test_policy)

            value_py_backend = best_resp_py_backend.value(root_state)
            value_cpp_backend = best_resp_cpp_backend.value(root_state)

            self.assertTrue(np.allclose(value_py_backend, value_cpp_backend))
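
A minimal standalone sketch of the same value comparison outside a test
harness, assuming OpenSpiel is installed; the choice of kuhn_poker and of a
uniform tabular policy is illustrative only.

import numpy as np
import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import best_response

game = pyspiel.load_game("kuhn_poker", {"players": 2})
uniform = policy.TabularPolicy(game)  # TabularPolicy is uniform by default.
root = game.new_initial_state()

for player in range(game.num_players()):
  py_value = best_response.BestResponsePolicy(game, player, uniform).value(root)
  cpp_value = best_response.CPPBestResponsePolicy(game, player, uniform).value(root)
  # The two backends should agree up to floating-point tolerance.
  assert np.isclose(py_value, cpp_value)
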
    def __init__(self,
                 best_response_backend='cpp',
                 game=None,
                 all_states=None,
                 state_to_information_state=None,
                 **kwargs):
        """Init function for the RLOracle.

    Args:
      best_response_backend: A string (either 'cpp' or 'py'), specifying the
        best response backend to use (C++ or python, respectively). The cpp
        backend should be preferred, generally, as it is significantly faster.
      game: The game on which the optimization process takes place.
      all_states: The result of calling get_all_states.get_all_states. Cached
        for improved performance.
      state_to_information_state: A dict mapping str(state) to
        state.information_state for every state in the game. Cached for improved
        performance.
      **kwargs: kwargs
    """
        super(BestResponseOracle, self).__init__(**kwargs)
        self.best_response_backend = best_response_backend
        if self.best_response_backend == 'cpp':
            # Should compute all_states and state_to_information_state only once in
            # the program, as caching them speeds up TabularBestResponse tremendously.
            self.all_states, self.state_to_information_state = (
                utils.compute_states_and_info_states_if_none(
                    game, all_states, state_to_information_state))

            policy = openspiel_policy.UniformRandomPolicy(game)

            policy_to_dict = policy_utils.policy_to_dict(
                policy, game, self.all_states, self.state_to_information_state)

            # pylint: disable=g-complex-comprehension
            # Cache TabularBestResponse for players, due to their costly construction
            # TODO(b/140426861): Use a single best-responder once the code supports
            # multiple player ids.
            self.best_response_processors = [
                pyspiel.TabularBestResponse(game, best_responder_id,
                                            policy_to_dict)
                for best_responder_id in range(game.num_players())
            ]
            self.best_responders = [
                best_response.CPPBestResponsePolicy(
                    game, i_player, policy, self.all_states,
                    self.state_to_information_state,
                    self.best_response_processors[i_player])
                for i_player in range(game.num_players())
            ]
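
A construction sketch for this oracle, assuming it is the BestResponseOracle
from OpenSpiel's PSRO v2 code (the import path below is an assumption). The
'cpp' backend enumerates and caches all game states at construction time,
while the 'py' backend defers all work to __call__.

import pyspiel
# Import path assumed from OpenSpiel's PSRO v2 layout.
from open_spiel.python.algorithms.psro_v2 import best_response_oracle

game = pyspiel.load_game("kuhn_poker")

# C++ backend: pre-computes all_states and state_to_information_state once.
cpp_oracle = best_response_oracle.BestResponseOracle(
    best_response_backend='cpp', game=game)

# Python backend: no caching is needed at construction.
py_oracle = best_response_oracle.BestResponseOracle(best_response_backend='py')
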
Example #3
def nash_conv(game, policy, return_only_nash_conv=True, use_cpp_br=False):
  r"""Returns a measure of closeness to Nash for a policy in the game.

  See https://arxiv.org/pdf/1711.00832.pdf for the NashConv definition.

  Args:
    game: An open_spiel game, e.g. kuhn_poker
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.
    return_only_nash_conv: Whether to only return the NashConv value, or a
      namedtuple containing additional statistics. Prefer using `False`, as we
      hope to change the default to that value.
    use_cpp_br: If True, compute the best responses in C++.

  Returns:
    If `return_only_nash_conv` is True, only the NashConv value is returned.
    Otherwise, an object with the following attributes is returned:
    - player_improvements: A `[num_players]` numpy array of the improvement
      for players (i.e. value_player_p_versus_BR - value_player_p).
    - nash_conv: The sum over all players of the improvements in value that each
      player could obtain by unilaterally changing their strategy, i.e.
      sum(player_improvements).
  """
  root_state = game.new_initial_state()
  if use_cpp_br:
    best_response_values = np.array([
        pyspiel_best_response.CPPBestResponsePolicy(
            game, best_responder, policy).value(root_state)
        for best_responder in range(game.num_players())
    ])
  else:
    best_response_values = np.array([
        pyspiel_best_response.BestResponsePolicy(
            game, best_responder, policy).value(root_state)
        for best_responder in range(game.num_players())
    ])
  on_policy_values = _state_values(root_state, game.num_players(), policy)
  player_improvements = best_response_values - on_policy_values
  nash_conv_ = sum(player_improvements)
  if return_only_nash_conv:
    return nash_conv_
  else:
    return _NashConvReturn(
        nash_conv=nash_conv_, player_improvements=player_improvements)
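
A usage sketch for nash_conv, assuming it is exposed from
open_spiel.python.algorithms.exploitability; the game and the uniform random
policy are illustrative choices.

import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
uniform = policy.UniformRandomPolicy(game)

# Scalar NashConv (current default) ...
nash_conv_value = exploitability.nash_conv(game, uniform)
# ... or the richer return object with per-player improvements.
stats = exploitability.nash_conv(game, uniform, return_only_nash_conv=False)
print(nash_conv_value, stats.player_improvements)
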
Example #4
  def test_cpp_and_python_best_response_are_identical(self, game_name,
                                                      num_players):
    game = pyspiel.load_game(game_name, {"players": num_players})

    test_policy = policy.TabularPolicy(game)
    for i_player in range(num_players):
      best_resp_py_backend = best_response.BestResponsePolicy(
          game, i_player, test_policy)
      best_resp_cpp_backend = best_response.CPPBestResponsePolicy(
          game, i_player, test_policy)
      for state in best_resp_cpp_backend.all_states.values():
        if i_player == state.current_player():
          py_dict = best_resp_py_backend.action_probabilities(state)
          cpp_dict = best_resp_cpp_backend.action_probabilities(state)

          # We check both directions because actions with probability 0. do
          # not necessarily appear in both dictionaries.
          for key, value in py_dict.items():
            self.assertEqual(value, cpp_dict.get(key, 0.))
          for key, value in cpp_dict.items():
            self.assertEqual(value, py_dict.get(key, 0.))
Example #5
def exploitability(game, policy):
  """Returns the exploitability of the policy in the game.

  This is implemented only for 2-player constant-sum games, and is equivalent
  to NashConv / num_players in that case. Prefer using `nash_conv`.

  Args:
    game: An open_spiel game, e.g. kuhn_poker
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.

  Returns:
    The value that this policy achieves when playing against the worst-case
    non-cheating opponent, averaged across both starting positions. It has a
    minimum of zero (assuming the supplied policy is non-cheating) and
    this bound is achievable in a 2p game.

  Raises:
    ValueError if the game is not a two-player constant-sum turn-based game.
  """
  if game.num_players() != 2:
    raise ValueError("Game must be a 2-player game")
  game_info = game.get_type()
  if game_info.dynamics != pyspiel.GameType.Dynamics.SEQUENTIAL:
    raise ValueError("The game must be turn-based, not {}".format(
        game_info.dynamics))
  if game_info.utility not in (pyspiel.GameType.Utility.ZERO_SUM,
                               pyspiel.GameType.Utility.CONSTANT_SUM):
    raise ValueError("The game must be constant- or zero-sum, not {}".format(
        game_info.utility))
  root_state = game.new_initial_state()
  nash_conv_value = (
      sum(
          pyspiel_best_response.CPPBestResponsePolicy(
              game, best_responder, policy).value(root_state)
          for best_responder in range(game.num_players())) - game.utility_sum())
  return nash_conv_value / game.num_players()
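
A matching usage sketch for exploitability, under the same module-path
assumption as above. For a 2-player constant-sum game this equals NashConv
divided by the number of players.

import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")  # 2-player, zero-sum, turn-based.
uniform = policy.UniformRandomPolicy(game)

expl = exploitability.exploitability(game, uniform)
print("Exploitability of the uniform policy:", expl)
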
    def __call__(self,
                 game,
                 training_parameters,
                 strategy_sampler=utils.sample_strategy,
                 using_joint_strategies=False,
                 **oracle_specific_execution_kwargs):
        """Call method for oracle, returns best responses for training_parameters.

    Args:
      game: The game on which the optimization process takes place.
      training_parameters: List of list of dicts: one list per player, one dict
        per selected agent in the pool for each player,
        each dictionary containing the following fields:
        - policy: the policy from which to start training.
        - total_policies: A list of all policy.Policy strategies used for
          training, including the one for the current player. Either
          marginalized or joint strategies are accepted.
        - current_player: Integer representing the current player.
        - probabilities_of_playing_policies: A list of arrays representing, per
          player, the probabilities of playing each policy in total_policies for
          the same player.
      strategy_sampler: Callable that samples strategies from `total_policies`
        using `probabilities_of_playing_policies`. It only samples one joint
        "action" for all players. Implemented to be able to take into account
        joint probabilities of action.
      using_joint_strategies: Whether the meta-strategies sent are joint (True)
        or marginalized.
      **oracle_specific_execution_kwargs: Other set of arguments, for
        compatibility purposes. Can for example represent whether to Rectify
        Training or not.

    Returns:
      A list of list of OpenSpiel Policy objects representing the expected
      best response, following the same structure as training_parameters.
    """
        new_policies = []
        for player_parameters in training_parameters:
            player_policies = []
            for params in player_parameters:
                current_player = params['current_player']
                total_policies = params['total_policies']
                probabilities_of_playing_policies = params[
                    'probabilities_of_playing_policies']
                if using_joint_strategies:
                    aggr_policy = utils.aggregate_joint_policies(
                        game, utils.marginal_to_joint(total_policies),
                        probabilities_of_playing_policies.reshape(-1))
                else:
                    aggr_policy = utils.aggregate_policies(
                        game, total_policies,
                        probabilities_of_playing_policies)

                # This takes as input an aggregate policy, and computes a best response
                # for current_player at the applicable information states by recursing
                # through the game tree. At information states involving other players
                # or chance, the aggr_policy is used to compute the expected value, such
                # that a best response for current_player can be computed.
                if self.best_response_backend == 'py':
                    best_resp = best_response.BestResponsePolicy(
                        game, current_player, aggr_policy)
                else:
                    self.best_response_processors[current_player].set_policy(
                        policy_utils.policy_to_dict(
                            aggr_policy, game, self.all_states,
                            self.state_to_information_state))

                    self.best_responders[current_player] = (
                        best_response.CPPBestResponsePolicy(
                            game, current_player, aggr_policy, self.all_states,
                            self.state_to_information_state,
                            self.best_response_processors[current_player]))
                    best_resp = self.best_responders[current_player]
                player_policies.append(best_resp)
            new_policies.append(player_policies)
        return new_policies
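
A sketch of how the training_parameters structure described in the docstring
above might be assembled for a 2-player game with marginal meta-strategies.
The single-policy pools and probability arrays are illustrative, and the
import path is an assumption.

import numpy as np
import pyspiel
from open_spiel.python import policy as openspiel_policy
# Import path assumed from OpenSpiel's PSRO v2 layout.
from open_spiel.python.algorithms.psro_v2 import best_response_oracle

game = pyspiel.load_game("kuhn_poker")
oracle = best_response_oracle.BestResponseOracle(
    best_response_backend='py', game=game)

# One policy per player in the pool, here just uniform random.
total_policies = [[openspiel_policy.UniformRandomPolicy(game)]
                  for _ in range(game.num_players())]
# Each pool holds a single policy, so it is played with probability 1.
probabilities = [np.array([1.0]) for _ in range(game.num_players())]

training_parameters = [[{
    'policy': total_policies[player][0],
    'total_policies': total_policies,
    'current_player': player,
    'probabilities_of_playing_policies': probabilities,
}] for player in range(game.num_players())]

# new_policies[p][0] is a best response for player p against the uniform pool.
new_policies = oracle(game, training_parameters)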