Code Example #1
    def test_cpp_and_python_implementations_are_identical(self, game_name):
        game = pyspiel.load_game(game_name)

        python_policy = policy.UniformRandomPolicy(game)
        pyspiel_policy = pyspiel.UniformRandomPolicy(game)

        all_states = get_all_states.get_all_states(
            game,
            depth_limit=-1,
            include_terminals=False,
            include_chance_states=False,
            to_string=lambda s: s.information_state_string())

        for current_player in range(game.num_players()):
            python_br = best_response.BestResponsePolicy(
                game, current_player, python_policy)
            cpp_br = pyspiel.TabularBestResponse(
                game, current_player,
                pyspiel_policy).get_best_response_policy()

            for state in all_states.values():
                if state.current_player() != current_player:
                    continue

                # TODO(b/141737795): Decide what to do about this.
                self.assertEqual(
                    python_br.action_probabilities(state), {
                        a: prob
                        for a, prob in cpp_br.action_probabilities(
                            state).items() if prob != 0
                    })
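The comparison the test performs is easier to see in a standalone form. The sketch below mirrors it for a single game and a single player; it assumes OpenSpiel is importable under the same module names the test uses (pyspiel, policy, best_response) and is purely illustrative, not part of the test suite.

import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import best_response

game = pyspiel.load_game("kuhn_poker")
python_policy = policy.UniformRandomPolicy(game)
pyspiel_policy = pyspiel.UniformRandomPolicy(game)

# Best responses for player 0 against the uniform policy, one per backend.
python_br = best_response.BestResponsePolicy(game, 0, python_policy)
cpp_br = pyspiel.TabularBestResponse(
    game, 0, pyspiel_policy).get_best_response_policy()

# Step past the chance nodes to reach player 0's first decision point.
state = game.new_initial_state()
while state.is_chance_node():
    state.apply_action(state.legal_actions()[0])

# Once zero-probability entries are filtered from the C++ result, the two
# distributions should match; that is what the test asserts over all states.
print(python_br.action_probabilities(state))
print(cpp_br.action_probabilities(state))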
Code Example #2
    def __init__(self,
                 best_response_backend='cpp',
                 game=None,
                 all_states=None,
                 state_to_information_state=None,
                 **kwargs):
        """Init function for the RLOracle.

    Args:
      best_response_backend: A string (either 'cpp' or 'py') specifying the
        best response backend to use (C++ or Python, respectively). The cpp
        backend should generally be preferred, as it is significantly faster.
      game: The game on which the optimization process takes place.
      all_states: The result of calling get_all_states.get_all_states. Cached
        for improved performance.
      state_to_information_state: A dict mapping str(state) to
        state.information_state for every state in the game. Cached for improved
        performance.
      **kwargs: Additional keyword arguments passed to the base class.
    """
        super(BestResponseOracle, self).__init__(**kwargs)
        self.best_response_backend = best_response_backend
        if self.best_response_backend == 'cpp':
            # Should compute all_states and state_to_information_state only once in
            # the program, as caching them speeds up TabularBestResponse tremendously.
            self.all_states, self.state_to_information_state = (
                utils.compute_states_and_info_states_if_none(
                    game, all_states, state_to_information_state))

            policy = openspiel_policy.UniformRandomPolicy(game)

            policy_to_dict = policy_utils.policy_to_dict(
                policy, game, self.all_states, self.state_to_information_state)

            # pylint: disable=g-complex-comprehension
            # Cache TabularBestResponse for players, due to their costly construction
            # TODO(b/140426861): Use a single best-responder once the code supports
            # multiple player ids.
            self.best_response_processors = [
                pyspiel.TabularBestResponse(game, best_responder_id,
                                            policy_to_dict)
                for best_responder_id in range(game.num_players())
            ]
            self.best_responders = [
                best_response.CPPBestResponsePolicy(
                    game, i_player, policy, self.all_states,
                    self.state_to_information_state,
                    self.best_response_processors[i_player])
                for i_player in range(game.num_players())
            ]
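For context, a minimal construction of this oracle with the C++ backend might look like the following. The import path for BestResponseOracle is an assumption based on OpenSpiel's PSRO v2 code layout; the state caches are left as None so the constructor computes them once itself.

import pyspiel
# Assumed module path for the class whose __init__ is shown above.
from open_spiel.python.algorithms.psro_v2 import best_response_oracle

game = pyspiel.load_game("kuhn_poker")

# all_states / state_to_information_state are omitted here, so the constructor
# builds and caches them via compute_states_and_info_states_if_none.
oracle = best_response_oracle.BestResponseOracle(
    best_response_backend='cpp', game=game)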
Code Example #3
    def __init__(self,
                 game,
                 best_responder_id,
                 policy,
                 all_states=None,
                 state_to_information_state=None,
                 best_response_processor=None,
                 cut_threshold=0.0):
        """Constructor.

    Args:
      game: The game to analyze.
      best_responder_id: The player id of the best-responder.
      policy: A policy.Policy object representing the joint policy, taking a
        state and returning a list of (action, probability) pairs. This could be
        aggr_policy, for instance.
      all_states: The result of calling get_all_states.get_all_states. Cached
        for improved performance.
      state_to_information_state: A dict mapping state.history_str to
        state.information_state for every state in the game. Cached for improved
        performance.
      best_response_processor: A TabularBestResponse object, used for processing
        the best response actions.
      cut_threshold: The probability to cut when calculating the value.
        Increasing this value will trade off accuracy for speed.
    """
        (self.all_states, self.state_to_information_state) = (
            compute_states_and_info_states_if_none(game, all_states,
                                                   state_to_information_state))

        policy_to_dict = policy_utils.policy_to_dict(
            policy, game, self.all_states, self.state_to_information_state)

        # pylint: disable=g-complex-comprehension
        # Cache TabularBestResponse for players, due to their costly construction
        # TODO(b/140426861): Use a single best-responder once the code supports
        # multiple player ids.
        if not best_response_processor:
            best_response_processor = pyspiel.TabularBestResponse(
                game, best_responder_id, policy_to_dict)

        self._policy = policy
        self.game = game
        self.best_responder_id = best_responder_id
        self.tabular_best_response_map = (
            best_response_processor.get_best_response_actions())

        self._cut_threshold = cut_threshold
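Code Example #2 reaches this constructor through best_response.CPPBestResponsePolicy; assuming that same class and module, a minimal direct use could look like the sketch below, which is illustrative rather than taken from the source.

import pyspiel
from open_spiel.python import policy as openspiel_policy
from open_spiel.python.algorithms import best_response

game = pyspiel.load_game("kuhn_poker")
joint_policy = openspiel_policy.UniformRandomPolicy(game)

# With the optional arguments left as None, all_states, the infostate mapping,
# and the TabularBestResponse processor are all built inside the constructor.
br = best_response.CPPBestResponsePolicy(game, 0, joint_policy)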
Code Example #4
    def __call__(self, player, player_policy, info_states):
        """Computes action values per state for the player.

    Args:
      player: The id of the player (0 <= player < game.num_players()).
      player_policy: A `policy.Policy` object.
      info_states: A list of info state strings.

    Returns:
      A `_CalculatorReturn` namedtuple. See its docstring for details.
    """
        self.player = player
        opponent = 1 - player

        def best_response_policy(state):
            infostate = state.information_state_string(opponent)
            action = best_response_actions[infostate]
            return [(action, 1.0)]

        # If the policy is a TabularPolicy, we can directly copy the infostate
        # strings & values from the class. This is significantly faster than having
        # to create the infostate strings.
        if isinstance(player_policy, policy.TabularPolicy):
            tabular_policy = {
                key: _tuples_from_policy(player_policy.policy_for_key(key))
                for key in player_policy.state_lookup
            }
        # Otherwise, we have to calculate all the infostate strings every
        # time. This is ~2x slower.
        else:
            # We cache these as they are expensive to compute & do not change.
            if self._all_states is None:
                self._all_states = get_all_states.get_all_states(
                    self.game,
                    depth_limit=-1,
                    include_terminals=False,
                    include_chance_states=False)
                self._state_to_information_state = {
                    state: self._all_states[state].information_state_string()
                    for state in self._all_states
                }
            tabular_policy = policy_utils.policy_to_dict(
                player_policy, self.game, self._all_states,
                self._state_to_information_state)

        # When constructed, TabularBestResponse does a lot of work; we can save that
        # work by caching it.
        if self._best_responder[player] is None:
            self._best_responder[player] = pyspiel.TabularBestResponse(
                self.game, opponent, tabular_policy)
        else:
            self._best_responder[player].set_policy(tabular_policy)

        # Computing the value at the root calculates best responses everywhere.
        history = str(self.game.new_initial_state())
        best_response_value = self._best_responder[player].value(history)
        best_response_actions = self._best_responder[
            player].get_best_response_actions()

        # Compute action values
        self.action_values = collections.defaultdict(
            lambda: collections.defaultdict(lambda: np.zeros(2)))
        self.info_state_prob = collections.defaultdict(float)
        self.info_state_player_prob = collections.defaultdict(float)
        self.info_state_cf_prob = collections.defaultdict(float)
        self.info_state_chance_prob = collections.defaultdict(float)
        self.get_action_values(
            self.game.new_initial_state(), {
                player: player_policy,
                opponent: policy.PolicyFromCallable(self.game,
                                                    best_response_policy),
            })

        # Collect normalized action values for each information state
        rv = []
        cfrp = []
        player_reach_probs_vs_br = []
        for info_state in info_states:
            key = (player, info_state)
            av = self.action_values[key]
            norm_prob = self.info_state_prob[key]
            rv.append([(av[a][player] / norm_prob) if
                       (a in av and norm_prob > 0) else 0
                       for a in range(self.num_actions)])
            cfrp.append(self.info_state_cf_prob[key])
            player_reach_probs_vs_br.append(self.info_state_player_prob[key])

        # Return values
        return _CalculatorReturn(
            exploitability=best_response_value,
            values_vs_br=rv,
            counterfactual_reach_probs_vs_br=cfrp,
            player_reach_probs_vs_br=player_reach_probs_vs_br)
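The enclosing class is not shown above. Assuming this __call__ belongs to the Calculator in OpenSpiel's action_value_vs_best_response module (an assumption based on the identifiers used), and assuming Kuhn poker's convention of naming player 0's first infostates by the dealt card, a call might look like:

import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import action_value_vs_best_response

game = pyspiel.load_game("kuhn_poker")
calc = action_value_vs_best_response.Calculator(game)

# Action values for player 0's uniform policy at two assumed Kuhn poker
# infostates ("0" and "1"), against a best-responding opponent.
returned = calc(0, policy.UniformRandomPolicy(game), ["0", "1"])
print(returned.exploitability)
print(returned.values_vs_br)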
Code Example #5
  def __call__(self, player, player_policy, info_states):
    """Computes action values per state for the player.

    Args:
      player: The id of the player (0 <= player < game.num_players()). This
        player will play `player_policy`, while the opponent will play a best
        response.
      player_policy: A `policy.Policy` object.
      info_states: A list of info state strings.

    Returns:
      A `_CalculatorReturn` namedtuple. See its docstring for details.
    """
    self.player = player
    opponent = 1 - player

    def best_response_policy(state):
      infostate = state.information_state_string(opponent)
      action = best_response_actions[infostate]
      return [(action, 1.0)]

    # If the policy is a TabularPolicy, we can directly copy the infostate
    # strings & values from the class. This is significantly faster than having
    # to create the infostate strings.
    if isinstance(player_policy, policy.TabularPolicy):
      tabular_policy = {
          key: _tuples_from_policy(player_policy.policy_for_key(key))
          for key in player_policy.state_lookup
      }
    # Otherwise, we have to calculate all the infostate strings every time.
    # This is ~2x slower.
    else:
      # We cache these as they are expensive to compute & do not change.
      if self._all_states is None:
        self._all_states = get_all_states.get_all_states(
            self.game,
            depth_limit=-1,
            include_terminals=False,
            include_chance_states=False)
        self._state_to_information_state = {
            state: self._all_states[state].information_state_string()
            for state in self._all_states
        }
      tabular_policy = policy_utils.policy_to_dict(
          player_policy, self.game, self._all_states,
          self._state_to_information_state)

    # When constructed, TabularBestResponse does a lot of work; we can save that
    # work by caching it.
    if self._best_responder[player] is None:
      self._best_responder[player] = pyspiel.TabularBestResponse(
          self.game, opponent, tabular_policy)
    else:
      self._best_responder[player].set_policy(tabular_policy)

    # Computing the value at the root calculates best responses everywhere.
    history = str(self.game.new_initial_state())
    best_response_value = self._best_responder[player].value(history)
    best_response_actions = self._best_responder[
        player].get_best_response_actions()

    # Compute action values
    self._action_value_calculator.compute_all_states_action_values({
        player: player_policy,
        opponent: policy.PolicyFromCallable(self.game, best_response_policy),
    })
    obj = self._action_value_calculator._get_tabular_statistics(  # pylint: disable=protected-access
        ((player, s) for s in info_states))

    # Return values
    return _CalculatorReturn(
        exploitability=best_response_value,
        values_vs_br=obj.action_values,
        counterfactual_reach_probs_vs_br=obj.counterfactual_reach_probs,
        player_reach_probs_vs_br=obj.player_reach_probs)
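Both versions of __call__ wrap the opponent's best response the same way: a plain function from state to a list of (action, probability) pairs is turned into a policy via policy.PolicyFromCallable. A small standalone sketch of that pattern, with a purely illustrative callable:

import pyspiel
from open_spiel.python import policy

game = pyspiel.load_game("kuhn_poker")

def first_legal_action(state):
    # Put probability 1.0 on a single action, the same shape of return value
    # that best_response_policy produces in the snippets above.
    return [(state.legal_actions()[0], 1.0)]

wrapped = policy.PolicyFromCallable(game, first_legal_action)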