Example #1
def load_strategy(strategy_type, strategy_kwargs, env, player_id, strategy_weight):
    """
    Load Strategies. If initialization required, initialize
    """
    if strategy_type == "BR":
        agent = policy.TabularPolicy(env.game)
        agent.set_weight(strategy_weight)
        return agent
    elif strategy_type == "ARS":
        agent_class = rl_policy.ARSPolicy
    elif strategy_type == "DQN":
        agent_class = rl_policy.DQNPolicy
    elif strategy_type == "PG":
        agent_class = rl_policy.PGPolicy
    elif strategy_type == "ARS_parallel":
        agent_class = rl_policy.ARSPolicy_parallel
    else:
        raise NotImplementedError

    if "ARS" in strategy_type:
        strategy_kwargs["session"] = None
    else:
        strategy_kwargs["session"] =  tf.Session()
    
    agent = agent_class(env, player_id, **strategy_kwargs)
    agent.set_weights(strategy_weight)
    agent.freeze()

    return agent
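The if/elif dispatch above can equally be written as a dictionary lookup; the sketch below is a purely stylistic alternative that reuses the rl_policy classes already referenced in load_strategy (nothing else is assumed).

# Hypothetical alternative: table-driven dispatch over the same policy classes.
_AGENT_CLASSES = {
    "ARS": rl_policy.ARSPolicy,
    "DQN": rl_policy.DQNPolicy,
    "PG": rl_policy.PGPolicy,
    "ARS_parallel": rl_policy.ARSPolicy_parallel,
}

def get_agent_class(strategy_type):
    """Returns the policy class for `strategy_type`, mirroring load_strategy."""
    try:
        return _AGENT_CLASSES[strategy_type]
    except KeyError:
        raise NotImplementedError(strategy_type)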
Example #2
    def __init__(self,
                 game,
                 initialize_cumulative_values=False,
                 linear_averaging=True,
                 regret_matching_plus=True):
        # pyformat: disable
        """Initializer.

    Args:
      game: The `pyspiel.Game` to run on.
      initialize_cumulative_values: Whether to initialize the average policy to
        the uniform policy (and the initial cumulative regret to an epsilon
        value). This is independent of the first CFR iteration, which, when the
        policy is fixed during traversal and we perform non alternating updates,
        will also compute the uniform policy and add it to the average of
        policies.
      linear_averaging: Whether to use linear averaging, i.e.
        cumulative_policy[info_state][action] += (
          iteration_number * reach_prob * action_prob)

        or not:

        cumulative_policy[info_state][action] += reach_prob * action_prob
      regret_matching_plus: Whether to use Regret Matching+:
        cumulative_regrets = max(cumulative_regrets + regrets, 0)
        or simply regret matching:
        cumulative_regrets = cumulative_regrets + regrets
    """
        # pyformat: enable
        if game.num_players() != 2:
            raise ValueError("Game {} does not have {} players.".format(
                game, 2))

        self._game = game
        self._num_players = game.num_players()
        self._root_node = self._game.new_initial_state()

        if initialize_cumulative_values:
            initial_positive_value = _INITIAL_POSITIVE_VALUE
        else:
            initial_positive_value = 0
        self._info_state_nodes = {}
        _initialize_info_state_nodes(
            self._root_node,
            info_state_nodes=self._info_state_nodes,
            initial_positive_value=initial_positive_value)
        self._policy_cache = {}

        self._root_node = self._game.new_initial_state()

        # This is for returning the current policy and average policy to a caller
        self._current_policy = policy.TabularPolicy(game)
        self._average_policy = self._current_policy.__copy__()

        self._linear_averaging = linear_averaging
        self._iteration = 0  # For possible linear-averaging.

        self._regret_matching_plus = regret_matching_plus

        self._best_responses = {i: None for i in range(game.num_players())}
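Whatever the regret_matching_plus setting documented above, the current policy is always derived from cumulative regrets via regret matching; the flag only changes whether negative accumulated regrets are clipped at zero. A minimal, self-contained numpy sketch of that regret-matching step (illustrative only, not the solver's internal code):

import numpy as np

def regret_matching(cumulative_regrets):
  """Returns action probabilities proportional to positive cumulative regrets."""
  positive = np.maximum(cumulative_regrets, 0.0)
  total = positive.sum()
  if total > 0:
    return positive / total
  # When no action has positive regret, fall back to the uniform policy.
  return np.full(len(cumulative_regrets), 1.0 / len(cumulative_regrets))

print(regret_matching(np.array([2.0, -1.0, 1.0])))  # -> ~[0.667, 0.0, 0.333]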
Example #3
def main(_):
    game = pyspiel.load_game(FLAGS.game)
    evaluator = pyspiel.RandomRolloutEvaluator(1, SEED)
    min_expl = game.max_utility() - game.min_utility()

    print("{:>5} {:>10} {:>50} {:>20}".format("max_sims", "uct_c",
                                              "final_policy_type",
                                              "exploitability"))
    for max_simulations in [10, 100, 1000, 10000]:
        for uct_c in [0.2, 0.5, 1.0, 2.0, 4.0]:  # These values are for Kuhn.
            for final_policy_type in [
                    pyspiel.ISMCTSFinalPolicyType.NORMALIZED_VISIT_COUNT,
                    pyspiel.ISMCTSFinalPolicyType.MAX_VISIT_COUNT,
                    pyspiel.ISMCTSFinalPolicyType.MAX_VALUE
            ]:
                tabular_policy = policy.TabularPolicy(game)
                bot = pyspiel.ISMCTSBot(SEED, evaluator, uct_c,
                                        max_simulations, -1, final_policy_type,
                                        False, False)
                searched = {}
                construct_is_mcts_policy(game, game.new_initial_state(),
                                         tabular_policy, bot, searched)
                expl = exploitability.exploitability(game, tabular_policy)
                print("{:>5} {:>10} {:>50} {:>20}".format(
                    max_simulations, uct_c, str(final_policy_type), expl))
                if expl < min_expl:
                    min_expl = expl
    print("Min expl: {}".format(min_expl))
Example #4
def kuhn_nash_equilibrium(alpha):
    """Returns a Nash Equilibrium in Kuhn parameterized by alpha in [0, 1/3].

  See https://en.wikipedia.org/wiki/Kuhn_poker#Optimal_strategy

  Args:
    alpha: The probability to bet on a Jack for Player 0.

  Raises:
    ValueError: If `alpha` is not within [0, 1/3].
  """
    if not 0 <= alpha <= 1 / 3:
        raise ValueError("alpha ({}) must be in [0, 1/3]".format(alpha))
    bet_probability = {
        # Player 0
        "0": alpha,
        "0pb": 0,
        "1": 0,
        "1pb": 1 / 3 + alpha,
        "2": 3 * alpha,
        "2pb": 1,
        # Player 1
        "0p": 1 / 3,
        "0b": 0,
        "1p": 0,
        "1b": 1 / 3,
        "2p": 1,
        "2b": 1,
    }
    game = pyspiel.load_game("kuhn_poker")
    tabular_policy = policy.TabularPolicy(game)
    for state, p in bet_probability.items():
        tabular_policy.policy_for_key(state)[:] = [1 - p, p]
    return tabular_policy
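A quick sanity check for the policy returned above is its exploitability, which is zero (up to floating-point error) for any alpha in [0, 1/3]. A minimal usage sketch, assuming the same pyspiel and policy imports as the example plus the exploitability module from open_spiel.python.algorithms:

from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
eq_policy = kuhn_nash_equilibrium(alpha=0.2)
# A Nash equilibrium is unexploitable, so this prints a value very close to 0.
print(exploitability.exploitability(game, eq_policy))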
Example #5
    def __init__(self, game, initialize_cumulative_values, alternating_updates,
                 linear_averaging, regret_matching_plus):
        # pyformat: disable
        """Initializer.

    Args:
      game: The `pyspiel.Game` to run on.
      initialize_cumulative_values: Whether to initialize the average policy to
        the uniform policy (and the initial cumulative regret to an epsilon
        value). This is independent of the first CFR iteration, which, when the
        policy is fixed during traversal and we perform non alternating updates,
        will also compute the uniform policy and add it to the average of
        policies.
      alternating_updates: If `True`, alternating updates are performed: for
        each player, we compute and update the cumulative regrets and policies.
        In that case, and when the policy is frozen during tree traversal, the
        cache is reset after each update for one player.
        Otherwise, the update is simultaneous.
      linear_averaging: Whether to use linear averaging, i.e.
        cumulative_policy[info_state][action] += (
          iteration_number * reach_prob * action_prob)

        or not:

        cumulative_policy[info_state][action] += reach_prob * action_prob
      regret_matching_plus: Whether to use Regret Matching+:
        cumulative_regrets = max(cumulative_regrets + regrets, 0)
        or simply regret matching:
        cumulative_regrets = cumulative_regrets + regrets
    """
        # pyformat: enable
        self._game = game
        self._num_players = game.num_players()
        self._root_node = self._game.new_initial_state()

        # Map from information states string representations and actions to the
        # counterfactual regrets, accumulated over the policy iterations
        self._cumulative_regret = collections.defaultdict(
            lambda: collections.defaultdict(float))
        # Same as above for the cumulative of the policy probabilities computed
        # during the policy iterations
        self._cumulative_policy = collections.defaultdict(
            lambda: collections.defaultdict(float))
        if initialize_cumulative_values:
            _initialize_uniform_policy(self._root_node,
                                       self._cumulative_regret,
                                       self._cumulative_policy)

        self._policy = {}

        self._root_node = self._game.new_initial_state()

        # This is for returning the current average policy to a caller
        self._average_policy = policy.TabularPolicy(game)

        self._linear_averaging = linear_averaging
        self._iteration = 0  # For possible linear-averaging.

        self._alternating_updates = alternating_updates
        self._regret_matching_plus = regret_matching_plus
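To see what the linear_averaging option documented above does, compare the two accumulation rules on a toy sequence of action probabilities: with linear weights, iteration t contributes t times as much, so later (typically better) iterates dominate the average policy. A small illustrative numpy sketch with made-up numbers, not the solver's internals:

import numpy as np

reach_prob = 1.0
action_probs = np.array([0.9, 0.5, 0.1])  # prob of one action over 3 iterations

plain_avg = (reach_prob * action_probs).sum() / 3.0
linear_avg = (np.arange(1, 4) * reach_prob * action_probs).sum() / np.arange(1, 4).sum()
print(plain_avg)   # 0.5
print(linear_avg)  # ~0.367, pulled toward the most recent iterate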
Example #6
def init_br_responder(env):
    """Initializes the tabular best-response based responder and agents."""
    random_policy = policy.TabularPolicy(env.game)
    oracle = best_response_oracle.BestResponseOracle(game=env.game,
                                                     policy=random_policy)
    agents = [random_policy.__copy__() for _ in range(FLAGS.n_players)]
    return oracle, agents
Example #7
  def __init__(self, game):
    """Initializes a loss calculation for the given game."""
    if game.num_players() != _NUM_PLAYERS:
      raise ValueError("Game {} does not have {} players.".format(
          game, _NUM_PLAYERS))
    self.tabular_policy = policy.TabularPolicy(game)
    self.q_value_calculator = action_value_vs_best_response.Calculator(game)
Example #8
  def test__update_current_policy(self):
    game = pyspiel.load_game("kuhn_poker")
    tabular_policy = policy.TabularPolicy(game)

    cumulative_regrets = np.arange(0, 12 * 2).reshape((12, 2))
    expected_policy = cumulative_regrets / np.sum(
        cumulative_regrets, axis=-1, keepdims=True)
    nodes_indices = {
        u"0": 0,
        u"0pb": 1,
        u"1": 2,
        u"1pb": 3,
        u"2": 4,
        u"2pb": 5,
        u"1p": 6,
        u"1b": 7,
        u"2p": 8,
        u"2b": 9,
        u"0p": 10,
        u"0b": 11,
    }
    # pylint: disable=g-complex-comprehension
    info_state_nodes = {
        key: cfr._InfoStateNode(
            legal_actions=[0, 1],
            index_in_tabular_policy=None,
            cumulative_regret=dict(enumerate(cumulative_regrets[index])),
            cumulative_policy=None) for key, index in nodes_indices.items()
    }
    # pylint: enable=g-complex-comprehension

    cfr._update_current_policy(tabular_policy, info_state_nodes)

    np.testing.assert_array_equal(expected_policy,
                                  tabular_policy.action_probability_array)
Example #9
def test_tabular_policy_from_csv(tmpdir):
    game = pyspiel.load_game("kuhn_poker")
    tabular_policy = policy.TabularPolicy(game)
    # Save policy as CSV
    output = os.path.join(tmpdir, 'policy.csv')
    policy_to_csv(game, tabular_policy, output)
    tabular_policy_from_csv(game, output)
Example #10
def intilize_policy(game, player, policy_init):
  """Returns initial policy."""
  if policy_init == "uniform":
    new_policy = policy.TabularPolicy(game, players=(player,))

  elif policy_init == "random_deterministic":
    new_policy = policy.TabularPolicy(game, players=(player,))
    for i in range(new_policy.action_probability_array.shape[0]):
      new_policy.action_probability_array[i] = np.random.multinomial(
          1, new_policy.action_probability_array[i]).astype(np.float64)

  else:
    raise ValueError(
        "policy_init must be a valid initialization strategy: %s. "
        "Received: %s" % (INIT_POLICIES, policy_init))

  return new_policy
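A minimal usage sketch for the function above, assuming the imports it already relies on (numpy as np, pyspiel and policy); "kuhn_poker" and the player index are only illustrative choices:

game = pyspiel.load_game("kuhn_poker")
p0_policy = intilize_policy(game, player=0, policy_init="random_deterministic")
# Each row covering player 0's information states is now a one-hot distribution.
print(p0_policy.action_probability_array)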
Example #11
def csv_policy(tmpdir):
    # Setup game and policy
    game = pyspiel.load_game("kuhn_poker")
    tabular_policy = policy.TabularPolicy(game)
    # Save policy as CSV
    output = os.path.join(tmpdir, 'policy.csv')
    policy_to_csv(game, tabular_policy, output)
    return output
Example #12
def main(unused_argv):
    env = rl_environment.Environment(FLAGS.game_name)

    policies = [[
        policy.TabularPolicy(env.game).copy_with_noise(alpha=float(i),
                                                       beta=1.0)
        for i in range(2)
    ] for _ in range(2)]  # pylint: disable=g-complex-comprehension

    probabilities = [
        list(np.ones(len(policies[i])) / len(policies[i])) for i in range(2)
    ]

    pol_ag = policy_aggregator.PolicyAggregator(env.game)
    aggr_policies = pol_ag.aggregate([0, 1], policies, probabilities)

    exploitabilities = exploitability.nash_conv(env.game, aggr_policies)
    print("Exploitability : {}".format(exploitabilities))

    print(policies[0][0].action_probability_array)
    print(policies[0][1].action_probability_array)
    print(aggr_policies.policy)

    print("\nCopy Example")

    mother_policy = policy.TabularPolicy(env.game).copy_with_noise(1, 10)
    policies = [[mother_policy.__copy__() for _ in range(2)] for _ in range(2)]
    probabilities = [
        list(np.ones(len(policies)) / len(policies)) for _ in range(2)
    ]

    pol_ag = policy_aggregator.PolicyAggregator(env.game)
    aggr_policy = pol_ag.aggregate([0], policies, probabilities)

    for state, value in aggr_policy.policy[0].items():
        polici = mother_policy.policy_for_key(state)

        value_normal = {
            action: probability
            for action, probability in enumerate(polici) if probability > 0
        }
        for key in value.keys():
            print(
                "State : {}. Key : {}. Aggregated : {}. Real : {}. Passed : {}"
                .format(state, key, value[key], value_normal[key],
                        np.abs(value[key] - value_normal[key]) < 1e-8))
Example #13
  def __init__(self, game, initialize_cumulative_values, alternating_updates,
               linear_averaging, regret_matching_plus):
    # pyformat: disable
    """Initializer.

    Args:
      game: The `pyspiel.Game` to run on.
      initialize_cumulative_values: Whether to initialize the average policy to
        the uniform policy (and the initial cumulative regret to an epsilon
        value). This is independent of the first CFR iteration, which, when the
        policy is fixed during traversal and we perform non alternating updates,
        will also compute the uniform policy and add it to the average of
        policies.
      alternating_updates: If `True`, alternating updates are performed: for
        each player, we compute and update the cumulative regrets and policies.
        In that case, and when the policy is frozen during tree traversal, the
        cache is reset after each update for one player.
        Otherwise, the update is simultaneous.
      linear_averaging: Whether to use linear averaging, i.e.
        cumulative_policy[info_state][action] += (
          iteration_number * reach_prob * action_prob)

        or not:

        cumulative_policy[info_state][action] += reach_prob * action_prob
      regret_matching_plus: Whether to use Regret Matching+:
        cumulative_regrets = max(cumulative_regrets + regrets, 0)
        or simply regret matching:
        cumulative_regrets = cumulative_regrets + regrets
    """
    # pyformat: enable
    self._game = game
    self._num_players = game.num_players()
    self._root_node = self._game.new_initial_state()

    if initialize_cumulative_values:
      initial_positive_value = _INITIAL_POSITIVE_VALUE
    else:
      initial_positive_value = 0
    self._info_state_nodes = {}
    _initialize_info_state_nodes(
        self._root_node,
        info_state_nodes=self._info_state_nodes,
        initial_positive_value=initial_positive_value)

    self._policy_cache = {}

    self._root_node = self._game.new_initial_state()

    # This is for returning the current policy and average policy to a caller
    self._current_policy = policy.TabularPolicy(game)
    self._average_policy = self._current_policy.__copy__()

    self._linear_averaging = linear_averaging
    self._iteration = 0  # For possible linear-averaging.

    self._alternating_updates = alternating_updates
    self._regret_matching_plus = regret_matching_plus
Example #14
    def test_update_slice(self):
        game = pyspiel.load_game("kuhn_poker")
        tabular_policy = policy.TabularPolicy(game)
        state = "2b"
        np.testing.assert_array_equal(tabular_policy.policy_for_key(state),
                                      [0.5, 0.5])
        tabular_policy.policy_for_key(state)[:] = [0.8, 0.2]
        np.testing.assert_array_equal(tabular_policy.policy_for_key(state),
                                      [0.8, 0.2])
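The test above relies on policy_for_key returning a writable view into the underlying action_probability_array rather than a copy. A short sketch that makes this explicit (an assumption based on the behaviour the test exercises; "2b" is a Kuhn poker information-state key and state_lookup maps keys to row indices):

game = pyspiel.load_game("kuhn_poker")
tabular_policy = policy.TabularPolicy(game)
row = tabular_policy.state_lookup["2b"]
tabular_policy.policy_for_key("2b")[:] = [0.8, 0.2]
# The in-place edit is visible through the array itself.
print(tabular_policy.action_probability_array[row])  # [0.8 0.2]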
Example #15
    def test_states(self):
        game = pyspiel.load_game("leduc_poker")
        tabular_policy = policy.TabularPolicy(game)
        i = 0
        for state in tabular_policy.states:
            self.assertEqual(i, tabular_policy.state_index(state))
            i += 1

        self.assertEqual(936, i)
Example #16
    def test_cpp_to_python_policy(self):
        game = pyspiel.load_game("kuhn_poker")
        pyspiel_policy = pyspiel.UniformRandomPolicy(game)
        python_policy = policy.policy_from_pyspiel_policy(pyspiel_policy)

        for info_state_str in policy.TabularPolicy(game).state_lookup.keys():
            self.assertEqual({
                0: 0.5,
                1: 0.5
            }, python_policy.action_probabilities(info_state_str))
Example #17
    def test_update_elementwise(self):
        game = pyspiel.load_game("kuhn_poker")
        tabular_policy = policy.TabularPolicy(game)
        state = "0pb"
        np.testing.assert_array_equal(tabular_policy.policy_for_key(state),
                                      [0.5, 0.5])
        tabular_policy.policy_for_key(state)[0] = 0.9
        tabular_policy.policy_for_key(state)[1] = 0.1
        np.testing.assert_array_equal(tabular_policy.policy_for_key(state),
                                      [0.9, 0.1])
Example #18
    def test_python_same_as_cpp_for_multiplayer_uniform_random_nash_conv(
            self, game_name, num_players):
        game = pyspiel.load_game(game_name, {"players": num_players})

        # TabularPolicy defaults to being a uniform random policy.
        test_policy = policy.TabularPolicy(game)
        python_nash_conv = exploitability.nash_conv(game, test_policy)
        cpp_nash_conv = pyspiel.nash_conv(
            game, policy_utils.policy_to_dict(test_policy, game))
        self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
Example #19
def test_play_tournament(tmpdir):
    game = pyspiel.load_game("kuhn_poker")
    for team in ["python", "ruby", "java"]:
        for player in ["p1", "p2"]:
            tabular_policy = policy.TabularPolicy(game)
            # Save policy as CSV
            output = os.path.join(tmpdir, f'{team}_{player}.csv')
            policy_to_csv(game, tabular_policy, output)
    ranking, results = play_tournament(game, str(tmpdir))
    assert len(list(ranking.keys())) == 3
    assert len(results) == 3 * 2 * 2
Example #20
    def test_cpp_and_python_value_are_identical(self, game_name, num_players):
        game = pyspiel.load_game(game_name, {"players": num_players})
        test_policy = policy.TabularPolicy(game)
        root_state = game.new_initial_state()
        for i_player in range(num_players):
            best_resp_py_backend = best_response.BestResponsePolicy(
                game, i_player, test_policy)
            best_resp_cpp_backend = best_response.CPPBestResponsePolicy(
                game, i_player, test_policy)

            value_py_backend = best_resp_py_backend.value(root_state)
            value_cpp_backend = best_resp_cpp_backend.value(root_state)

            self.assertTrue(np.allclose(value_py_backend, value_cpp_backend))
Example #21
class CommonTest(parameterized.TestCase):
    @parameterized.parameters([
        policy.TabularPolicy(_LEDUC_POKER),
        policy.UniformRandomPolicy(_LEDUC_POKER),
        policy.FirstActionPolicy(_LEDUC_POKER),
    ])
    def test_policy_on_leduc(self, policy_object):
        test_policy_on_game(self, _LEDUC_POKER, policy_object)

    @parameterized.named_parameters([
        ("pyspiel.UniformRandom", pyspiel.UniformRandomPolicy(_LEDUC_POKER)),
    ])
    def test_cpp_policies_on_leduc(self, policy_object):
        test_policy_on_game(self, _LEDUC_POKER, policy_object)
Example #22
  def test_record_batched_trajectories(self):
    for game_name in ["kuhn_poker", "leduc_poker", "liars_dice"]:
      game = pyspiel.load_game(game_name)
      python_policy = policy.TabularPolicy(game)
      tabular_policy = policy.python_policy_to_pyspiel_policy(python_policy)
      policies = [tabular_policy] * 2

      # We test that we can create a batch of trajectories.
      seed = 0
      batch_size = 128
      include_full_observations = False
      pyspiel.record_batched_trajectories(game, policies,
                                          python_policy.state_lookup,
                                          batch_size, include_full_observations,
                                          seed, -1)
Example #23
    def test_cpp_python_best_response_oracle(self, game_name, num_players):
        # Tests that these best responses interface well with Best Response Oracle
        game = pyspiel.load_game(
            game_name, {"players": pyspiel.GameParameter(num_players)})
        all_states, _ = best_response.compute_states_and_info_states_if_none(
            game, all_states=None, state_to_information_state=None)

        current_best = [[policy.TabularPolicy(game).__copy__()]
                        for _ in range(num_players)]
        probabilities_of_playing_policies = [[1.] for _ in range(num_players)]

        # Construct the python oracle
        py_oracle = best_response_oracle.BestResponseOracle(
            best_response_backend="py")

        # Construct the cpp oracle. Note that in this regime, BestResponseOracle
        # uses base_policy to construct and cache TabularBestResponse internally.
        cpp_oracle = best_response_oracle.BestResponseOracle(
            game=game, best_response_backend="cpp")

        # Prepare the computation of the best responses with each backend
        # pylint:disable=g-complex-comprehension
        training_params = [[{
            "total_policies": current_best,
            "current_player": i,
            "probabilities_of_playing_policies":
                probabilities_of_playing_policies
        }] for i in range(num_players)]
        # pylint:enable=g-complex-comprehension

        py_best_rep = py_oracle(game, training_params)

        cpp_best_rep = cpp_oracle(game, training_params)

        # Compare the policies
        for state in all_states.values():
            i_player = state.current_player()
            py_dict = py_best_rep[i_player][0].action_probabilities(state)
            cpp_dict = cpp_best_rep[i_player][0].action_probabilities(state)

            for action in py_dict.keys():
                self.assertEqual(py_dict.get(action, 0.0),
                                 cpp_dict.get(action, 0.0))
            for action in cpp_dict.keys():
                self.assertEqual(py_dict.get(action, 0.0),
                                 cpp_dict.get(action, 0.0))
Example #24
    def __init__(self, game, alternating_updates, linear_averaging,
                 regret_matching_plus):
        # pyformat: disable
        """Initializer.

    Args:
      game: The `pyspiel.Game` to run on.
      alternating_updates: If `True`, alternating updates are performed: for
        each player, we compute and update the cumulative regrets and policies.
        In that case, and when the policy is frozen during tree traversal, the
        cache is reset after each update for one player.
        Otherwise, the update is simultaneous.
      linear_averaging: Whether to use linear averaging, i.e.
        cumulative_policy[info_state][action] += (
          iteration_number * reach_prob * action_prob)

        or not:

        cumulative_policy[info_state][action] += reach_prob * action_prob
      regret_matching_plus: Whether to use Regret Matching+:
        cumulative_regrets = max(cumulative_regrets + regrets, 0)
        or simply regret matching:
        cumulative_regrets = cumulative_regrets + regrets
    """
        # pyformat: enable
        assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, (
            "CFR requires sequential games. If you're trying to run it " +
            "on a simultaneous (or normal-form) game, please first transform it " +
            "using turn_based_simultaneous_game.")

        self._game = game
        self._num_players = game.num_players()
        self._root_node = self._game.new_initial_state()

        self._root_node = self._game.new_initial_state()

        # This is for returning the current policy and average policy to a caller
        self._current_policy = policy.TabularPolicy(game)
        self._average_policy = self._current_policy.__copy__()

        self._info_state_nodes = {}
        self._initialize_info_state_nodes(self._root_node)

        self._iteration = 0  # For possible linear-averaging.
        self._linear_averaging = linear_averaging
        self._alternating_updates = alternating_updates
        self._regret_matching_plus = regret_matching_plus
Example #25
  def test_identity_redundant(self):
    num_players = 2
    game = pyspiel.load_game("kuhn_poker", {"players": num_players})

    tabular_policies = [  # Policy for all players.
        policy.TabularPolicy(game, players=None)
        for player in range(num_players)]
    for player, tabular_policy in enumerate(tabular_policies):
      tabular_policy.action_probability_array[:] = 0
      tabular_policy.action_probability_array[:, player] = 1.0

    merged_tabular_policy = policy.merge_tabular_policies(
        tabular_policies, game)

    self.assertIdentityPoliciesEqual(
        tabular_policies, merged_tabular_policy, game)
Example #26
def ficticious_play(seq_game, number_of_iterations, compute_metrics=False):
    xfp_solver = fictitious_play.XFPSolver(seq_game)
    tick_time = time.time()
    for _ in range(number_of_iterations):
        xfp_solver.iteration()
    timing = time.time() - tick_time
    # print('done')
    # average_policies = xfp_solver.average_policy_tables()
    tabular_policy = policy_module.TabularPolicy(seq_game)
    if compute_metrics:
        nash_conv = exploitability.nash_conv(seq_game,
                                             xfp_solver.average_policy())
        average_policy_values = expected_game_score.policy_value(
            seq_game.new_initial_state(), [tabular_policy])
        return timing, tabular_policy, nash_conv, average_policy_values
    return timing, tabular_policy
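A minimal driver for the helper above, assuming pyspiel and the modules referenced in the function (fictitious_play, exploitability, expected_game_score, policy_module) are imported as in the rest of this listing; the game and iteration count are illustrative:

seq_game = pyspiel.load_game("kuhn_poker")
timing, tab_policy, nash_conv, values = ficticious_play(
    seq_game, number_of_iterations=100, compute_metrics=True)
print("XFP: {:.2f}s, NashConv = {}".format(timing, nash_conv))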
Example #27
def test_tabular_policy_to_csv(tmpdir):
    # Setup game and policy
    game = pyspiel.load_game("kuhn_poker")
    tabular_policy = policy.TabularPolicy(game)
    # Save policy as CSV
    output = os.path.join(tmpdir, 'policy.csv')
    policy_to_csv(game, tabular_policy, output)
    assert list(tmpdir.listdir()) == [output]
    # Check created CSV
    csv = pd.read_csv(output, index_col=0)
    # Get all states in the game at which players have to make decisions.
    states = get_all_states.get_all_states(game,
                                           depth_limit=-1,
                                           include_terminals=False,
                                           include_chance_states=False)
    assert set(csv.index.values) <= set(states.keys())
    assert len(csv.columns) == game.num_distinct_actions()
Example #28
  def test_identity(self):
    num_players = 2
    game = pyspiel.load_game(
        "kuhn_poker", {"players": pyspiel.GameParameter(num_players)})

    tabular_policies = [  # Policy limited to player.
        policy.TabularPolicy(game, players=(player,))
        for player in range(num_players)]
    for player, tabular_policy in enumerate(tabular_policies):
      tabular_policy.action_probability_array[:] = 0
      tabular_policy.action_probability_array[:, player] = 1.0

    merged_tabular_policy = policy.merge_tabular_policies(
        tabular_policies, game)

    self.assertIdentityPoliciesEqual(
        tabular_policies, merged_tabular_policy, game)
Example #29
    def _get_exploitability(self):
        tabular_policy = policy.TabularPolicy(self._game)
        for player_id in range(2):
            for info_state, state_policy in self.average_policy_tables(
            )[player_id].items():
                policy_to_update_tabular = tabular_policy.policy_for_key(
                    info_state)
                for action, probability in state_policy.items():
                    policy_to_update_tabular[action] = probability
        average_policy_values = expected_game_score.policy_value(
            self._game.new_initial_state(), [tabular_policy, tabular_policy])
        #         print("Kuhn 2P average values after %s iterations" %iters)
        #         print("P0: {}".format(average_policy_values[0]))
        #         print("P1: {}".format(average_policy_values[1]))
        exp = exploitability.exploitability(self._game, tabular_policy)
        print("exploitability: {}".format(exp))
        return exp
Example #30
  def __init__(self, game, linear_averaging=True, regret_matching_plus=True):
    # pyformat: disable
    """Initializer.

    Args:
      game: The `pyspiel.Game` to run on.
      linear_averaging: Whether to use linear averaging, i.e.
        cumulative_policy[info_state][action] += (
          iteration_number * reach_prob * action_prob)

        or not:

        cumulative_policy[info_state][action] += reach_prob * action_prob
      regret_matching_plus: Whether to use Regret Matching+:
        cumulative_regrets = max(cumulative_regrets + regrets, 0)
        or simply regret matching:
        cumulative_regrets = cumulative_regrets + regrets
    """
    # pyformat: enable
    if game.num_players() != 2:
      raise ValueError("Game {} does not have {} players.".format(game, 2))

    assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, (
        "CFR requires sequential games. If you're trying to run it " +
        "on a simultaneous (or normal-form) game, please first transform it " +
        "using turn_based_simultaneous_game.")

    self._game = game
    self._num_players = game.num_players()
    self._root_node = self._game.new_initial_state()

    self._info_state_nodes = {}
    _initialize_info_state_nodes(self._root_node, self._info_state_nodes)

    self._root_node = self._game.new_initial_state()

    # This is for returning the current policy and average policy to a caller
    self._current_policy = policy.TabularPolicy(game)
    self._average_policy = self._current_policy.__copy__()

    self._linear_averaging = linear_averaging
    self._iteration = 0  # For possible linear-averaging.

    self._regret_matching_plus = regret_matching_plus

    self._best_responses = {i: None for i in range(game.num_players())}
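The CFR-style initializers collected in these examples are all driven the same way: repeatedly run an update step and then query the average policy. A minimal end-to-end sketch using the public cfr.CFRSolver API from open_spiel.python.algorithms (an assumption of this listing rather than code shown above):

import pyspiel
from open_spiel.python.algorithms import cfr, exploitability

game = pyspiel.load_game("kuhn_poker")
solver = cfr.CFRSolver(game)
for _ in range(100):
  solver.evaluate_and_update_policy()
# Exploitability of the average policy shrinks as iterations accumulate.
print(exploitability.exploitability(game, solver.average_policy()))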