def test_exploitability_on_kuhn_poker_uniform_random(self):
   # NashConv of the uniform random policy, from the reference below (found on Google Books):
   # https://link.springer.com/chapter/10.1007/978-3-319-75931-9_5
   game = pyspiel.load_game("kuhn_poker")
   test_policy = policy.UniformRandomPolicy(game)
   expected_nash_conv = 11 / 12
   self.assertAlmostEqual(
       exploitability.exploitability(game, test_policy),
       expected_nash_conv / 2)
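The factor of two in the assertion reflects that, in OpenSpiel, exploitability is NashConv divided by the number of players. A minimal standalone sketch of that relationship (assuming the standard OpenSpiel imports used throughout these examples):

import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
uniform = policy.UniformRandomPolicy(game)
nash_conv = exploitability.nash_conv(game, uniform)   # ~11/12 for Kuhn poker
expl = exploitability.exploitability(game, uniform)   # NashConv / num_players
assert abs(expl - nash_conv / game.num_players()) < 1e-9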
Example 2
 def test_best_response(self, name):
   """Checks if the value of a policy computation works."""
   game = pyspiel.load_game(name)
   uniform_policy = policy.UniformRandomPolicy(game)
   dist = distribution.DistributionPolicy(game, uniform_policy)
   br_value = best_response_value.BestResponse(
       game, dist, value.TabularValueFunction(game))
   br_val = br_value(game.new_initial_state())
   self.assertAlmostEqual(br_val, 30.029387484327486)
Example 3
def policy_bots():
    random_policy = policy.UniformRandomPolicy(GAME)

    py_bot = PolicyBot(0, np.random.RandomState(4321), random_policy)
    cpp_bot = pyspiel.make_policy_bot(
        GAME, 1, 1234,
        policy.python_policy_to_pyspiel_policy(random_policy.to_tabular()))

    return [py_bot, cpp_bot]
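A hedged usage sketch for the helper above: play one full game with the two bots, assuming GAME is a two-player sequential game (as in the snippet) and that the evaluate_bots helper from open_spiel.python.algorithms is available.

from open_spiel.python.algorithms import evaluate_bots

bots = policy_bots()
returns = evaluate_bots.evaluate_bots(
    GAME.new_initial_state(), bots, np.random.RandomState(0))
print(returns)  # one terminal return per player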
Example 4
  def __init__(self, game):
    """Initializes the greedy policy.

    Args:
      game: The game to analyze.
    """
    self._game = game
    self._policy = policy_std.UniformRandomPolicy(self._game)
    self._fp_step = 0
Example 5
  def __init__(self, game):
    """Initializes the greedy policy.

    Args:
      game: The game to analyze.
    """
    self._game = game
    self._states = None  # Required to avoid attribute-error.
    self._policy = policy_std.UniformRandomPolicy(self._game)
    self._fp_step = 0
    self._states = policy_std.get_tabular_policy_states(self._game)
Example 6
 def test_joint_action_probabilities(self):
     """Test expected behavior of joint_action_probabilities."""
     game = pyspiel.load_game("python_iterated_prisoners_dilemma")
     uniform_policy = policy.UniformRandomPolicy(game)
     joint_action_probs = policy.joint_action_probabilities(
         game.new_initial_state(), uniform_policy)
     self.assertCountEqual(list(joint_action_probs), [
         ((0, 0), 0.25),
         ((1, 1), 0.25),
         ((1, 0), 0.25),
         ((0, 1), 0.25),
     ])
 def test_uniform_mfg_policy_conversion_to_n_player_uniform_policy(self):
     """Test conversion of uniform to uniform policy."""
     mfg_game = pyspiel.load_game("python_mfg_dynamic_routing", {
         "time_step_length": 0.05,
         "max_num_time_step": 100
     })
     n_player_game = pyspiel.load_game("python_dynamic_routing", {
         "time_step_length": 0.05,
         "max_num_time_step": 100
     })
     mfg_derived_policy = (dynamic_routing_to_mean_field_game.
                           DerivedNPlayerPolicyFromMeanFieldPolicy(
                               n_player_game,
                               policy.UniformRandomPolicy(mfg_game)))
     derived_policy_value = expected_game_score.policy_value(
         n_player_game.new_initial_state(), mfg_derived_policy)
     uniform_policy_value = expected_game_score.policy_value(
         n_player_game.new_initial_state(),
         policy.UniformRandomPolicy(n_player_game))
     self.assertSequenceAlmostEqual(derived_policy_value,
                                    uniform_policy_value)
Example 8
    def test_policy_value(self, name):
        """Checks if the value of a policy computation works.

        Args:
          name: Name of the game.
        """
        game = pyspiel.load_game(name)
        uniform_policy = policy.UniformRandomPolicy(game)
        dist = distribution.DistributionPolicy(game, uniform_policy)
        py_value = policy_value.PolicyValue(game, dist, uniform_policy,
                                            value.TabularValueFunction(game))
        py_val = py_value(game.new_initial_state())
        self.assertAlmostEqual(py_val, 27.215850929940448)
Example 9
def mean_field_uniform_policy(mfg_game,
                              number_of_iterations,
                              compute_metrics=False):
    del number_of_iterations
    uniform_policy = policy_module.UniformRandomPolicy(mfg_game)
    if compute_metrics:
        distribution_mfg = distribution_module.DistributionPolicy(
            mfg_game, uniform_policy)
        policy_value_ = policy_value.PolicyValue(
            mfg_game, distribution_mfg,
            uniform_policy).value(mfg_game.new_initial_state())
        return uniform_policy, policy_value_
    return uniform_policy
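A hedged usage sketch for the function above, assuming a mean-field game such as "mfg_crowd_modelling" (used in later examples) and that pyspiel is imported:

mfg_game = pyspiel.load_game("mfg_crowd_modelling")
uniform_pi, uniform_value = mean_field_uniform_policy(
    mfg_game, number_of_iterations=0, compute_metrics=True)
print(uniform_value)  # value of the uniform policy against its own distribution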
Example 10
class CommonTest(parameterized.TestCase):
    @parameterized.parameters([
        policy.TabularPolicy(_LEDUC_POKER),
        policy.UniformRandomPolicy(_LEDUC_POKER),
        policy.FirstActionPolicy(_LEDUC_POKER),
    ])
    def test_policy_on_leduc(self, policy_object):
        test_policy_on_game(self, _LEDUC_POKER, policy_object)

    @parameterized.named_parameters([
        ("pyspiel.UniformRandom", pyspiel.UniformRandomPolicy(_LEDUC_POKER)),
    ])
    def test_cpp_policies_on_leduc(self, policy_object):
        test_policy_on_game(self, _LEDUC_POKER, policy_object)
Example 11
    def __init__(self,
                 best_response_backend='cpp',
                 game=None,
                 all_states=None,
                 state_to_information_state=None,
                 **kwargs):
        """Init function for the RLOracle.

        Args:
          best_response_backend: A string (either 'cpp' or 'py'), specifying the
            best response backend to use (C++ or python, respectively). The cpp
            backend should be preferred, generally, as it is significantly faster.
          game: The game on which the optimization process takes place.
          all_states: The result of calling get_all_states.get_all_states. Cached
            for improved performance.
          state_to_information_state: A dict mapping str(state) to
            state.information_state for every state in the game. Cached for improved
            performance.
          **kwargs: kwargs
        """
        super(BestResponseOracle, self).__init__(**kwargs)
        self.best_response_backend = best_response_backend
        if self.best_response_backend == 'cpp':
            # Should compute all_states and state_to_information_state only once in
            # the program, as caching them speeds up TabularBestResponse tremendously.
            self.all_states, self.state_to_information_state = (
                utils.compute_states_and_info_states_if_none(
                    game, all_states, state_to_information_state))

            policy = openspiel_policy.UniformRandomPolicy(game)

            policy_to_dict = policy_utils.policy_to_dict(
                policy, game, self.all_states, self.state_to_information_state)

            # pylint: disable=g-complex-comprehension
            # Cache TabularBestResponse for players, due to their costly construction
            # TODO(b/140426861): Use a single best-responder once the code supports
            # multiple player ids.
            self.best_response_processors = [
                pyspiel.TabularBestResponse(game, best_responder_id,
                                            policy_to_dict)
                for best_responder_id in range(game.num_players())
            ]
            self.best_responders = [
                best_response.CPPBestResponsePolicy(
                    game, i_player, policy, self.all_states,
                    self.state_to_information_state,
                    self.best_response_processors[i_player])
                for i_player in range(game.num_players())
            ]
Example 12
 def test_policy_at_state(self):
     game = pyspiel.load_game("tic_tac_toe")
     uniform_random_policy = policy.UniformRandomPolicy(game)
     state = game.new_initial_state()
     state.apply_action(2)
     state.apply_action(4)
     state.apply_action(6)
     state.apply_action(8)
     self.assertEqual(uniform_random_policy.action_probabilities(state), {
         0: 0.2,
         1: 0.2,
         3: 0.2,
         5: 0.2,
         7: 0.2
     })
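A minimal standalone check of the same behaviour (hedged sketch, assuming the same imports as the test above): a UniformRandomPolicy spreads probability equally over the legal actions of the state.

import pyspiel
from open_spiel.python import policy

game = pyspiel.load_game("tic_tac_toe")
state = game.new_initial_state()
for action in [2, 4, 6, 8]:
  state.apply_action(action)
legal_actions = state.legal_actions()
probs = policy.UniformRandomPolicy(game).action_probabilities(state)
assert probs == {a: 1.0 / len(legal_actions) for a in legal_actions}  # five cells left, 0.2 each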
Example 13
    def test_policy_aggregation_random(self, game_name):
        env = rl_environment.Environment(game_name)

        policies = [[policy.UniformRandomPolicy(env.game) for _ in range(2)]
                    for _ in range(2)]
        probabilities = [
            list(np.ones(len(policies)) / len(policies)) for _ in range(2)
        ]

        pol_ag = policy_aggregator.PolicyAggregator(env.game)
        aggr_policy = pol_ag.aggregate([0], policies, probabilities)

        for item in aggr_policy.policy[0].items():
            _, probs = zip(*item[1].items())
            const_probs = tuple([probs[0]] * len(probs))
            self.assertEqual(probs, const_probs)
Example 14
 def test_best_response_is_a_policy(self):
   game = pyspiel.load_game("kuhn_poker")
   test_policy = policy.UniformRandomPolicy(game)
   br = best_response.BestResponsePolicy(game, policy=test_policy, player_id=0)
   expected_policy = {
       "0": 1,  # Bet in case opponent folds when winning
       "1": 1,  # Bet in case opponent folds when winning
       "2": 0,  # Both equally good (we return the lowest action)
       # Some of these will never happen under the best-response policy,
       # but we have computed best-response actions anyway.
       "0pb": 0,  # Fold - we're losing
       "1pb": 1,  # Call - we're 50-50
       "2pb": 1,  # Call - we've won
   }
   self.assertEqual(
       expected_policy,
       {key: br.best_response_action(key) for key in expected_policy.keys()})
Example 15
    def test_greedy_cpp(self):
        """Check if the greedy policy works as expected.

        The test checks that a greedy policy with respect to an optimal value is
        an optimal policy.
        """
        game = pyspiel.load_game("mfg_crowd_modelling")
        uniform_policy = policy.UniformRandomPolicy(game)
        dist = distribution.DistributionPolicy(game, uniform_policy)
        br_value = best_response_value.BestResponse(game, dist)
        br_val = br_value(game.new_initial_state())

        greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
        greedy_pi = greedy_pi.to_tabular()
        pybr_value = policy_value.PolicyValue(game, dist, greedy_pi)
        pybr_val = pybr_value(game.new_initial_state())
        self.assertAlmostEqual(br_val, pybr_val)
 def test_kuhn_poker_uniform_random_best_response_pid0(self):
     game = pyspiel.load_game("kuhn_poker")
     test_policy = policy.UniformRandomPolicy(game)
     results = exploitability.best_response(game, test_policy, player_id=0)
     self.assertEqual(
         results["best_response_action"],
         {
             "0": 1,  # Bet in case opponent folds when winning
             "1": 1,  # Bet in case opponent folds when winning
             "2": 0,  # Both equally good (we return the lowest action)
             # Some of these will never happen under the best-response policy,
             # but we have computed best-response actions anyway.
             "0pb": 0,  # Fold - we're losing
             "1pb": 1,  # Call - we're 50-50
             "2pb": 1,  # Call - we've won
         })
     self.assertGreater(results["nash_conv"], 0.1)
 def test_kuhn_poker_uniform_random_best_response_pid1(self):
     game = pyspiel.load_game("kuhn_poker")
     test_policy = policy.UniformRandomPolicy(game)
     results = exploitability.best_response(game, test_policy, player_id=1)
     self.assertEqual(
         results["best_response_action"],
         {
             # Bet is always best
             "0p": 1,
             "1p": 1,
             "2p": 1,
             # Call unless we know we're beaten
             "0b": 0,
             "1b": 1,
             "2b": 1,
         })
     self.assertGreater(results["nash_conv"], 0.1)
Example 18
 def test_kuhn_poker_uniform(self):
     game = pyspiel.load_game("kuhn_poker")
     calc = action_value_vs_best_response.Calculator(game)
     expl, avvbr, cfrp = calc(0, policy.UniformRandomPolicy(game),
                              ["0", "1", "2", "0pb", "1pb", "2pb"])
     self.assertAlmostEqual(expl, 15 / 36)
     np.testing.assert_allclose(
         avvbr,
         [
             [-1.5, -2.0],  # 0 (better to pass)
             [-0.5, -0.5],  # 1 (same)
             [0.5, 1.5],  # 2 (better to bet)
             [-1.0, -2.0],  # 0pb - losing
             [-1.0, 0.0],  # 1pb - best response is bet always
             [-1.0, 2.0],  # 2pb - winning
         ])
     np.testing.assert_allclose(cfrp,
                                [1 / 3, 1 / 3, 1 / 3, 1 / 3, 1 / 3, 1 / 3])
Example 19
 def test_players_have_different_legal_actions(self):
     game = pyspiel.load_game("oshi_zumo")
     uniform_random_policy = policy.UniformRandomPolicy(game)
     state = game.new_initial_state()
     state.apply_actions([46, 49])
     # Started with 50 coins each, now have 4 and 1 respectively
     self.assertEqual(
         uniform_random_policy.action_probabilities(state, player_id=0), {
             0: 0.2,
             1: 0.2,
             2: 0.2,
             3: 0.2,
             4: 0.2
         })
     self.assertEqual(
         uniform_random_policy.action_probabilities(state, player_id=1), {
             0: 0.5,
             1: 0.5
         })
Example 20
    def test_rl_environment(self, game_name):
        """Check that the RL environment runs for a few trajectories."""
        game = pyspiel.load_game(game_name)
        uniform_policy = policy.UniformRandomPolicy(game)
        mfg_dist = distribution.DistributionPolicy(game, uniform_policy)

        envs = [
            rl_environment.Environment(game,
                                       distribution=mfg_dist,
                                       mfg_population=p)
            for p in range(game.num_players())
        ]
        for p, env in enumerate(envs):
            for _ in range(FLAGS.rl_env_simulations):
                time_step = env.reset()
                while not time_step.last():
                    print(time_step)
                    a = random.choice(
                        time_step.observations['legal_actions'][p])
                    time_step = env.step([a])
Example 21
    def test_best_response_prisoner_dilemma_simultaneous_game(self):
        """Test best response computation for simultaneous game."""
        game = pyspiel.load_game(
            "python_iterated_prisoners_dilemma(max_game_length=5)")
        test_policy = policy.UniformRandomPolicy(game)
        br = best_response.BestResponsePolicy(game,
                                              policy=test_policy,
                                              player_id=0)

        # Best policy is always to defect; we verify this for a handful of states
        self.assertEqual(br.best_response_action("us:CCCC op:CCCC"), 1)
        self.assertEqual(br.best_response_action("us:DDDD op:CCCC"), 1)
        self.assertEqual(br.best_response_action("us:CDCD op:DCDC"), 1)
        self.assertEqual(br.best_response_action("us:CCCC op:DDDD"), 1)

        # Expected value per turn = 5.5 (avg of 1 and 10)
        # Expected game length = sum(0.875**i for i in range(5)) = 3.896728515625
        # Game value = 5.5 * 3.896728515625 = 21.4320068359375
        self.assertAlmostEqual(br.value(game.new_initial_state()),
                               21.4320068359375)
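A quick check of the arithmetic in the comment above (pure Python, no OpenSpiel needed); the 0.875 continuation probability and the 5.5 per-turn value are taken from the comment itself.

expected_length = sum(0.875**i for i in range(5))   # 3.896728515625
assert abs(5.5 * expected_length - 21.4320068359375) < 1e-12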
Example 22
    def test_average(self):
        """Test the average of policies.

        Here we test that the average of values is the value of the average policy.
        """
        game = crowd_modelling.MFGCrowdModellingGame()
        uniform_policy = policy.UniformRandomPolicy(game)
        mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
        br_value = best_response_value.BestResponse(game, mfg_dist)
        py_value = policy_value.PolicyValue(game, mfg_dist, uniform_policy)
        greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
        greedy_pi = greedy_pi.to_tabular()
        merged_pi = fictitious_play.MergedPolicy(
            game, list(range(game.num_players())), [uniform_policy, greedy_pi],
            [mfg_dist,
             distribution.DistributionPolicy(game, greedy_pi)], [0.5, 0.5])
        merged_pi_value = policy_value.PolicyValue(game, mfg_dist, merged_pi)

        self.assertAlmostEqual(merged_pi_value(game.new_initial_state()),
                               (br_value(game.new_initial_state()) +
                                py_value(game.new_initial_state())) / 2)
Example 23
    def test_cpp_and_python_implementations_are_identical(self, game_name):
        game = pyspiel.load_game(game_name)

        policy = openspiel_policy.UniformRandomPolicy(game)

        all_states = get_all_states.get_all_states(
            game,
            depth_limit=-1,
            include_terminals=False,
            include_chance_states=False,
            to_string=lambda s: s.information_state_string())

        for current_player in range(game.num_players()):
            noise = noisy_policy.NoisyPolicy(policy, current_player, alpha=0.5, beta=10.)
            for state in all_states.values():
                if state.current_player() != current_player:
                    continue

                # TODO(b/141737795): Decide what to do about this.
                self.assertNotEqual(policy.action_probabilities(state),
                                    noise.action_probabilities(state))
Example 24
  def __init__(self, game, lr=0.01, root_state=None):
    """Initializes mirror descent.

    Args:
      game: The game.
      lr: The learning rate of mirror descent.
      root_state: The state of the game at which to start. If `None`, the game
        root state is used.
    """
    self._game = game
    if root_state is None:
      self._root_states = game.new_initial_states()
    else:
      self._root_states = [root_state]
    self._policy = policy_std.UniformRandomPolicy(game)
    self._distribution = distribution.DistributionPolicy(game, self._policy)
    self._md_step = 0
    self._lr = lr

    self._state_value = collections.defaultdict(float)
    self._cumulative_state_value = collections.defaultdict(float)
Example 25
 def test_best_response_oshi_zumo_simultaneous_game(self):
     """Test best response computation for simultaneous game."""
     game = pyspiel.load_game("oshi_zumo(horizon=5,coins=5)")
     test_policy = policy.UniformRandomPolicy(game)
     br = best_response.BestResponsePolicy(game,
                                           policy=test_policy,
                                           player_id=0)
     expected_policy = {
         "0, 0, 0, 3, 0, 2": 1,
         "0, 0, 1, 4, 3, 1": 0,
         "0, 0, 4, 1, 0, 2, 0, 2": 1,
         "0, 1, 1, 0, 1, 4": 1,
         "0, 1, 4, 1, 0, 0, 0, 1": 1,
         "0, 2, 2, 2, 3, 0, 0, 0": 0,
         "0, 5, 0, 0, 0, 0, 3, 0": 1
     }
     self.assertEqual(
         expected_policy,
         {key: br.best_response_action(key)
          for key in expected_policy})
     self.assertAlmostEqual(br.value(game.new_initial_state()),
                            0.856471051954)
Example 26
    def test_greedy(self, name):
        """Check if the greedy policy works as expected.

        The test checks that a greedy policy with respect to an optimal value is
        an optimal policy.

        Args:
          name: Name of the game.
        """
        game = pyspiel.load_game(name)
        uniform_policy = policy.UniformRandomPolicy(game)
        dist = distribution.DistributionPolicy(game, uniform_policy)
        br_value = best_response_value.BestResponse(
            game, dist, value.TabularValueFunction(game))
        br_val = br_value(game.new_initial_state())

        greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
        greedy_pi = greedy_pi.to_tabular()
        pybr_value = policy_value.PolicyValue(game, dist, greedy_pi,
                                              value.TabularValueFunction(game))
        pybr_val = pybr_value(game.new_initial_state())
        self.assertAlmostEqual(br_val, pybr_val)
    def test_policy_aggregation_random(self, game_name):
        env = rl_environment.Environment(game_name)
        num_players = 2
        num_joint_policies = 4

        joint_policies = [[
            policy.UniformRandomPolicy(env.game) for _ in range(num_players)
        ] for _ in range(num_joint_policies)]
        probabilities = np.ones(len(joint_policies))
        probabilities /= np.sum(probabilities)

        pol_ag = policy_aggregator_joint.JointPolicyAggregator(env.game)
        aggr_policy = pol_ag.aggregate([0, 1], joint_policies, probabilities)

        self.assertLen(aggr_policy.policies, num_players)
        for player in range(num_players):
            player_policy = aggr_policy.policies[player]
            self.assertNotEmpty(player_policy)
            for state_action_probs in player_policy.values():
                probs = list(state_action_probs.values())
                expected_prob = 1. / len(probs)
                for prob in probs:
                    self.assertEqual(expected_prob, prob)
Example 28
 def test_joint_action_probabilities_failure_on_seq_game(self):
   """Test failure of child on sequential games."""
   game = pyspiel.load_game("kuhn_poker")
   with self.assertRaises(AssertionError):
     list(policy.joint_action_probabilities(
         game.new_initial_state(), policy.UniformRandomPolicy(game)))
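For contrast with the failure above, a hedged sketch of the supported case: joint_action_probabilities is only defined at simultaneous-move nodes, e.g. the root of a matrix game (same policy/pyspiel imports assumed).

rps = pyspiel.load_game("matrix_rps")
pairs = list(policy.joint_action_probabilities(
    rps.new_initial_state(), policy.UniformRandomPolicy(rps)))
assert len(pairs) == 9  # 3 x 3 joint actions
assert abs(sum(p for _, p in pairs) - 1.0) < 1e-9  # each joint action has probability 1/9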
def main(_):
  game = pyspiel.load_game(FLAGS.game)
  expl = exploitability.exploitability(game, policy.UniformRandomPolicy(game))
  print("Exploitability: {}".format(expl))
Example 30
 def test_policy_attributes(self):
     game = pyspiel.load_game("tiny_bridge_4p")
     uniform_random_policy = policy.UniformRandomPolicy(game)
     self.assertEqual(uniform_random_policy.player_ids, [0, 1, 2, 3])