Code example #1
 def test_int_mccfr_on_turn_based_game_with_exploitability(self):
   """Check if outcome sampling MCCFR can be applied."""
   game = pyspiel.load_game(
       "python_dynamic_routing(max_num_time_step=5,time_step_length=1.0)")
   seq_game = pyspiel.convert_to_turn_based(game)
   cfr_solver = outcome_mccfr.OutcomeSamplingSolver(seq_game)
   for _ in range(_NUM_ITERATION_CFR_TEST):
     cfr_solver.iteration()
   # Smoke test: just verify NashConv can be computed for the converted game.
   exploitability.nash_conv(seq_game, cfr_solver.average_policy())
Code example #2
 def test_cfr_on_turn_based_game_with_exploitability(self):
   """Check if CFR can be applied to the sequential game."""
   game = pyspiel.load_game(
       "python_dynamic_routing(max_num_time_step=5,time_step_length=1.0)")
   seq_game = pyspiel.convert_to_turn_based(game)
   cfr_solver = cfr.CFRSolver(seq_game)
   for _ in range(_NUM_ITERATION_CFR_TEST):
     cfr_solver.evaluate_and_update_policy()
   # As above, a smoke test; the NashConv value itself is discarded.
   exploitability.nash_conv(seq_game, cfr_solver.average_policy())
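Both tests above rely on an iteration budget, _NUM_ITERATION_CFR_TEST, defined at module level in the original test file. Its value is not shown in these excerpts; a plausible sketch (the value below is an assumption, not the original):

# Assumed module-level constant; the actual value is not shown in the excerpts.
_NUM_ITERATION_CFR_TEST = 10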
Code example #3
 def test_outcome_sampling_kuhn_2p(self):
     np.random.seed(SEED)
     game = pyspiel.load_game("kuhn_poker")
     os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
     for _ in range(10000):
         os_solver.iteration()
     conv = exploitability.nash_conv(game, os_solver.average_policy())
     print("Kuhn2P, conv = {}".format(conv))
     self.assertLess(conv, 0.17)
     # ensure that to_tabular() works on the returned policy
     # and the tabular policy is equivalent
     tabular_policy = os_solver.average_policy().to_tabular()
     conv2 = exploitability.nash_conv(game, tabular_policy)
     self.assertEqual(conv, conv2)
Code example #4
 def test_external_sampling_leduc_2p_simple(self):
     np.random.seed(SEED)
     game = pyspiel.load_game("leduc_poker")
     es_solver = external_sampling_mccfr.ExternalSamplingSolver(
         game, external_sampling_mccfr.AverageType.SIMPLE)
     for _ in range(10):
         es_solver.iteration()
     conv = exploitability.nash_conv(game, es_solver.average_policy())
     print("Leduc2P, conv = {}".format(conv))
     self.assertLess(conv, 5)
     # ensure that to_tabular() works on the returned policy and
     # the tabular policy is equivalent
     tabular_policy = es_solver.average_policy().to_tabular()
     conv2 = exploitability.nash_conv(game, tabular_policy)
     self.assertEqual(conv, conv2)
Code example #5
def print_algorithm_results(game, callable_policy, algorithm_name):
    print(algorithm_name.upper())
    tabular_policy = tabular_policy_from_callable(game, callable_policy)
    policy_exploitability = exploitability(game, tabular_policy)
    policy_nashconv = nash_conv(game, tabular_policy)
    print("exploitability = {}".format(policy_exploitability))
    print("nashconv = {}".format(policy_nashconv))
Code example #6
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)
    # Note: this example uses the TF1-style Session API
    # (tensorflow.compat.v1 on TF2 installs).
    with tf.Session() as sess:
        deep_cfr_solver = deep_cfr.DeepCFRSolver(
            sess,
            game,
            policy_network_layers=(32, 32),
            advantage_network_layers=(16, 16),
            num_iterations=FLAGS.num_iterations,
            num_traversals=FLAGS.num_traversals,
            learning_rate=1e-3,
            batch_size_advantage=None,
            batch_size_strategy=None,
            memory_capacity=1e7)
        sess.run(tf.global_variables_initializer())
        _, advantage_losses, policy_loss = deep_cfr_solver.solve()
        for player, losses in six.iteritems(advantage_losses):
            logging.info("Advantage for player %d: %s", player,
                         losses[:2] + ["..."] + losses[-2:])
            logging.info("Advantage Buffer Size for player %s: '%s'", player,
                         len(deep_cfr_solver.advantage_buffers[player]))
        logging.info("Strategy Buffer Size: '%s'",
                     len(deep_cfr_solver.strategy_buffer))
        logging.info("Final policy loss: '%s'", policy_loss)
        conv = exploitability.nash_conv(
            game,
            policy.PolicyFromCallable(game,
                                      deep_cfr_solver.action_probabilities))
        logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
Code example #7
 def test_kuhn_poker_uniform_random(self):
     # NashConv of the uniform random policy; value taken from the reference
     # below (found on Google Books):
     # https://link.springer.com/chapter/10.1007/978-3-319-75931-9_5
     game = pyspiel.load_game("kuhn_poker")
     test_policy = policy.UniformRandomPolicy(game)
     self.assertAlmostEqual(exploitability.nash_conv(game, test_policy),
                            11 / 12)
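The 11/12 value can be reproduced outside the test harness. A self-contained sketch using only standard OpenSpiel imports (the expected output is the value asserted above):

import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
uniform = policy.UniformRandomPolicy(game)
print(exploitability.nash_conv(game, uniform))  # ~0.9166666666666666 (= 11/12)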
Code example #8
 def test_shapleys_game(self):
   game = pyspiel.load_game_as_turn_based("matrix_shapleys_game")
   xfp_solver = fictitious_play.XFPSolver(game)
   for i in range(1000):
     xfp_solver.iteration()
     if i % 10 == 0:
       conv = exploitability.nash_conv(game, xfp_solver.average_policy())
       print("FP in Shapley's Game. Iter: {}, NashConv: {}".format(i, conv))
Code example #9
def get_algo_metrics(algo_policies, game):
    print("Extracting metrics...")
    algo_exploitabilities = {}
    algo_nashconvs = {}
    for key in algo_policies:
        algo_exploitabilities[key] = exploitability(game, algo_policies[key])
        algo_nashconvs[key] = nash_conv(game, algo_policies[key])
    return algo_exploitabilities, algo_nashconvs
Code example #10
def gpsro_looper(env, oracle, agents):
    """Initializes and executes the GPSRO training loop."""
    sample_from_marginals = True  # TODO(somidshafiei) set False for alpharank
    training_strategy_selector = (
        FLAGS.training_strategy_selector
        or strategy_selectors.probabilistic_strategy_selector)

    if FLAGS.meta_strategy_method == "alpharank":
        # TODO(somidshafiei): Implement epsilon-sweep for Openspiel alpharank.
        print("\n")
        print(
            "==================================================================\n"
            "============================ Warning =============================\n"
            "==================================================================\n"
        )
        print(
            "Selected alpharank. Warning: the current alpharank version is "
            "unstable. It can raise errors because of infinite / NaN "
            "elements in arrays. A fix should land in an upcoming OpenSpiel "
            "release.")
        print("\n")
    g_psro_solver = psro_v2.PSROSolver(
        env.game,
        oracle,
        initial_policies=agents,
        training_strategy_selector=training_strategy_selector,
        rectifier=FLAGS.rectifier,
        sims_per_entry=FLAGS.sims_per_entry,
        number_policies_selected=FLAGS.number_policies_selected,
        meta_strategy_method=FLAGS.meta_strategy_method,
        prd_iterations=50000,
        prd_gamma=1e-10,
        sample_from_marginals=sample_from_marginals,
        symmetric_game=FLAGS.symmetric_game)

    start_time = time.time()
    for gpsro_iteration in range(FLAGS.gpsro_iterations):
        if FLAGS.verbose:
            print("Iteration : {}".format(gpsro_iteration))
            print("Time so far: {}".format(time.time() - start_time))
        g_psro_solver.iteration()
        meta_game = g_psro_solver.get_meta_game()
        meta_probabilities = g_psro_solver.get_meta_strategies()
        policies = g_psro_solver.get_policies()

        if FLAGS.verbose:
            print("Meta game : {}".format(meta_game))
            print("Probabilities : {}".format(meta_probabilities))

        aggregator = policy_aggregator.PolicyAggregator(env.game)
        aggr_policies = aggregator.aggregate(range(FLAGS.n_players), policies,
                                             meta_probabilities)

        exploitabilities, expl_per_player = exploitability.nash_conv(
            env.game, aggr_policies, return_only_nash_conv=False)

        _ = print_policy_analysis(policies, env.game, FLAGS.verbose)
        if FLAGS.verbose:
            print("Exploitabilities : {}".format(exploitabilities))
            print("Exploitabilities per player : {}".format(expl_per_player))
Code example #11
 def test_outcome_sampling_kuhn_3p(self):
     np.random.seed(SEED)
     game = pyspiel.load_game("kuhn_poker", {"players": 3})
     os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
     for _ in range(10000):
         os_solver.iteration()
     conv = exploitability.nash_conv(game, os_solver.average_policy())
     print("Kuhn3P, conv = {}".format(conv))
     self.assertLess(conv, 0.22)
Code example #12
 def test_matching_pennies_3p(self):
   game = pyspiel.load_game_as_turn_based("matching_pennies_3p")
   xfp_solver = fictitious_play.XFPSolver(game)
   for i in range(1000):
     xfp_solver.iteration()
     if i % 10 == 0:
       conv = exploitability.nash_conv(game, xfp_solver.average_policy())
       print("FP in Matching Pennies 3p. Iter: {}, NashConv: {}".format(
           i, conv))
Code example #13
 def disabled_test_external_sampling_liars_dice_2p_simple(self):
     np.random.seed(SEED)
     game = pyspiel.load_game("liars_dice")
     es_solver = external_sampling_mccfr.ExternalSamplingSolver(
         game, external_sampling_mccfr.AverageType.SIMPLE)
     for _ in range(1):
         es_solver.iteration()
     conv = exploitability.nash_conv(game, es_solver.average_policy())
     print("Liar's dice, conv = {}".format(conv))
     self.assertLess(conv, 2)
Code example #14
 def test_external_sampling_kuhn_3p_simple(self):
     np.random.seed(SEED)
     game = pyspiel.load_game("kuhn_poker", {"players": 3})
     es_solver = external_sampling_mccfr.ExternalSamplingSolver(
         game, external_sampling_mccfr.AverageType.SIMPLE)
     for _ in range(10):
         es_solver.iteration()
     conv = exploitability.nash_conv(game, es_solver.average_policy())
     print("Kuhn3P, conv = {}".format(conv))
     self.assertLess(conv, 2)
Code example #15
    def test_outcome_sampling_leduc_2p(self):
        np.random.seed(SEED)
        game = pyspiel.load_game("leduc_poker")
        os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
        for _ in range(10000):
            os_solver.iteration()
        conv = exploitability.nash_conv(game, os_solver.average_policy())
        print("Leduc2P, conv = {}".format(conv))

        self.assertLess(conv, 3.07)
Code example #16
    def test_python_same_as_cpp_for_multiplayer_uniform_random_nash_conv(
            self, game_name, num_players):
        game = pyspiel.load_game(game_name, {"players": num_players})

        # TabularPolicy defaults to being a uniform random policy.
        test_policy = policy.TabularPolicy(game)
        python_nash_conv = exploitability.nash_conv(game, test_policy)
        cpp_nash_conv = pyspiel.nash_conv(
            game, policy_utils.policy_to_dict(test_policy, game))
        self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
Code example #17
 def test_external_sampling_kuhn_2p_full(self):
     np.random.seed(SEED)
     game = pyspiel.load_game("kuhn_poker")
     es_solver = external_sampling_mccfr.ExternalSamplingSolver(
         game, external_sampling_mccfr.AverageType.FULL)
     for _ in range(10):
         es_solver.iteration()
     conv = exploitability.nash_conv(game, es_solver.average_policy())
     print("Kuhn2P, conv = {}".format(conv))
     self.assertLess(conv, 1)
Code example #18
 def test_cpp_python_cfr_kuhn(self):
     game = pyspiel.load_game("kuhn_poker")
     solver = pyspiel.CFRSolver(game)
     for _ in range(100):
         solver.evaluate_and_update_policy()
     pyspiel_average_policy = solver.tabular_average_policy()
     cpp_nash_conv = pyspiel.nash_conv(game, pyspiel_average_policy)
     python_policy = policy.pyspiel_policy_to_python_policy(
         game, pyspiel_average_policy)
     python_nash_conv = exploitability.nash_conv(game, python_policy)
     self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
Code example #19
 def test_outcome_sampling_kuhn_2p(self):
     np.random.seed(SEED)
     game = pyspiel.load_game("kuhn_poker")
     os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
     for _ in range(1000):
         os_solver.iteration()
     conv = exploitability.nash_conv(
         game,
         policy.PolicyFromCallable(game, os_solver.callable_avg_policy()))
     print("Kuhn2P, conv = {}".format(conv))
     self.assertGreater(conv, 0.2)
     self.assertLess(conv, 0.3)
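Note that this version wraps the solver's callable_avg_policy() in policy.PolicyFromCallable, an older OpenSpiel API; in newer releases the solver's average_policy() can be passed to nash_conv directly, as in code example #3. A sketch of the newer equivalent (availability depends on your OpenSpiel version):

# Newer-API equivalent of the nash_conv call above (compare code example #3).
conv = exploitability.nash_conv(game, os_solver.average_policy())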
Code example #20
File: mccfr_example.py  Project: ngrupen/open_spiel
def main(_):
    game = pyspiel.load_game(FLAGS.game, {"players": FLAGS.players})
    if FLAGS.sampling == "external":
        cfr_solver = external_mccfr.ExternalSamplingSolver(
            game, external_mccfr.AverageType.SIMPLE)
    else:
        cfr_solver = outcome_mccfr.OutcomeSamplingSolver(game)
    for i in range(FLAGS.iterations):
        cfr_solver.iteration()
        if i % FLAGS.print_freq == 0:
            conv = exploitability.nash_conv(game, cfr_solver.average_policy())
            print("Iteration {} exploitability {}".format(i, conv))
Code example #21
 def test_outcome_sampling_leduc_2p(self):
   np.random.seed(SEED)
   game = pyspiel.load_game("leduc_poker")
   os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
   for _ in range(1000):
     os_solver.iteration()
   conv = exploitability.nash_conv(
       game,
       policy.tabular_policy_from_callable(game,
                                           os_solver.callable_avg_policy()))
   print("Leduc2P, conv = {}".format(conv))
   self.assertGreater(conv, 4.5)
   self.assertLess(conv, 4.6)
Code example #22
 def test_external_sampling_kuhn_2p_simple(self):
   np.random.seed(SEED)
   game = pyspiel.load_game("kuhn_poker")
   es_solver = external_sampling_mccfr.ExternalSamplingSolver(
       game, external_sampling_mccfr.AverageType.SIMPLE)
   for _ in range(10):
     es_solver.iteration()
   conv = exploitability.nash_conv(
       game,
       policy.tabular_policy_from_callable(game,
                                           es_solver.callable_avg_policy()))
   print("Kuhn2P, conv = {}".format(conv))
   self.assertLess(conv, 1)
Code example #23
File: kuhn.py  Project: onon6/ML_Project
def main(unused_argv):
    game = pyspiel.load_game("kuhn_poker")
    cfr_solver = cfr.CFRSolver(game)

    episodes = []
    exploits = []
    nashes = []

    # Train the agent for a specific amount of episodes
    for ep in range(FLAGS.num_train_episodes):
        print("Running episode {} of {}".format(ep, FLAGS.num_train_episodes))
        cfr_solver.evaluate_and_update_policy()
        avg_pol = cfr_solver.average_policy()

        # Calculate the exploitability and nash convergence
        expl = exploitability.exploitability(game, avg_pol)
        nash = exploitability.nash_conv(game, avg_pol)

        exploits.append(expl)
        nashes.append(nash)
        episodes.append(ep)

    # Get the average policy
    average_policy = cfr_solver.average_policy()
    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    cur_pol = cfr_solver.current_policy()

    # Plot the exploitability. Note: save the figure before calling show(),
    # since show() can leave the current figure blank if saved afterwards.
    plt.plot(episodes, exploits, "-r", label="Exploitability")
    plt.xscale("log")
    plt.yscale("log")
    plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
    plt.legend(loc="upper right")
    plt.savefig("cfr_expl.png")
    plt.show()

    plt.figure()

    # Plot the nash convergence (again saving before show())
    plt.plot(episodes, nashes, "-r", label="NashConv")
    plt.xscale("log")
    plt.yscale("log")
    plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
    plt.legend(loc="upper right")
    plt.savefig("cfr_nash.png")
    plt.show()

    print(average_policy)
    print(average_policy_values)
    policy_to_csv(game, average_policy, "./kuhn_policy.csv")
Code example #24
 def test_external_sampling_kuhn_3p_full(self):
   np.random.seed(SEED)
   game = pyspiel.load_game("kuhn_poker",
                            {"players": pyspiel.GameParameter(3)})
   es_solver = external_sampling_mccfr.ExternalSamplingSolver(
       game, external_sampling_mccfr.AverageType.FULL)
   for _ in range(10):
     es_solver.iteration()
   conv = exploitability.nash_conv(
       game,
       policy.tabular_policy_from_callable(game,
                                           es_solver.callable_avg_policy()))
   print("Kuhn3P, conv = {}".format(conv))
   self.assertLess(conv, 2)
Code example #25
 def test_outcome_sampling_kuhn_3p(self):
   np.random.seed(SEED)
   game = pyspiel.load_game("kuhn_poker",
                            {"players": pyspiel.GameParameter(3)})
   os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
   for _ in range(1000):
     os_solver.iteration()
   conv = exploitability.nash_conv(
       game,
       policy.tabular_policy_from_callable(game,
                                           os_solver.callable_avg_policy()))
   print("Kuhn3P, conv = {}".format(conv))
   self.assertGreater(conv, 0.3)
   self.assertLess(conv, 0.4)
Code example #26
  def test_cpp_and_python_cfr_br(self, game, solver_cls,
                                 expected_exploitability):
    solver = solver_cls(game)
    for step in range(5):
      solver.evaluate_and_update_policy()

      # We do not compare the policy directly as we do not have an easy way to
      # convert one to the other, so we use the exploitability as a proxy.
      avg_policy = solver.average_policy()
      if solver_cls == pyspiel.CFRBRSolver:
        exploitability_ = pyspiel.nash_conv(game, avg_policy)
      else:
        exploitability_ = exploitability.nash_conv(game, avg_policy)

      self.assertEqual(expected_exploitability[step], exploitability_)
Code example #27
    def test_cpp_algorithms_identical_to_python_algorithm(
            self, game, cpp_class, python_class):
        cpp_solver = cpp_class(game)
        python_solver = python_class(game)

        for _ in range(5):
            cpp_solver.evaluate_and_update_policy()
            python_solver.evaluate_and_update_policy()

            cpp_avg_policy = cpp_solver.average_policy()
            python_avg_policy = python_solver.average_policy()

            # We do not compare the policy directly as we do not have an easy way to
            # convert one to the other, so we use the exploitability as a proxy.
            cpp_expl = pyspiel.nash_conv(game, cpp_avg_policy)
            python_expl = exploitability.nash_conv(game, python_avg_policy)
            self.assertEqual(cpp_expl, python_expl)
        # Then we also check the CurrentPolicy, just to check it is giving the same
        # results too
        cpp_current_policy = cpp_solver.current_policy()
        python_current_policy = python_solver.current_policy()
        cpp_expl = pyspiel.nash_conv(game, cpp_current_policy)
        python_expl = exploitability.nash_conv(game, python_current_policy)
        self.assertEqual(cpp_expl, python_expl)
Code example #28
def external_sampling_monte_carlo_counterfactual_regret_minimization(
        seq_game, number_of_iterations, compute_metrics=False):
    cfr_solver = external_mccfr.ExternalSamplingSolver(
        seq_game, external_mccfr.AverageType.SIMPLE)
    tick_time = time.time()
    # print("CFRSolver initialized.")
    for _ in range(number_of_iterations):
        cfr_solver.iteration()
    timing = time.time() - tick_time
    # print("Finish.")
    if compute_metrics:
        nash_conv = exploitability.nash_conv(seq_game,
                                             cfr_solver.average_policy())
        return timing, cfr_solver.average_policy(), nash_conv
    return timing, cfr_solver.average_policy()
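A possible invocation of this helper, pairing it with the turn-based conversion used in code examples #1 and #2 (the game name and iteration count here are illustrative, not from the original file):

# Hypothetical usage sketch; assumes the Python dynamic routing game is
# registered (as it is in the source file this helper comes from).
game = pyspiel.load_game("python_dynamic_routing")
seq_game = pyspiel.convert_to_turn_based(game)
timing, avg_policy, nash_conv_value = (
    external_sampling_monte_carlo_counterfactual_regret_minimization(
        seq_game, 100, compute_metrics=True))
print("Ran 100 iterations in {:.2f}s, NashConv = {}".format(
    timing, nash_conv_value))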
Code example #29
def gpsro_looper(env, oracle, agents):
    """Initializes and executes the GPSRO training loop."""
    sample_from_marginals = True  # TODO(somidshafiei) set False for alpharank
    training_strategy_selector = (
        FLAGS.training_strategy_selector
        or strategy_selectors.probabilistic_strategy_selector)

    g_psro_solver = psro_v2.PSROSolver(
        env.game,
        oracle,
        initial_policies=agents,
        training_strategy_selector=training_strategy_selector,
        rectifier=FLAGS.rectifier,
        sims_per_entry=FLAGS.sims_per_entry,
        number_policies_selected=FLAGS.number_policies_selected,
        meta_strategy_method=FLAGS.meta_strategy_method,
        prd_iterations=50000,
        prd_gamma=1e-10,
        sample_from_marginals=sample_from_marginals,
        symmetric_game=FLAGS.symmetric_game)

    start_time = time.time()
    for gpsro_iteration in range(FLAGS.gpsro_iterations):
        if FLAGS.verbose:
            print("Iteration : {}".format(gpsro_iteration))
            print("Time so far: {}".format(time.time() - start_time))
        g_psro_solver.iteration()
        meta_game = g_psro_solver.get_meta_game()
        meta_probabilities = g_psro_solver.get_meta_strategies()
        policies = g_psro_solver.get_policies()

        if FLAGS.verbose:
            print("Meta game : {}".format(meta_game))
            print("Probabilities : {}".format(meta_probabilities))

        # The following lines only work for sequential games for the moment.
    if (env.game.get_type().dynamics ==
            pyspiel.GameType.Dynamics.SEQUENTIAL):
            aggregator = policy_aggregator.PolicyAggregator(env.game)
            aggr_policies = aggregator.aggregate(range(FLAGS.n_players),
                                                 policies, meta_probabilities)

            exploitabilities, expl_per_player = exploitability.nash_conv(
                env.game, aggr_policies, return_only_nash_conv=False)

            _ = print_policy_analysis(policies, env.game, FLAGS.verbose)
            if FLAGS.verbose:
                print("Exploitabilities : {}".format(exploitabilities))
                print(
                    "Exploitabilities per player : {}".format(expl_per_player))
Code example #30
 def test_2p_nash_conv(self):
   # Note: the first-action policy is "AlwaysFold".
   kuhn_poker = pyspiel.load_game("kuhn_poker")
   leduc_poker = pyspiel.load_game("leduc_poker")
   test_parameters = [
       (kuhn_poker, policy.UniformRandomPolicy(kuhn_poker),
        0.9166666666666666),
       (kuhn_poker, policy.FirstActionPolicy(kuhn_poker), 2.),
       (kuhn_poker, data.kuhn_nash_equilibrium(alpha=0.2), 0.),
       (leduc_poker, policy.FirstActionPolicy(leduc_poker), 2.),
       (leduc_poker, policy.UniformRandomPolicy(leduc_poker),
        4.747222222222222),
   ]
   for game, test_policy, expected_value in test_parameters:
     self.assertAlmostEqual(
         exploitability.nash_conv(game, test_policy), expected_value)
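The zero expected value for data.kuhn_nash_equilibrium(alpha=0.2) reflects that Kuhn poker has a one-parameter family of Nash equilibria (alpha in [0, 1/3]), each of which should evaluate to (numerically) zero NashConv. A hedged spot-check; the import path for `data` is an assumption based on OpenSpiel's layout:

import pyspiel
from open_spiel.python.algorithms import exploitability
from open_spiel.python.games import data  # assumed location of kuhn_nash_equilibrium

game = pyspiel.load_game("kuhn_poker")
eq_policy = data.kuhn_nash_equilibrium(alpha=0.2)
print(exploitability.nash_conv(game, eq_policy))  # expected: ~0.0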