def test_int_mccfr_on_turn_based_game_with_exploitability(self):
  """Check if outcome sampling MCCFR can be applied."""
  game = pyspiel.load_game(
      "python_dynamic_routing(max_num_time_step=5,time_step_length=1.0)")
  seq_game = pyspiel.convert_to_turn_based(game)
  cfr_solver = outcome_mccfr.OutcomeSamplingSolver(seq_game)
  for _ in range(_NUM_ITERATION_CFR_TEST):
    cfr_solver.iteration()
  exploitability.nash_conv(seq_game, cfr_solver.average_policy())
def test_cfr_on_turn_based_game_with_exploitability(self):
  """Check if CFR can be applied to the sequential game."""
  game = pyspiel.load_game(
      "python_dynamic_routing(max_num_time_step=5,time_step_length=1.0)")
  seq_game = pyspiel.convert_to_turn_based(game)
  cfr_solver = cfr.CFRSolver(seq_game)
  for _ in range(_NUM_ITERATION_CFR_TEST):
    cfr_solver.evaluate_and_update_policy()
  exploitability.nash_conv(seq_game, cfr_solver.average_policy())
def test_outcome_sampling_kuhn_2p(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("kuhn_poker")
  os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
  for _ in range(10000):
    os_solver.iteration()
  conv = exploitability.nash_conv(game, os_solver.average_policy())
  print("Kuhn2P, conv = {}".format(conv))
  self.assertLess(conv, 0.17)
  # Ensure that to_tabular() works on the returned policy and that the
  # tabular policy is equivalent.
  tabular_policy = os_solver.average_policy().to_tabular()
  conv2 = exploitability.nash_conv(game, tabular_policy)
  self.assertEqual(conv, conv2)
def test_external_sampling_leduc_2p_simple(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("leduc_poker")
  es_solver = external_sampling_mccfr.ExternalSamplingSolver(
      game, external_sampling_mccfr.AverageType.SIMPLE)
  for _ in range(10):
    es_solver.iteration()
  conv = exploitability.nash_conv(game, es_solver.average_policy())
  print("Leduc2P, conv = {}".format(conv))
  self.assertLess(conv, 5)
  # Ensure that to_tabular() works on the returned policy and that the
  # tabular policy is equivalent.
  tabular_policy = es_solver.average_policy().to_tabular()
  conv2 = exploitability.nash_conv(game, tabular_policy)
  self.assertEqual(conv, conv2)
def print_algorithm_results(game, callable_policy, algorithm_name):
  print(algorithm_name.upper())
  tabular_policy = tabular_policy_from_callable(game, callable_policy)
  policy_exploitability = exploitability(game, tabular_policy)
  policy_nashconv = nash_conv(game, tabular_policy)
  print("exploitability = {}".format(policy_exploitability))
  print("nashconv = {}".format(policy_nashconv))
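# A minimal usage sketch for print_algorithm_results above. The bare names it
# uses are assumed to come from OpenSpiel, e.g.:
#   from open_spiel.python.policy import tabular_policy_from_callable
#   from open_spiel.python.algorithms.exploitability import exploitability, nash_conv
# (these import paths are an assumption); the callable below is illustrative.
import pyspiel


def uniform_random_callable(state):
  # Illustrative callable policy: uniform over the legal actions of `state`.
  legal_actions = state.legal_actions()
  prob = 1.0 / len(legal_actions)
  return {action: prob for action in legal_actions}


game = pyspiel.load_game("kuhn_poker")
print_algorithm_results(game, uniform_random_callable, "uniform random")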
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name)
  with tf.Session() as sess:
    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        sess,
        game,
        policy_network_layers=(32, 32),
        advantage_network_layers=(16, 16),
        num_iterations=FLAGS.num_iterations,
        num_traversals=FLAGS.num_traversals,
        learning_rate=1e-3,
        batch_size_advantage=None,
        batch_size_strategy=None,
        memory_capacity=1e7)
    sess.run(tf.global_variables_initializer())
    _, advantage_losses, policy_loss = deep_cfr_solver.solve()
    for player, losses in six.iteritems(advantage_losses):
      logging.info("Advantage for player %d: %s", player,
                   losses[:2] + ["..."] + losses[-2:])
      logging.info("Advantage Buffer Size for player %s: '%s'", player,
                   len(deep_cfr_solver.advantage_buffers[player]))
    logging.info("Strategy Buffer Size: '%s'",
                 len(deep_cfr_solver.strategy_buffer))
    logging.info("Final policy loss: '%s'", policy_loss)
    conv = exploitability.nash_conv(
        game,
        policy.PolicyFromCallable(game, deep_cfr_solver.action_probabilities))
    logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
def test_kuhn_poker_uniform_random(self):
  # The NashConv of the uniform random policy is 11/12 (value found on
  # Google Books):
  # https://link.springer.com/chapter/10.1007/978-3-319-75931-9_5
  game = pyspiel.load_game("kuhn_poker")
  test_policy = policy.UniformRandomPolicy(game)
  self.assertAlmostEqual(exploitability.nash_conv(game, test_policy), 11 / 12)
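# For reference, the same check as a self-contained sketch outside the test
# class (assumes an OpenSpiel installation; the standard open_spiel import
# paths are used).
import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
uniform = policy.UniformRandomPolicy(game)
# NashConv sums each player's best-response improvement against the profile;
# for uniform random play in Kuhn poker it equals 11/12.
print(exploitability.nash_conv(game, uniform))  # ~0.9166666666666666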
def test_shapleys_game(self):
  game = pyspiel.load_game_as_turn_based("matrix_shapleys_game")
  xfp_solver = fictitious_play.XFPSolver(game)
  for i in range(1000):
    xfp_solver.iteration()
    if i % 10 == 0:
      conv = exploitability.nash_conv(game, xfp_solver.average_policy())
      print("FP in Shapley's Game. Iter: {}, NashConv: {}".format(i, conv))
def get_algo_metrics(algo_policies, game):
  print("Extracting metrics...")
  algo_exploitabilities = {}
  algo_nashconvs = {}
  for key in algo_policies:
    algo_exploitabilities[key] = exploitability(game, algo_policies[key])
    algo_nashconvs[key] = nash_conv(game, algo_policies[key])
  return algo_exploitabilities, algo_nashconvs
def gpsro_looper(env, oracle, agents):
  """Initializes and executes the GPSRO training loop."""
  sample_from_marginals = True  # TODO(somidshafiei): set False for alpharank.
  training_strategy_selector = (
      FLAGS.training_strategy_selector or
      strategy_selectors.probabilistic_strategy_selector)

  if FLAGS.meta_strategy_method == "alpharank":
    # TODO(somidshafiei): Implement epsilon-sweep for OpenSpiel alpharank.
    print("\n")
    print(
        "==================================================================\n"
        "============================ Warning =============================\n"
        "==================================================================\n")
    print(
        "Selected alpharank. Warning: the current alpharank version is "
        "unstable. It can raise errors because of infinite / NaN elements in "
        "arrays. A fix should land in upcoming OpenSpiel releases.")
    print("\n")

  g_psro_solver = psro_v2.PSROSolver(
      env.game,
      oracle,
      initial_policies=agents,
      training_strategy_selector=training_strategy_selector,
      rectifier=FLAGS.rectifier,
      sims_per_entry=FLAGS.sims_per_entry,
      number_policies_selected=FLAGS.number_policies_selected,
      meta_strategy_method=FLAGS.meta_strategy_method,
      prd_iterations=50000,
      prd_gamma=1e-10,
      sample_from_marginals=sample_from_marginals,
      symmetric_game=FLAGS.symmetric_game)

  start_time = time.time()
  for gpsro_iteration in range(FLAGS.gpsro_iterations):
    if FLAGS.verbose:
      print("Iteration : {}".format(gpsro_iteration))
      print("Time so far: {}".format(time.time() - start_time))
    g_psro_solver.iteration()
    meta_game = g_psro_solver.get_meta_game()
    meta_probabilities = g_psro_solver.get_meta_strategies()
    policies = g_psro_solver.get_policies()
    if FLAGS.verbose:
      print("Meta game : {}".format(meta_game))
      print("Probabilities : {}".format(meta_probabilities))

    # Aggregate the meta-strategy mixture into one policy per player, then
    # measure its NashConv (total and per player).
    aggregator = policy_aggregator.PolicyAggregator(env.game)
    aggr_policies = aggregator.aggregate(
        range(FLAGS.n_players), policies, meta_probabilities)
    exploitabilities, expl_per_player = exploitability.nash_conv(
        env.game, aggr_policies, return_only_nash_conv=False)
    _ = print_policy_analysis(policies, env.game, FLAGS.verbose)
    if FLAGS.verbose:
      print("Exploitabilities : {}".format(exploitabilities))
      print("Exploitabilities per player : {}".format(expl_per_player))
def test_outcome_sampling_kuhn_3p(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("kuhn_poker", {"players": 3})
  os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
  for _ in range(10000):
    os_solver.iteration()
  conv = exploitability.nash_conv(game, os_solver.average_policy())
  print("Kuhn3P, conv = {}".format(conv))
  self.assertLess(conv, 0.22)
def test_matching_pennies_3p(self):
  game = pyspiel.load_game_as_turn_based("matching_pennies_3p")
  xfp_solver = fictitious_play.XFPSolver(game)
  for i in range(1000):
    xfp_solver.iteration()
    if i % 10 == 0:
      conv = exploitability.nash_conv(game, xfp_solver.average_policy())
      print("FP in Matching Pennies 3p. Iter: {}, NashConv: {}".format(
          i, conv))
def disabled_test_external_sampling_liars_dice_2p_simple(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("liars_dice")
  es_solver = external_sampling_mccfr.ExternalSamplingSolver(
      game, external_sampling_mccfr.AverageType.SIMPLE)
  for _ in range(1):
    es_solver.iteration()
  conv = exploitability.nash_conv(game, es_solver.average_policy())
  print("Liar's dice, conv = {}".format(conv))
  self.assertLess(conv, 2)
def test_external_sampling_kuhn_3p_simple(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("kuhn_poker", {"players": 3})
  es_solver = external_sampling_mccfr.ExternalSamplingSolver(
      game, external_sampling_mccfr.AverageType.SIMPLE)
  for _ in range(10):
    es_solver.iteration()
  conv = exploitability.nash_conv(game, es_solver.average_policy())
  print("Kuhn3P, conv = {}".format(conv))
  self.assertLess(conv, 2)
def test_outcome_sampling_leduc_2p(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("leduc_poker")
  os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
  for _ in range(10000):
    os_solver.iteration()
  conv = exploitability.nash_conv(game, os_solver.average_policy())
  print("Leduc2P, conv = {}".format(conv))
  self.assertLess(conv, 3.07)
def test_python_same_as_cpp_for_multiplayer_uniform_random_nash_conv(
    self, game_name, num_players):
  game = pyspiel.load_game(game_name, {"players": num_players})
  # TabularPolicy defaults to being a uniform random policy.
  test_policy = policy.TabularPolicy(game)
  python_nash_conv = exploitability.nash_conv(game, test_policy)
  cpp_nash_conv = pyspiel.nash_conv(
      game, policy_utils.policy_to_dict(test_policy, game))
  self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
def test_external_sampling_kuhn_2p_full(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("kuhn_poker")
  es_solver = external_sampling_mccfr.ExternalSamplingSolver(
      game, external_sampling_mccfr.AverageType.FULL)
  for _ in range(10):
    es_solver.iteration()
  conv = exploitability.nash_conv(game, es_solver.average_policy())
  print("Kuhn2P, conv = {}".format(conv))
  self.assertLess(conv, 1)
def test_cpp_python_cfr_kuhn(self):
  game = pyspiel.load_game("kuhn_poker")
  solver = pyspiel.CFRSolver(game)
  for _ in range(100):
    solver.evaluate_and_update_policy()
  pyspiel_average_policy = solver.tabular_average_policy()
  cpp_nash_conv = pyspiel.nash_conv(game, pyspiel_average_policy)
  python_policy = policy.pyspiel_policy_to_python_policy(
      game, pyspiel_average_policy)
  python_nash_conv = exploitability.nash_conv(game, python_policy)
  self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
def test_outcome_sampling_kuhn_2p(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("kuhn_poker")
  os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
  for _ in range(1000):
    os_solver.iteration()
  conv = exploitability.nash_conv(
      game, policy.PolicyFromCallable(game, os_solver.callable_avg_policy()))
  print("Kuhn2P, conv = {}".format(conv))
  self.assertGreater(conv, 0.2)
  self.assertLess(conv, 0.3)
def main(_):
  game = pyspiel.load_game(FLAGS.game, {"players": FLAGS.players})
  if FLAGS.sampling == "external":
    cfr_solver = external_mccfr.ExternalSamplingSolver(
        game, external_mccfr.AverageType.SIMPLE)
  else:
    cfr_solver = outcome_mccfr.OutcomeSamplingSolver(game)
  for i in range(FLAGS.iterations):
    cfr_solver.iteration()
    if i % FLAGS.print_freq == 0:
      conv = exploitability.nash_conv(game, cfr_solver.average_policy())
      print("Iteration {} exploitability {}".format(i, conv))
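# The main above reads absl flags named game, players, sampling, iterations
# and print_freq. A plausible definition block is sketched here; the defaults
# are illustrative, not taken from the original file.
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("game", "kuhn_poker", "Name of the game.")
flags.DEFINE_integer("players", 2, "Number of players.")
flags.DEFINE_string("sampling", "external", "Sampling mode: external or outcome.")
flags.DEFINE_integer("iterations", 10000, "Number of MCCFR iterations.")
flags.DEFINE_integer("print_freq", 100, "Iterations between exploitability prints.")

if __name__ == "__main__":
  app.run(main)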
def test_outcome_sampling_leduc_2p(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("leduc_poker")
  os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
  for _ in range(1000):
    os_solver.iteration()
  conv = exploitability.nash_conv(
      game,
      policy.tabular_policy_from_callable(game,
                                          os_solver.callable_avg_policy()))
  print("Leduc2P, conv = {}".format(conv))
  self.assertGreater(conv, 4.5)
  self.assertLess(conv, 4.6)
def test_external_sampling_kuhn_2p_simple(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("kuhn_poker")
  es_solver = external_sampling_mccfr.ExternalSamplingSolver(
      game, external_sampling_mccfr.AverageType.SIMPLE)
  for _ in range(10):
    es_solver.iteration()
  conv = exploitability.nash_conv(
      game,
      policy.tabular_policy_from_callable(game,
                                          es_solver.callable_avg_policy()))
  print("Kuhn2P, conv = {}".format(conv))
  self.assertLess(conv, 1)
def main(unused_argv):
  game = pyspiel.load_game("kuhn_poker")
  cfr_solver = cfr.CFRSolver(game)
  episodes = []
  exploits = []
  nashes = []
  # Train the solver for the specified number of episodes.
  for ep in range(FLAGS.num_train_episodes):
    print("Running episode {} of {}".format(ep, FLAGS.num_train_episodes))
    cfr_solver.evaluate_and_update_policy()
    avg_pol = cfr_solver.average_policy()
    # Track the exploitability and NashConv of the average policy.
    expl = exploitability.exploitability(game, avg_pol)
    nash = exploitability.nash_conv(game, avg_pol)
    exploits.append(expl)
    nashes.append(nash)
    episodes.append(ep)

  # Get the average policy and its expected value for both players.
  average_policy = cfr_solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  cur_pol = cfr_solver.current_policy()  # Kept for inspection; unused below.

  # Plot the exploitability. Save before show(): show() can leave the active
  # figure empty, so calling savefig() afterwards may write a blank image.
  plt.plot(episodes, exploits, "-r", label="Exploitability")
  plt.xscale("log")
  plt.yscale("log")
  plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
  plt.legend(loc="upper right")
  plt.savefig("cfr_expl.png")
  plt.show()
  plt.figure()

  # Plot the NashConv.
  plt.plot(episodes, nashes, "-r", label="NashConv")
  plt.xscale("log")
  plt.yscale("log")
  plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
  plt.legend(loc="upper right")
  plt.savefig("cfr_nash.png")
  plt.show()

  print(average_policy)
  print(average_policy_values)
  policy_to_csv(game, average_policy, "./kuhn_policy.csv")
def test_external_sampling_kuhn_3p_full(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("kuhn_poker",
                           {"players": pyspiel.GameParameter(3)})
  es_solver = external_sampling_mccfr.ExternalSamplingSolver(
      game, external_sampling_mccfr.AverageType.FULL)
  for _ in range(10):
    es_solver.iteration()
  conv = exploitability.nash_conv(
      game,
      policy.tabular_policy_from_callable(game,
                                          es_solver.callable_avg_policy()))
  print("Kuhn3P, conv = {}".format(conv))
  self.assertLess(conv, 2)
def test_outcome_sampling_kuhn_3p(self):
  np.random.seed(SEED)
  game = pyspiel.load_game("kuhn_poker",
                           {"players": pyspiel.GameParameter(3)})
  os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
  for _ in range(1000):
    os_solver.iteration()
  conv = exploitability.nash_conv(
      game,
      policy.tabular_policy_from_callable(game,
                                          os_solver.callable_avg_policy()))
  print("Kuhn3P, conv = {}".format(conv))
  self.assertGreater(conv, 0.3)
  self.assertLess(conv, 0.4)
def test_cpp_and_python_cfr_br(self, game, solver_cls,
                               expected_exploitability):
  solver = solver_cls(game)
  for step in range(5):
    solver.evaluate_and_update_policy()
    # We do not compare the policy directly as we do not have an easy way to
    # convert one to the other, so we use the exploitability as a proxy.
    avg_policy = solver.average_policy()
    if solver_cls == pyspiel.CFRBRSolver:
      exploitability_ = pyspiel.nash_conv(game, avg_policy)
    else:
      exploitability_ = exploitability.nash_conv(game, avg_policy)
    self.assertEqual(expected_exploitability[step], exploitability_)
def test_cpp_algorithms_identical_to_python_algorithm(self, game, cpp_class,
                                                      python_class):
  cpp_solver = cpp_class(game)
  python_solver = python_class(game)
  for _ in range(5):
    cpp_solver.evaluate_and_update_policy()
    python_solver.evaluate_and_update_policy()

  cpp_avg_policy = cpp_solver.average_policy()
  python_avg_policy = python_solver.average_policy()
  # We do not compare the policy directly as we do not have an easy way to
  # convert one to the other, so we use the exploitability as a proxy.
  cpp_expl = pyspiel.nash_conv(game, cpp_avg_policy)
  python_expl = exploitability.nash_conv(game, python_avg_policy)
  self.assertEqual(cpp_expl, python_expl)
  # Also check that the current policies give the same results.
  cpp_current_policy = cpp_solver.current_policy()
  python_current_policy = python_solver.current_policy()
  cpp_expl = pyspiel.nash_conv(game, cpp_current_policy)
  python_expl = exploitability.nash_conv(game, python_current_policy)
  self.assertEqual(cpp_expl, python_expl)
def external_sampling_monte_carlo_counterfactual_regret_minimization(
    seq_game, number_of_iterations, compute_metrics=False):
  cfr_solver = external_mccfr.ExternalSamplingSolver(
      seq_game, external_mccfr.AverageType.SIMPLE)
  tick_time = time.time()
  for _ in range(number_of_iterations):
    cfr_solver.iteration()
  timing = time.time() - tick_time
  if compute_metrics:
    nash_conv = exploitability.nash_conv(seq_game,
                                         cfr_solver.average_policy())
    return timing, cfr_solver.average_policy(), nash_conv
  return timing, cfr_solver.average_policy()
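# Usage sketch for the helper above, pairing it with the turn-based
# conversion used in the tests earlier in this section (the routing-game
# parameters are taken from those tests; the iteration count is illustrative).
import pyspiel

game = pyspiel.load_game(
    "python_dynamic_routing(max_num_time_step=5,time_step_length=1.0)")
seq_game = pyspiel.convert_to_turn_based(game)
timing, avg_policy, nash_conv_value = (
    external_sampling_monte_carlo_counterfactual_regret_minimization(
        seq_game, number_of_iterations=10, compute_metrics=True))
print("Ran 10 iterations in {:.2f}s; NashConv = {}".format(
    timing, nash_conv_value))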
def gpsro_looper(env, oracle, agents):
  """Initializes and executes the GPSRO training loop."""
  sample_from_marginals = True  # TODO(somidshafiei): set False for alpharank.
  training_strategy_selector = (
      FLAGS.training_strategy_selector or
      strategy_selectors.probabilistic_strategy_selector)

  g_psro_solver = psro_v2.PSROSolver(
      env.game,
      oracle,
      initial_policies=agents,
      training_strategy_selector=training_strategy_selector,
      rectifier=FLAGS.rectifier,
      sims_per_entry=FLAGS.sims_per_entry,
      number_policies_selected=FLAGS.number_policies_selected,
      meta_strategy_method=FLAGS.meta_strategy_method,
      prd_iterations=50000,
      prd_gamma=1e-10,
      sample_from_marginals=sample_from_marginals,
      symmetric_game=FLAGS.symmetric_game)

  start_time = time.time()
  for gpsro_iteration in range(FLAGS.gpsro_iterations):
    if FLAGS.verbose:
      print("Iteration : {}".format(gpsro_iteration))
      print("Time so far: {}".format(time.time() - start_time))
    g_psro_solver.iteration()
    meta_game = g_psro_solver.get_meta_game()
    meta_probabilities = g_psro_solver.get_meta_strategies()
    policies = g_psro_solver.get_policies()
    if FLAGS.verbose:
      print("Meta game : {}".format(meta_game))
      print("Probabilities : {}".format(meta_probabilities))

    # The following lines only work for sequential games for the moment.
    if env.game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL:
      aggregator = policy_aggregator.PolicyAggregator(env.game)
      aggr_policies = aggregator.aggregate(
          range(FLAGS.n_players), policies, meta_probabilities)
      exploitabilities, expl_per_player = exploitability.nash_conv(
          env.game, aggr_policies, return_only_nash_conv=False)
      _ = print_policy_analysis(policies, env.game, FLAGS.verbose)
      if FLAGS.verbose:
        print("Exploitabilities : {}".format(exploitabilities))
        print("Exploitabilities per player : {}".format(expl_per_player))
def test_2p_nash_conv(self):
  # Note: the first-action policy is "AlwaysFold" in these poker games.
  kuhn_poker = pyspiel.load_game("kuhn_poker")
  leduc_poker = pyspiel.load_game("leduc_poker")
  test_parameters = [
      (kuhn_poker, policy.UniformRandomPolicy(kuhn_poker),
       0.9166666666666666),
      (kuhn_poker, policy.FirstActionPolicy(kuhn_poker), 2.),
      (kuhn_poker, data.kuhn_nash_equilibrium(alpha=0.2), 0.),
      (leduc_poker, policy.FirstActionPolicy(leduc_poker), 2.),
      (leduc_poker, policy.UniformRandomPolicy(leduc_poker),
       4.747222222222222),
  ]
  for game, test_policy, expected_value in test_parameters:
    self.assertAlmostEqual(
        exploitability.nash_conv(game, test_policy), expected_value)