def compute_regret_policy_against_pure_policy_sim_game(game,
                                                       policy,
                                                       compute_true_value=False,
                                                       num_sample=100):
  time_tick = time.time()
  if compute_true_value:
    expected_value_policy = expected_game_score.policy_value(
        game.new_initial_state(), policy)[0]
  else:
    expected_value_policy = get_expected_value_sim_game(
        game, policy, num_sample)
  worse_regret = 0
  policies = [
      PathBCEResponse(game, policy, 0),
      PathBCDEResponse(game, policy, 0),
      PathBDEResponse(game, policy, 0)
  ]
  for deviation_policy in policies:
    if compute_true_value:
      expected_value_noise = expected_game_score.policy_value(
          game.new_initial_state(), deviation_policy)[0]
    else:
      expected_value_noise = get_expected_value_sim_game(
          game, deviation_policy, num_sample, player=0)
    approximate_regret = expected_value_noise - expected_value_policy
    worse_regret = max(worse_regret, approximate_regret)
  return worse_regret, time.time() - time_tick
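# A hypothetical usage sketch for the helper above. The game name, the policy
# constructor, and the sample count below are illustrative assumptions, not
# taken from the surrounding example:
#
#   game = pyspiel.load_game("python_dynamic_routing")
#   uniform_policy = policy_module.UniformRandomPolicy(game)
#   regret, elapsed = compute_regret_policy_against_pure_policy_sim_game(
#       game, uniform_policy, compute_true_value=False, num_sample=50)
#   print(f"Approximate regret: {regret:.3f} (computed in {elapsed:.1f}s)")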
def test_learning_and_applying_mfg_policy_in_n_player_game(self):
  """Test converting a learnt MFG policy and applying it in the N-player game."""
  # Learn the Braess MFG Nash equilibrium.
  mfg_game = pyspiel.load_game("python_mfg_dynamic_routing")
  omd = mirror_descent.MirrorDescent(mfg_game, lr=1)
  for _ in range(10):
    omd.iteration()
  mfg_policy = omd.get_policy()
  n_player_game = pyspiel.load_game("python_dynamic_routing")
  mfg_derived_policy = (dynamic_routing_to_mean_field_game.
                        DerivedNPlayerPolicyFromMeanFieldPolicy(
                            n_player_game, mfg_policy))
  expected_game_score.policy_value(n_player_game.new_initial_state(),
                                   mfg_derived_policy)
def test_expected_game_score_uniform_random_iterated_prisoner_dilemma(self):
  game = pyspiel.load_game(
      "python_iterated_prisoners_dilemma(max_game_length=6)")
  pi = policy.UniformRandomPolicy(game)
  values = expected_game_score.policy_value(game.new_initial_state(), pi)
  # 4 * (1 - 0.875**6) / 0.125 = 17.6385498
  np.testing.assert_allclose(values, [17.6385498, 17.6385498])
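# How the 17.6385498 constant above is obtained (a sketch; it assumes the
# game's default stage payoffs average 4 per player when both players act
# uniformly at random, with a per-round continuation probability of 0.875 and
# at most 6 rounds):
#   sum_{k=0}^{5} 4 * 0.875**k = 4 * (1 - 0.875**6) / (1 - 0.875) ≈ 17.6385498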
def print_average_payouts():
  # Print the average payouts given the current game and the final policies.
  game = pyspiel.load_game(FLAGS.game_name)
  average_policy = __tabular_policy_from_csv(game, "./leduc_best_policy.csv")
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  print(average_policy_values)
def test_discounted_cfr_on_kuhn(self):
  game = pyspiel.load_game("kuhn_poker")
  solver = discounted_cfr.DCFRSolver(game)
  for _ in range(300):
    solver.evaluate_and_update_policy()
  average_policy = solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  # 1/18 is the Nash value. See https://en.wikipedia.org/wiki/Kuhn_poker
  np.testing.assert_allclose(
      average_policy_values, [-1 / 18, 1 / 18], atol=1e-3)
def test_uniform_mfg_policy_conversion_to_n_player_uniform_policy(self):
  """Test that converting a uniform MFG policy yields a uniform N-player policy."""
  mfg_game = pyspiel.load_game("python_mfg_dynamic_routing", {
      "time_step_length": 0.05,
      "max_num_time_step": 100
  })
  n_player_game = pyspiel.load_game("python_dynamic_routing", {
      "time_step_length": 0.05,
      "max_num_time_step": 100
  })
  mfg_derived_policy = (dynamic_routing_to_mean_field_game.
                        DerivedNPlayerPolicyFromMeanFieldPolicy(
                            n_player_game, policy.UniformRandomPolicy(mfg_game)))
  derived_policy_value = expected_game_score.policy_value(
      n_player_game.new_initial_state(), mfg_derived_policy)
  uniform_policy_value = expected_game_score.policy_value(
      n_player_game.new_initial_state(),
      policy.UniformRandomPolicy(n_player_game))
  self.assertSequenceAlmostEqual(derived_policy_value, uniform_policy_value)
def main(unused_argv):
  game = pyspiel.load_game("kuhn_poker")
  cfr_solver = cfr.CFRSolver(game)
  episodes = []
  exploits = []
  nashes = []
  # Train the agents for the specified number of episodes.
  for ep in range(FLAGS.num_train_episodes):
    print("Running episode {} of {}".format(ep, FLAGS.num_train_episodes))
    cfr_solver.evaluate_and_update_policy()
    avg_pol = cfr_solver.average_policy()
    # Compute the exploitability and NashConv.
    expl = exploitability.exploitability(game, avg_pol)
    nash = exploitability.nash_conv(game, avg_pol)
    exploits.append(expl)
    nashes.append(nash)
    episodes.append(ep)
  # Get the average policy and its expected value.
  average_policy = cfr_solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  cur_pol = cfr_solver.current_policy()
  # Plot the exploitability. Save the figure before calling show(), otherwise
  # an empty figure may be written to disk.
  plt.plot(episodes, exploits, "-r", label="Exploitability")
  plt.xscale("log")
  plt.yscale("log")
  plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
  plt.legend(loc="upper right")
  plt.savefig("cfr_expl.png")
  plt.show()
  plt.figure()
  # Plot the Nash convergence.
  plt.plot(episodes, nashes, "-r", label="NashConv")
  plt.xscale("log")
  plt.yscale("log")
  plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
  plt.legend(loc="upper right")
  plt.savefig("cfr_nash.png")
  plt.show()
  print(average_policy)
  print(average_policy_values)
  policy_to_csv(game, average_policy, "./kuhn_policy.csv")
def main(_):
  game = pyspiel.load_game("kuhn_poker")
  cfr_solver = cfr.CFRSolver(game)
  iterations = 1000
  for i in range(iterations):
    cfr_value = cfr_solver.evaluate_and_update_policy()
    print("Game util at iteration {}: {}".format(i, cfr_value))
  average_policy = cfr_solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  print("Computed player 0 value: {}".format(average_policy_values[0]))
  print("Expected player 0 value: {}".format(-1 / 18))
def test_cfr_kuhn_poker_runs_with_multiple_players(self, linear_averaging,
                                                   regret_matching_plus,
                                                   alternating_updates):
  num_players = 3
  game = pyspiel.load_game("kuhn_poker", {"players": num_players})
  cfr_solver = cfr._CFRSolver(
      game,
      regret_matching_plus=regret_matching_plus,
      linear_averaging=linear_averaging,
      alternating_updates=alternating_updates)
  for _ in range(10):
    cfr_solver.evaluate_and_update_policy()
  average_policy = cfr_solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * num_players)
  del average_policy_values
def ficticious_play(seq_game, number_of_iterations, compute_metrics=False):
  xfp_solver = fictitious_play.XFPSolver(seq_game)
  tick_time = time.time()
  for _ in range(number_of_iterations):
    xfp_solver.iteration()
  timing = time.time() - tick_time
  # print('done')
  # average_policies = xfp_solver.average_policy_tables()
  tabular_policy = policy_module.TabularPolicy(seq_game)
  if compute_metrics:
    nash_conv = exploitability.nash_conv(seq_game, xfp_solver.average_policy())
    average_policy_values = expected_game_score.policy_value(
        seq_game.new_initial_state(), [tabular_policy])
    return timing, tabular_policy, nash_conv, average_policy_values
  return timing, tabular_policy
def test_best_response_tic_tac_toe_value_is_consistent(self):
  # This test was failing because of the use of str(state) in the best
  # response, which is imperfect recall. We now use state.history_str()
  # throughout.
  # Choose a policy at random; not the uniform random policy.
  game = pyspiel.load_game("tic_tac_toe")
  pi = policy.TabularPolicy(game)
  rng = np.random.RandomState(1234)
  pi.action_probability_array[:] = rng.rand(*pi.legal_actions_mask.shape)
  pi.action_probability_array *= pi.legal_actions_mask
  pi.action_probability_array /= np.sum(
      pi.action_probability_array, axis=1, keepdims=True)
  # Compute a best response and verify the best response value is consistent.
  br = best_response.BestResponsePolicy(game, 1, pi)
  self.assertAlmostEqual(
      expected_game_score.policy_value(game.new_initial_state(), [pi, br])[1],
      br.value(game.new_initial_state()))
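def test_str_state_is_not_a_unique_state_key(self):
  # Illustrative sketch added alongside the test above (not part of the
  # original file): two different tic-tac-toe action histories can yield the
  # same board string, which is why the best response keys on
  # state.history_str() rather than str(state).
  game = pyspiel.load_game("tic_tac_toe")
  state_a = game.new_initial_state()
  for action in [0, 3, 1, 4]:
    state_a.apply_action(action)
  state_b = game.new_initial_state()
  for action in [1, 4, 0, 3]:
    state_b.apply_action(action)
  self.assertEqual(str(state_a), str(state_b))
  self.assertNotEqual(state_a.history_str(), state_b.history_str())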
def test_xfp(self):
  game = pyspiel.load_game("kuhn_poker")
  xfp_solver = fictitious_play.XFPSolver(game)
  for _ in range(100):
    xfp_solver.iteration()
  average_policies = xfp_solver.average_policy_tables()
  tabular_policy = policy.TabularPolicy(game)
  for player_id in range(2):
    for info_state, state_policy in average_policies[player_id].items():
      policy_to_update = tabular_policy.policy_for_key(info_state)
      for action, probability in state_policy.items():
        policy_to_update[action] = probability
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [tabular_policy, tabular_policy])
  print("Kuhn 2P average values after 100 iterations")
  print("P0: {}".format(average_policy_values[0]))
  print("P1: {}".format(average_policy_values[1]))
  self.assertIsNotNone(average_policy_values)
  self.assertTrue(
      np.allclose(average_policy_values, [-1 / 18, 1 / 18], atol=1e-3))
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name)
  with tf.Session() as sess:
    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        sess,
        game,
        policy_network_layers=(16,),
        advantage_network_layers=(16,),
        num_iterations=FLAGS.num_iterations,
        num_traversals=FLAGS.num_traversals,
        learning_rate=1e-3,
        batch_size_advantage=128,
        batch_size_strategy=1024,
        memory_capacity=1e7,
        policy_network_train_steps=400,
        advantage_network_train_steps=20,
        reinitialize_advantage_networks=False)
    sess.run(tf.global_variables_initializer())
    _, advantage_losses, policy_loss = deep_cfr_solver.solve()
    for player, losses in six.iteritems(advantage_losses):
      logging.info("Advantage for player %d: %s", player,
                   losses[:2] + ["..."] + losses[-2:])
      logging.info("Advantage Buffer Size for player %s: '%s'", player,
                   len(deep_cfr_solver.advantage_buffers[player]))
    logging.info("Strategy Buffer Size: '%s'",
                 len(deep_cfr_solver.strategy_buffer))
    logging.info("Final policy loss: '%s'", policy_loss)
    average_policy = policy.tabular_policy_from_callable(
        game, deep_cfr_solver.action_probabilities)
    conv = exploitability.nash_conv(game, average_policy)
    logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    print("Computed player 0 value: {}".format(average_policy_values[0]))
    print("Expected player 0 value: {}".format(-1 / 18))
    print("Computed player 1 value: {}".format(average_policy_values[1]))
    print("Expected player 1 value: {}".format(1 / 18))
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name)
  deep_cfr_solver = deep_cfr_tf2.DeepCFRSolver(
      game,
      policy_network_layers=(64, 64, 64, 64),
      advantage_network_layers=(64, 64, 64, 64),
      num_iterations=FLAGS.num_iterations,
      num_traversals=FLAGS.num_traversals,
      learning_rate=1e-3,
      batch_size_advantage=2048,
      batch_size_strategy=2048,
      memory_capacity=1e6,
      policy_network_train_steps=5000,
      advantage_network_train_steps=500,
      reinitialize_advantage_networks=True,
      infer_device="cpu",
      train_device="cpu")
  _, advantage_losses, policy_loss = deep_cfr_solver.solve()
  for player, losses in six.iteritems(advantage_losses):
    logging.info("Advantage for player %d: %s", player,
                 losses[:2] + ["..."] + losses[-2:])
    logging.info("Advantage Buffer Size for player %s: '%s'", player,
                 len(deep_cfr_solver.advantage_buffers[player]))
  logging.info("Strategy Buffer Size: '%s'",
               len(deep_cfr_solver.strategy_buffer))
  logging.info("Final policy loss: '%s'", policy_loss)
  average_policy = policy.tabular_policy_from_callable(
      game, deep_cfr_solver.action_probabilities)
  conv = exploitability.nash_conv(game, average_policy)
  logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  print("Computed player 0 value: {}".format(average_policy_values[0]))
  print("Computed player 1 value: {}".format(average_policy_values[1]))
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name)
  deep_cfr_solver = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(32, 32),
      advantage_network_layers=(16, 16),
      num_iterations=FLAGS.num_iterations,
      num_traversals=FLAGS.num_traversals,
      learning_rate=1e-3,
      batch_size_advantage=None,
      batch_size_strategy=None,
      memory_capacity=int(1e7))
  _, advantage_losses, policy_loss = deep_cfr_solver.solve()
  for player, losses in six.iteritems(advantage_losses):
    logging.info("Advantage for player %d: %s", player,
                 losses[:2] + ["..."] + losses[-2:])
    logging.info("Advantage Buffer Size for player %s: '%s'", player,
                 len(deep_cfr_solver.advantage_buffers[player]))
  logging.info("Strategy Buffer Size: '%s'",
               len(deep_cfr_solver.strategy_buffer))
  logging.info("Final policy loss: '%s'", policy_loss)
  average_policy = policy.tabular_policy_from_callable(
      game, deep_cfr_solver.action_probabilities)
  pyspiel_policy = policy.python_policy_to_pyspiel_policy(average_policy)
  conv = pyspiel.nash_conv(game, pyspiel_policy)
  logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  logging.info("Computed player 0 value: %.2f (expected: %.2f).",
               average_policy_values[0], -1 / 18)
  logging.info("Computed player 1 value: %.2f (expected: %.2f).",
               average_policy_values[1], 1 / 18)
def test_braess_paradox(self):
  """Test that the Braess paradox can be reproduced with the routing game."""
  num_player = 8
  braess_network = dynamic_routing_utils.Network(
      {
          "O": "A",
          "A": ["B", "C"],
          "B": ["C", "D"],
          "C": ["D"],
          "D": ["E"],
          "E": []
      },
      node_position={
          "O": (0, 0),
          "A": (1, 0),
          "B": (2, 1),
          "C": (2, -1),
          "D": (3, 0),
          "E": (4, 0)
      },
      bpr_a_coefficient={
          "O->A": 0,
          "A->B": 1.0,
          "A->C": 0,
          "B->C": 0,
          "B->D": 0,
          "C->D": 1.0,
          "D->E": 0
      },
      bpr_b_coefficient={
          "O->A": 1.0,
          "A->B": 1.0,
          "A->C": 1.0,
          "B->C": 1.0,
          "B->D": 1.0,
          "C->D": 1.0,
          "D->E": 1.0
      },
      capacity={
          "O->A": num_player,
          "A->B": num_player,
          "A->C": num_player,
          "B->C": num_player,
          "B->D": num_player,
          "C->D": num_player,
          "D->E": num_player
      },
      free_flow_travel_time={
          "O->A": 0,
          "A->B": 1.0,
          "A->C": 2.0,
          "B->C": 0.25,
          "B->D": 2.0,
          "C->D": 1.0,
          "D->E": 0
      })
  demand = [
      dynamic_routing_utils.Vehicle("O->A", "D->E") for _ in range(num_player)
  ]
  game = dynamic_routing.DynamicRoutingGame(
      {"time_step_length": 0.125, "max_num_time_step": 40},
      network=braess_network,
      vehicles=demand)

  class TruePathPolicy(policy.Policy):

    def __init__(self, game):
      super().__init__(game, list(range(num_player)))
      self._path = {}

    def action_probabilities(self, state, player_id=None):
      assert player_id is not None
      legal_actions = state.legal_actions(player_id)
      if not legal_actions:
        return {dynamic_routing_utils.NO_POSSIBLE_ACTION: 1.0}
      elif len(legal_actions) == 1:
        return {legal_actions[0]: 1.0}
      else:
        if legal_actions[0] == 2:
          if self._path[player_id] in ["top", "middle"]:
            return {2: 1.0}
          elif self._path[player_id] == "bottom":
            return {3: 1.0}
          else:
            raise ValueError()
        elif legal_actions[0] == 4:
          if self._path[player_id] == "top":
            return {5: 1.0}
          elif self._path[player_id] == "middle":
            return {4: 1.0}
          else:
            raise ValueError()
        raise ValueError(f"{legal_actions} is not correct.")

  class NashEquilibriumBraess(TruePathPolicy):

    def __init__(self, game):
      super().__init__(game)
      for player_id in range(num_player):
        if player_id % 2 == 0:
          self._path[player_id] = "middle"
        if player_id % 4 == 1:
          self._path[player_id] = "top"
        if player_id % 4 == 3:
          self._path[player_id] = "bottom"

  class SocialOptimumBraess(NashEquilibriumBraess):

    def __init__(self, game):
      super().__init__(game)
      for player_id in range(num_player):
        if player_id % 2 == 0:
          self._path[player_id] = "top"
        if player_id % 2 == 1:
          self._path[player_id] = "bottom"

  ne_policy = NashEquilibriumBraess(game)
  # TODO(cabannes): debug issue with nash conv computation and uncomment the
  # following line.
  # self.assertEqual(exploitability.nash_conv(game, ne_policy), 0.0)
  self.assertSequenceAlmostEqual(
      -expected_game_score.policy_value(game.new_initial_state(), ne_policy),
      [3.75] * num_player)

  so_policy = SocialOptimumBraess(game)
  # TODO(cabannes): debug issue with nash conv computation and uncomment the
  # following line.
  # self.assertEqual(exploitability.nash_conv(game, so_policy), 0.125)
  self.assertSequenceAlmostEqual(
      -expected_game_score.policy_value(game.new_initial_state(), so_policy),
      [3.5] * num_player)
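# Arithmetic behind the two expected costs asserted above (a sketch; it
# assumes the BPR-style latency free_flow_travel_time * (1 + a * (volume /
# capacity)**b) used by the dynamic routing utilities). With 8 vehicles:
#   Nash equilibrium (4 middle, 2 top, 2 bottom): A->B and C->D each carry 6
#   vehicles, so each costs 1 * (1 + 6/8) = 1.75, and every path costs 3.75
#   (middle: 1.75 + 0.25 + 1.75, top: 1.75 + 2, bottom: 2 + 1.75).
#   Social optimum (4 top, 4 bottom): A->B and C->D each carry 4 vehicles, so
#   each costs 1.5, and both used paths cost 1.5 + 2 = 3.5.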
def test_expected_game_score_uniform_random_kuhn_poker(self):
  game = pyspiel.load_game("kuhn_poker")
  uniform_policy = policy.UniformRandomPolicy(game)
  uniform_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [uniform_policy] * 2)
  self.assertTrue(np.allclose(uniform_policy_values, [1 / 8, -1 / 8]))
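# Why uniform random play is worth +1/8 to player 0 in Kuhn poker (a sketch;
# every betting decision is taken with probability 1/2 and each player holds
# the higher card half of the time):
#   with the higher card: 0.5 * (0.5*1 + 0.5*2)
#                         + 0.5 * (0.5*1 + 0.5*(0.5*(-1) + 0.5*2)) = 1.125
#   with the lower card:  0.5 * (0.5*1 + 0.5*(-2))
#                         + 0.5 * (0.5*(-1) + 0.5*(0.5*(-1) + 0.5*(-2))) = -0.875
#   overall: 0.5 * 1.125 + 0.5 * (-0.875) = 0.125 = 1/8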
def neural_ficticious_self_play(seq_game,
                                num_epoch,
                                sess,
                                compute_metrics=False):
  env = rl_environment.Environment(seq_game)
  # Parameters from the game.
  num_players = env.num_players
  num_actions = env.action_spec()["num_actions"]
  info_state_size = env.observation_spec()["info_state"][0]
  # Parameters for the algorithm.
  hidden_layers_sizes = [int(l) for l in [128]]
  kwargs = {
      "replay_buffer_capacity": int(2e5),
      "reservoir_buffer_capacity": int(2e6),
      "min_buffer_size_to_learn": 1000,
      "anticipatory_param": 0.1,
      "batch_size": 128,
      "learn_every": 64,
      "rl_learning_rate": 0.01,
      "sl_learning_rate": 0.01,
      "optimizer_str": "sgd",
      "loss_str": "mse",
      "update_target_network_every": 19200,
      "discount_factor": 1.0,
      "epsilon_decay_duration": int(20e6),
      "epsilon_start": 0.06,
      "epsilon_end": 0.001,
  }
  # freq_epoch_printing = num_epoch // 10
  agents = [
      nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs) for idx in range(num_players)
  ]
  joint_avg_policy = NFSPPolicies(env, agents, nfsp.MODE.average_policy)
  sess.run(tf.global_variables_initializer())
  # print("TF initialized.")
  tick_time = time.time()
  for _ in range(num_epoch):
    # if ep % freq_epoch_printing == 0:
    #   print(f"Iteration {ep}")
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step)
      action_list = [agent_output.action]
      time_step = env.step(action_list)
    # Episode is over, step all agents with final info state.
    for agent in agents:
      agent.step(time_step)
  timing = time.time() - tick_time
  # print("Finish.")
  if compute_metrics:
    tabular_policy = joint_avg_policy.TabularPolicy(seq_game)
    average_policy_values = expected_game_score.policy_value(
        seq_game.new_initial_state(), [tabular_policy])
    nash_conv = exploitability.nash_conv(env.game, joint_avg_policy)
    return timing, joint_avg_policy, average_policy_values, nash_conv
  return timing, joint_avg_policy
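# A hypothetical usage sketch for the helper above. The game name and epoch
# count below are illustrative assumptions; a TensorFlow session and the
# NFSPPolicies wrapper defined in this module are required:
#
#   seq_game = pyspiel.load_game("kuhn_poker")
#   with tf.Session() as sess:
#     timing, joint_avg_policy = neural_ficticious_self_play(
#         seq_game, num_epoch=1000, sess=sess)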