def xfsp_train(_):
  """Trains XFP on FLAGS.game, checkpointing exploitability and saving policies.

  Every 1000 iterations the current average policy's exploitability is
  recorded (with wall-clock time since the last checkpoint). After training,
  the exploitability curve is pickled and the per-player average policies are
  exported to CSV files.
  """
  exploit_history = []
  exploit_idx = []
  game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
  fsp_solver = fictitious_play.XFPSolver(game)
  checkpoint = datetime.now()
  for ep in range(FLAGS.episodes):
    if ep % 1000 == 0:
      delta = datetime.now() - checkpoint
      pol = policy.PolicyFromCallable(
          game, fsp_solver.average_policy_callable())
      conv = exploitability.exploitability(game, pol)
      exploit_history.append(conv)
      exploit_idx.append(ep)
      print(
          "[XFSP] Iteration {} exploitability {} - {} seconds since last checkpoint"
          .format(ep, conv, delta.seconds))
      checkpoint = datetime.now()
    fsp_solver.iteration()
  agent_name = "xfsp"
  pickle.dump(
      [exploit_idx, exploit_history],
      open(
          FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) + ".dat",
          "wb"))
  pol = policy.PolicyFromCallable(game, fsp_solver.average_policy_callable())
  # Bug fixes: the original referenced an undefined name `now` (NameError) —
  # use a fresh timestamp instead — and labelled the two players of a
  # 2-player game as "2" and "3" (pid + 1 over [1, 2]); a stray "+" also
  # leaked into the filename ("_+<episodes>episodes").
  timestamp = datetime.now().strftime("%m-%d-%Y_%H-%M")
  for pid in [1, 2]:
    policy_to_csv(
        game, pol, "policies/policy_" + timestamp + "_" + agent_name + "_" +
        str(pid) + "_" + str(FLAGS.episodes) + "episodes.csv")
def test_shapleys_game(self):
  """Runs 1000 XFP iterations on Shapley's game, logging NashConv every 10."""
  game = pyspiel.load_game_as_turn_based("matrix_shapleys_game")
  solver = fictitious_play.XFPSolver(game)
  for iteration in range(1000):
    solver.iteration()
    if iteration % 10:
      continue  # only report every 10th iteration
    conv = exploitability.nash_conv(game, solver.average_policy())
    print("FP in Shapley's Game. Iter: {}, NashConv: {}".format(
        iteration, conv))
def test_matching_pennies_3p(self):
  """Runs 1000 XFP iterations on 3-player matching pennies, printing NashConv."""
  game = pyspiel.load_game_as_turn_based("matching_pennies_3p")
  solver = fictitious_play.XFPSolver(game)
  for step in range(1000):
    solver.iteration()
    if step % 10 == 0:
      nash_conv = exploitability.nash_conv(game, solver.average_policy())
      print("FP in Matching Pennies 3p. Iter: {}, NashConv: {}".format(
          step, nash_conv))
def main(_):
  """Runs XFP on FLAGS.game, reporting exploitability every print_freq iters."""
  game = pyspiel.load_game(FLAGS.game, {"players": FLAGS.players})
  xfp_solver = fictitious_play.XFPSolver(game)
  for i in range(FLAGS.iterations):
    xfp_solver.iteration()
    if i % FLAGS.print_freq == 0:
      # Exploitability requires a full best-response traversal of the game
      # tree; the original computed it on *every* iteration even though the
      # value was only used when printing. Compute it only when needed.
      conv = exploitability.exploitability(game, xfp_solver.average_policy())
      print("Iteration: {} Conv: {}".format(i, conv))
      sys.stdout.flush()
def test_meta_game_leduc2p(self):
  """Builds an empirical metagame for 2-player Leduc after 3 XFP iterations."""
  print("Leduc 2p")
  game = pyspiel.load_game("leduc_poker")
  solver = fictitious_play.XFPSolver(game, save_oracles=True)
  for _ in range(3):
    solver.iteration()
  meta_games = solver.get_empirical_metagame(10, seed=86487)
  self.assertIsNotNone(meta_games)
  # Metagame utility matrices for each player
  for player in range(2):
    print("player {}: \n{}".format(player + 1, meta_games[player]))
def test_meta_game_kuhn4p(self):
  """Builds an empirical metagame for 4-player Kuhn after 3 XFP iterations."""
  print("Kuhn 4p")
  game = pyspiel.load_game("kuhn_poker", {"players": 4})
  solver = fictitious_play.XFPSolver(game, save_oracles=True)
  for _ in range(3):
    solver.iteration()
  meta_games = solver.get_empirical_metagame(10, seed=1)
  self.assertIsNotNone(meta_games)
  # Metagame utility tensors for each player
  for player in range(4):
    print("player {}: \n{}".format(player + 1, meta_games[player]))
def ficticious_play(seq_game, number_of_iterations, compute_metrics=False):
  """Runs XFP on seq_game and returns timing plus the learned average policy.

  NOTE: the misspelled name ("ficticious") is kept for caller compatibility.

  Args:
    seq_game: a sequential pyspiel game.
    number_of_iterations: number of XFP iterations to run.
    compute_metrics: if True, also return NashConv and expected policy values.

  Returns:
    (timing, tabular_policy) or, with compute_metrics,
    (timing, tabular_policy, nash_conv, average_policy_values).
  """
  xfp_solver = fictitious_play.XFPSolver(seq_game)
  tick_time = time.time()
  for _ in range(number_of_iterations):
    xfp_solver.iteration()
  timing = time.time() - tick_time
  # Copy the solver's average policy into the tabular policy. The original
  # returned a freshly-constructed TabularPolicy (uniform random) that was
  # never populated from the solver, so the returned policy and the metrics
  # below were unrelated to training.
  average_policies = xfp_solver.average_policy_tables()
  tabular_policy = policy_module.TabularPolicy(seq_game)
  for player_table in average_policies:
    for info_state, state_policy in player_table.items():
      policy_to_update = tabular_policy.policy_for_key(info_state)
      for action, probability in state_policy.items():
        policy_to_update[action] = probability
  if compute_metrics:
    nash_conv = exploitability.nash_conv(seq_game,
                                         xfp_solver.average_policy())
    # policy_value expects one policy per player; the original passed a
    # single-element list for a multi-player game.
    average_policy_values = expected_game_score.policy_value(
        seq_game.new_initial_state(),
        [tabular_policy] * seq_game.num_players())
    return timing, tabular_policy, nash_conv, average_policy_values
  return timing, tabular_policy
def XFP_Solving(game, iterations, save_every=0, save_prefix='base'):
  """Runs XFP on game, periodically saving the average policy to disk.

  Args:
    game: a 2-player pyspiel game (the save path merges both players' tables).
    iterations: number of XFP iterations to run.
    save_every: save a checkpoint every save_every iterations (0 = final only).
    save_prefix: directory name component for the saved policies.
  """

  def save_xfp():
    # Merge both players' average policy tables into one flat mapping from
    # info-state key to action-probability list.
    xfp_policy = xfp_solver.average_policy_tables()
    policy_keys = np.concatenate(
        (list(xfp_policy[0].keys()), list(xfp_policy[1].keys())), 0)
    policy_values = np.concatenate(
        (list(map(lambda d: list(d.values()), list(xfp_policy[0].values()))),
         list(map(lambda d: list(d.values()), list(xfp_policy[1].values())))),
        0)
    # Change possible None's into 0. The original used a generator expression
    # here — [(d if d else 0 for d in a) for a in ...] — which produced a list
    # of generator *objects*, so the saved dict mapped keys to generators
    # instead of probability lists.
    policy_values = [[0 if d is None else d for d in a] for a in policy_values]
    xfp_policy = dict(zip(policy_keys, policy_values))
    policy_handler.save_to_tabular_policy(
        game, xfp_policy, "policies/XFP/{}/{}".format(save_prefix, it))

  xfp_solver = fictitious_play.XFPSolver(game)
  for it in range(iterations + 1):
    xfp_solver.iteration()
    if save_every != 0 and it % save_every == 0:  # order is important
      save_xfp()
  save_xfp()
def get_kuhn_poker_data(num_players=3):
  """Returns the kuhn poker metagame payoff tables for num_players players."""
  game = pyspiel.load_game('kuhn_poker', {'players': num_players})
  xfp_solver = fictitious_play.XFPSolver(game, save_oracles=True)
  for _ in range(3):
    xfp_solver.iteration()
  # Results are seed-dependent, so show some interesting cases.
  # The original if/elif chain left meta_games unbound (NameError) for any
  # player count outside {2, 3, 4}; unknown counts now fall back to seed 1.
  interesting_seeds = {2: 1, 3: 5, 4: 2}
  meta_games = xfp_solver.get_empirical_metagame(
      100, seed=interesting_seeds.get(num_players, 1))
  # Metagame utility tables, one per player.
  return [meta_games[i] for i in range(num_players)]
def test_xfp(self):
  """Checks XFP on Kuhn poker converges to the known equilibrium values."""
  game = pyspiel.load_game("kuhn_poker")
  xfp_solver = fictitious_play.XFPSolver(game)
  for _ in range(100):
    xfp_solver.iteration()
  # Copy the solver's per-player average policy tables into one TabularPolicy.
  average_policies = xfp_solver.average_policy_tables()
  tabular_policy = policy.TabularPolicy(game)
  for player_id in range(2):
    for info_state, state_policy in average_policies[player_id].items():
      policy_to_update = tabular_policy.policy_for_key(info_state)
      for action, probability in state_policy.items():
        policy_to_update[action] = probability
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [tabular_policy, tabular_policy])
  # The loop above runs 100 iterations; the message previously said 10.
  print("Kuhn 2P average values after 100 iterations")
  print("P0: {}".format(average_policy_values[0]))
  print("P1: {}".format(average_policy_values[1]))
  self.assertIsNotNone(average_policy_values)
  # Kuhn poker equilibrium value is -1/18 for player 0, +1/18 for player 1.
  self.assertTrue(
      np.allclose(average_policy_values, [-1 / 18, 1 / 18], atol=1e-3))
np.save(save_prefix + '_exps', np.array(exps)) print(f"saving to: {save_prefix + '_episodes.npy'}") np.save(save_prefix + '_episodes', np.array(episodes)) if algorithm == 'cfr': cfr_infostates.append(solver.num_infostates_expanded) print("Num infostates expanded (mil): ", solver.num_infostates_expanded / 1e6) print(f"saving to: {save_prefix + '_infostates.npy'}") np.save(save_prefix + '_infostates', np.array(cfr_infostates)) if algorithm == 'cfr': solver = cfr.CFRSolver(game) run(solver, iterations) elif algorithm == 'xfp': solver = fictitious_play.XFPSolver(game) run(solver, iterations) elif algorithm == 'xdo': brs = [] info_test = [] for i in range(2): br_info = exploitability.best_response( game, cfr.CFRSolver(game).average_policy(), i) full_br_policy = _full_best_response_policy( br_info["best_response_action"]) info_sets = br_info['info_sets'] info_test.append(info_sets) brs.append(full_br_policy) br_list = [brs] start_time = time.time()