def cfr_train(unused_arg):
  exploit_history = list()
  exploit_idx = list()
  tf.enable_eager_execution()
  game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
  agent_name = "cfr"
  cfr_solver = cfr.CFRSolver(game)
  checkpoint = datetime.now()
  for ep in range(FLAGS.episodes):
    cfr_solver.evaluate_and_update_policy()
    if ep % 100 == 0:
      delta = datetime.now() - checkpoint
      conv = exploitability.exploitability(game, cfr_solver.average_policy())
      exploit_idx.append(ep)
      exploit_history.append(conv)
      print(
          "Iteration {} exploitability {} - {} seconds since last checkpoint"
          .format(ep, conv, delta.seconds))
      checkpoint = datetime.now()
  with open(FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) + ".dat",
            "wb") as f:
    pickle.dump([exploit_idx, exploit_history], f)
  now = datetime.now()
  avg_policy = cfr_solver.average_policy()
  for pid in [1, 2]:
    policy_to_csv(
        game, avg_policy,
        f"policies/policy_{now.strftime('%m-%d-%Y_%H-%M')}_{agent_name}_{pid}_{ep}episodes.csv")
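
# Assumed absl-py wiring for the training entry points in this file (a minimal
# sketch, not the project's actual definitions); the flag defaults below are
# illustrative placeholders.
from absl import app, flags

flags.DEFINE_string("game", "kuhn_poker", "Name of the pyspiel game to load.")
flags.DEFINE_integer("episodes", 10000, "Number of solver iterations to run.")
FLAGS = flags.FLAGS

if __name__ == "__main__":
  app.run(cfr_train)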
def xfsp_train(_):
  exploit_history = list()
  exploit_idx = list()
  game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
  fsp_solver = fictitious_play.XFPSolver(game)
  checkpoint = datetime.now()
  for ep in range(FLAGS.episodes):
    if (ep % 1000) == 0:
      delta = datetime.now() - checkpoint
      pol = policy.PolicyFromCallable(game, fsp_solver.average_policy_callable())
      conv = exploitability.exploitability(game, pol)
      exploit_history.append(conv)
      exploit_idx.append(ep)
      print(
          "[XFSP] Iteration {} exploitability {} - {} seconds since last checkpoint"
          .format(ep, conv, delta.seconds))
      checkpoint = datetime.now()
    fsp_solver.iteration()
  agent_name = "xfsp"
  with open(FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) + ".dat",
            "wb") as f:
    pickle.dump([exploit_idx, exploit_history], f)
  now = datetime.now()
  pol = policy.PolicyFromCallable(game, fsp_solver.average_policy_callable())
  for pid in [1, 2]:
    policy_to_csv(
        game, pol,
        f"policies/policy_{now.strftime('%m-%d-%Y_%H-%M')}_{agent_name}_{pid}_{FLAGS.episodes}episodes.csv")
def test_tabular_policy_from_csv(tmpdir):
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  # Save policy as CSV, then load it back.
  output = os.path.join(tmpdir, 'policy.csv')
  policy_to_csv(game, tabular_policy, output)
  tabular_policy_from_csv(game, output)
def csv_policy(tmpdir):
  # Setup game and policy
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  # Save policy as CSV
  output = os.path.join(tmpdir, 'policy.csv')
  policy_to_csv(game, tabular_policy, output)
  return output
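
# Hedged usage sketch (not one of the original tests): csv_policy above reads like
# a pytest fixture, so a test can take it as a parameter and round-trip the file
# through tabular_policy_from_csv. The test name below is hypothetical.
def test_csv_policy_roundtrip(csv_policy):
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy_from_csv(game, csv_policy)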
def run_agents(sess, env, agents, expl_policies_avg):
  agent_name = "nfsp"
  write_policy_at = [1e4, 1e5, 1e6, 3e6, 5e6]
  sess.run(tf.global_variables_initializer())
  exploit_idx = list()
  exploit_history = list()
  for ep in range(FLAGS.episodes):
    if (ep + 1) % 10000 == 0:
      expl = exploitability.exploitability(env.game, expl_policies_avg)
      exploit_idx.append(ep)
      exploit_history.append(expl)
      with open("exploitabilities.txt", "a") as f:
        f.write(str(expl) + "\n")
      losses = [agent.loss for agent in agents]
      msg = "-" * 80 + "\n"
      msg += "{}: {}\n{}\n".format(ep + 1, expl, losses)
      logging.info("%s", msg)
    if ep in write_policy_at:
      for pid, agent in enumerate(agents):
        policy_to_csv(
            env.game, expl_policies_avg,
            f"policies/policy_{agent_name}_{datetime.now().strftime('%m-%d-%Y_%H-%M')}_{pid + 1}_{ep}episodes.csv")
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step)
      action_list = [agent_output.action]
      time_step = env.step(action_list)
    # Episode is over, step all agents with final info state.
    for agent in agents:
      agent.step(time_step)
  with open(FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) + ".dat",
            "wb") as f:
    pickle.dump([exploit_idx, exploit_history], f)
  now = datetime.now()
  for pid, agent in enumerate(agents):
    policy_to_csv(
        env.game, expl_policies_avg,
        f"policies/policy_{now.strftime('%m-%d-%Y_%H-%M')}_{agent_name}_{pid + 1}_{ep}episodes.csv")
  plt.plot(range(len(exploit_history)), exploit_history)
  plt.ylim(0.01, 1)
  plt.yticks([1, 0.1, 0.01])
  plt.yscale("log")
  plt.xscale("log")
  plt.show()
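
# Hedged sketch (not part of the original script) of how run_agents might be wired
# up, mirroring the NFSP setup in runNFSP further below; it assumes the same
# imports as the surrounding scripts, the hyperparameter values are illustrative
# placeholders, and NFSPPolicies is the average-policy wrapper used elsewhere in
# this repo.
def nfsp_main(_):
  env = rl_environment.Environment(FLAGS.game, players=2)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]
  with tf.Session() as sess:
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, [128],
                  int(2e6), 0.1, replay_buffer_capacity=int(2e5))
        for idx in range(2)
    ]
    expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)
    run_agents(sess, env, agents, expl_policies_avg)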
def test_play_tournament(tmpdir):
  game = pyspiel.load_game("kuhn_poker")
  for team in ["python", "ruby", "java"]:
    for player in ["p1", "p2"]:
      tabular_policy = policy.TabularPolicy(game)
      # Save policy as CSV
      output = os.path.join(tmpdir, f'{team}_{player}.csv')
      policy_to_csv(game, tabular_policy, output)
  ranking, results = play_tournament(game, str(tmpdir))
  assert len(list(ranking.keys())) == 3
  assert len(results) == 3 * 2 * 2
def rcfr_train(unused_arg):
  tf.enable_eager_execution()
  game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
  models = [
      rcfr.DeepRcfrModel(
          game,
          num_hidden_layers=1,
          num_hidden_units=64 if FLAGS.game == "leduc_poker" else 13,
          num_hidden_factors=1,
          use_skip_connections=True) for _ in range(game.num_players())
  ]
  patient = rcfr.RcfrSolver(game, models, False, True)
  exploit_history = list()
  exploit_idx = list()

  def _train(model, data):
    data = data.shuffle(1000)
    data = data.batch(12)
    # data = data.repeat(1)
    optimizer = tf.keras.optimizers.Adam(lr=0.005, amsgrad=True)
    for x, y in data:
      optimizer.minimize(
          lambda: tf.losses.huber_loss(y, model(x)),  # pylint: disable=cell-var-from-loop
          model.trainable_variables)

  agent_name = "rcfr"
  checkpoint = datetime.now()
  for iteration in range(FLAGS.episodes):
    if (iteration % 100) == 0:
      delta = datetime.now() - checkpoint
      conv = pyspiel.exploitability(game, patient.average_policy())
      exploit_idx.append(iteration)
      exploit_history.append(conv)
      print(
          "[RCFR] Iteration {} exploitability {} - {} seconds since last checkpoint"
          .format(iteration, conv, delta.seconds))
      checkpoint = datetime.now()
    patient.evaluate_and_update_policy(_train)
  with open(FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) + ".dat",
            "wb") as f:
    pickle.dump([exploit_idx, exploit_history], f)
  now = datetime.now()
  avg_policy = patient.average_policy()
  for pid in [1, 2]:
    policy_to_csv(
        game, avg_policy,
        f"policies/policy_{now.strftime('%m-%d-%Y_%H-%M')}_{agent_name}_{pid}_{FLAGS.episodes}episodes.csv")
def main(unused_argv):
  game = pyspiel.load_game("kuhn_poker")
  cfr_solver = cfr.CFRSolver(game)
  episodes = []
  exploits = []
  nashes = []
  # Train the agent for a specific number of episodes
  for ep in range(FLAGS.num_train_episodes):
    print("Running episode {} of {}".format(ep, FLAGS.num_train_episodes))
    cfr_solver.evaluate_and_update_policy()
    avg_pol = cfr_solver.average_policy()
    # Calculate the exploitability and Nash convergence
    expl = exploitability.exploitability(game, avg_pol)
    nash = exploitability.nash_conv(game, avg_pol)
    exploits.append(expl)
    nashes.append(nash)
    episodes.append(ep)
  # Get the average policy
  average_policy = cfr_solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  cur_pol = cfr_solver.current_policy()
  # Plot the exploitability (save before show, otherwise the figure is cleared)
  plt.plot(episodes, exploits, "-r", label="Exploitability")
  plt.xscale("log")
  plt.yscale("log")
  plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
  plt.legend(loc="upper right")
  plt.savefig("cfr_expl.png")
  plt.show()
  plt.figure()
  # Plot the Nash convergence
  plt.plot(episodes, nashes, "-r", label="NashConv")
  plt.xscale("log")
  plt.yscale("log")
  plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
  plt.legend(loc="upper right")
  plt.savefig("cfr_nash.png")
  plt.show()
  print(average_policy)
  print(average_policy_values)
  policy_to_csv(game, average_policy, "./kuhn_policy.csv")
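
# Assumed absl flag definitions for main() above (a sketch, not the project's
# actual definitions); the defaults are illustrative placeholders.
from absl import flags

flags.DEFINE_integer("num_train_episodes", 1000, "Number of CFR iterations to run.")
flags.DEFINE_integer("eval_every", 10, "Lower x-axis limit used for the plots.")
FLAGS = flags.FLAGS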
def neurd_train(unused_arg):
  tf.enable_eager_execution()
  game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
  models = []
  for _ in range(game.num_players()):
    models.append(
        neurd.DeepNeurdModel(
            game,
            num_hidden_layers=1,
            num_hidden_units=13,
            num_hidden_factors=8,
            use_skip_connections=True,
            autoencode=False))
  solver = neurd.CounterfactualNeurdSolver(game, models)

  def _train(model, data):
    neurd.train(
        model,
        data,
        batch_size=100,
        step_size=1,
        threshold=2,
        autoencoder_loss=None)

  exploit_history = list()
  for ep in range(FLAGS.episodes):
    solver.evaluate_and_update_policy(_train)
    if ep % 100 == 0:
      conv = pyspiel.exploitability(game, solver.average_policy())
      exploit_history.append(conv)
      print("Iteration {} exploitability {}".format(ep, conv))
  now = datetime.now()
  avg_policy = solver.average_policy()
  agent_name = "neurd"
  for pid in [1, 2]:
    policy_to_csv(
        game, avg_policy,
        f"policies/policy_{now.strftime('%m-%d-%Y_%H-%M')}_{agent_name}_{pid}_{ep}episodes.csv")
  plt.plot(range(len(exploit_history)), exploit_history)
  plt.ylim(0.01, 1)
  plt.yticks([1, 0.1, 0.01])
  plt.yscale("log")
  plt.xscale("log")
  plt.show()
def main(_):
  game = "kuhn_poker"
  num_players = 2
  env_configs = {"players": num_players}
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        policy_gradient.PolicyGradient(
            sess,
            idx,
            info_state_size,
            num_actions,
            loss_str=FLAGS.loss_str,
            hidden_layers_sizes=(128,)) for idx in range(num_players)
    ]
    expl_policies_avg = PolicyGradientPolicies(env, agents)

    sess.run(tf.global_variables_initializer())
    for ep in range(FLAGS.num_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        losses = [agent.loss for agent in agents]
        expl = exploitability.exploitability(env.game, expl_policies_avg)
        msg = "-" * 80 + "\n"
        msg += "{}: {}\n{}\n".format(ep + 1, expl, losses)
        logging.info("%s", msg)

      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    for pid, agent in enumerate(agents):
      policy_to_csv(env.game, expl_policies_avg,
                    f"{FLAGS.modeldir}/test_p{pid + 1}.csv")
def test_tabular_policy_to_csv(tmpdir):
  # Setup game and policy
  game = pyspiel.load_game("kuhn_poker")
  tabular_policy = policy.TabularPolicy(game)
  # Save policy as CSV
  output = os.path.join(tmpdir, 'policy.csv')
  policy_to_csv(game, tabular_policy, output)
  assert list(tmpdir.listdir()) == [output]
  # Check created CSV
  csv = pd.read_csv(output, index_col=0)
  # Get all states in the game at which players have to make decisions.
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False)
  assert set(csv.index.values) <= set(states.keys())
  assert len(csv.columns) == game.num_distinct_actions()
def test_callable_policy_to_csv(tmpdir):
  def _uniform_policy(state):
    actions = state.legal_actions()
    p = 1.0 / len(actions)
    return [(a, p) for a in actions]

  # Setup game and policy
  game = pyspiel.load_game("kuhn_poker")
  callable_policy = policy.PolicyFromCallable(game, _uniform_policy)
  # Save policy as CSV
  output = os.path.join(tmpdir, 'policy.csv')
  policy_to_csv(game, callable_policy, output)
  assert list(tmpdir.listdir()) == [output]
  # Check created CSV
  csv = pd.read_csv(output, index_col=0)
  # Get all states in the game at which players have to make decisions.
  states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False)
  assert set(csv.index.values) <= set(states.keys())
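
# Hedged sketch (not part of the tests above): judging by the assertions, the CSV
# written by policy_to_csv is indexed by information-state string with one column
# per action id, so it can be loaded back into a dict of per-state action
# probabilities with pandas. The helper name below is hypothetical.
def load_policy_dict(csv_path):
  df = pd.read_csv(csv_path, index_col=0)
  return {
      info_state: {int(action): prob for action, prob in row.items() if prob > 0}
      for info_state, row in df.iterrows()
  }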
def runNFSP(hidden_layers_sizes, replay_buffer_capacity,
            reservoir_buffer_capacity, epsilon_start, epsilon_end,
            anticipatory_param):
  # Define data storage arrays
  episodes = []
  exploits = []

  # Initialize the game
  game = FLAGS.game
  num_players = FLAGS.num_players
  env_configs = {"players": num_players}
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  kwargs = {
      "replay_buffer_capacity": replay_buffer_capacity,
      "epsilon_decay_duration": FLAGS.num_train_episodes,
      "epsilon_start": epsilon_start,
      "epsilon_end": epsilon_end,
  }

  # Start the TensorFlow session
  with tf.Session() as sess:
    # Initialize the NFSP agents
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  reservoir_buffer_capacity, anticipatory_param, **kwargs)
        for idx in range(num_players)
    ]
    expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    for ep in range(FLAGS.num_train_episodes):
      # Evaluate the agents
      if ((ep + 1) % FLAGS.eval_every == 0) and ((ep + 1) >= 100):
        losses = [agent.loss for agent in agents]
        logging.info("Losses: %s", losses)
        expl = exploitability.exploitability(env.game, expl_policies_avg)
        logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
        logging.info("_____________________________________________")
        episodes.append(ep + 1)
        exploits.append(expl)

      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    for pid, agent in enumerate(agents):
      policy_to_csv(env.game, expl_policies_avg,
                    f"{FLAGS.modeldir}/test_p{pid + 1}.csv")
    play(agents, env)
  return episodes, exploits
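
# Hedged sketch of a driver for runNFSP (not part of the original script); the
# hyperparameter values are illustrative placeholders, and the log-log plot mirrors
# the style used by the other training scripts in this repo. It would be launched
# via absl's app.run, like the other entry points.
def nfsp_sweep_main(_):
  episodes, exploits = runNFSP(
      hidden_layers_sizes=[128],
      replay_buffer_capacity=int(2e5),
      reservoir_buffer_capacity=int(2e6),
      epsilon_start=0.06,
      epsilon_end=0.001,
      anticipatory_param=0.1)
  plt.plot(episodes, exploits)
  plt.xscale("log")
  plt.yscale("log")
  plt.show()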