def cfr_train(unused_arg):
    exploit_history = list()
    exploit_idx = list()

    tf.enable_eager_execution()
    game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
    agent_name = "cfr"
    cfr_solver = cfr.CFRSolver(game)
    checkpoint = datetime.now()
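    # Run CFR iterations; every 100 iterations, evaluate exploitability of the
    # average policy and report the time since the previous checkpoint.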
    for ep in range(FLAGS.episodes):
        cfr_solver.evaluate_and_update_policy()
        if ep % 100 == 0:
            delta = datetime.now() - checkpoint
            conv = exploitability.exploitability(game,
                                                 cfr_solver.average_policy())
            exploit_idx.append(ep)
            exploit_history.append(conv)
            print(
                "Iteration {} exploitability {} - {} seconds since last checkpoint"
                .format(ep, conv, delta.seconds))
            checkpoint = datetime.now()

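    # Persist the exploitability trace (iteration indices and values) so the
    # learning curve can be reloaded and plotted later.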
    pickle.dump([exploit_idx, exploit_history],
                open(
                    FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) +
                    ".dat", "wb"))

    now = datetime.now()
    avg_policy = cfr_solver.average_policy()
    for pid in [1, 2]:
        policy_to_csv(
            game, avg_policy,
            "policies/policy_" + now.strftime("%m-%d-%Y_%H-%M") + "_" +
            agent_name + "_" + str(pid) + "_+" + str(ep) + "episodes.csv")


def xfsp_train(_):
    exploit_history = list()
    exploit_idx = list()
    game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
    fsp_solver = fictitious_play.XFPSolver(game)
    checkpoint = datetime.now()
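    # Run XFP iterations; every 1000 iterations, evaluate exploitability of the
    # current average policy before taking the next iteration step.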
    for ep in range(FLAGS.episodes):
        if (ep % 1000) == 0:
            delta = datetime.now() - checkpoint
            pol = policy.PolicyFromCallable(
                game, fsp_solver.average_policy_callable())
            conv = exploitability.exploitability(game, pol)
            exploit_history.append(conv)
            exploit_idx.append(ep)
            print(
                "[XFSP] Iteration {} exploitability {} - {} seconds since last checkpoint"
                .format(ep, conv, delta.seconds))
            checkpoint = datetime.now()

        fsp_solver.iteration()

    agent_name = "xfsp"
    pickle.dump([exploit_idx, exploit_history],
                open(
                    FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) +
                    ".dat", "wb"))

    now = datetime.now()
    pol = policy.PolicyFromCallable(game, fsp_solver.average_policy_callable())
    for pid in [1, 2]:
        policy_to_csv(
            game, pol, "policies/policy_" + now.strftime("%m-%d-%Y_%H-%M") +
            "_" + agent_name + "_" + str(pid) + "_+" +
            str(FLAGS.episodes) + "episodes.csv")


def test_tabular_policy_from_csv(tmpdir):
    game = pyspiel.load_game("kuhn_poker")
    tabular_policy = policy.TabularPolicy(game)
    # Save the policy as CSV, then load it back.
    output = os.path.join(tmpdir, 'policy.csv')
    policy_to_csv(game, tabular_policy, output)
    tabular_policy_from_csv(game, output)


def csv_policy(tmpdir):
    # Setup game and policy
    game = pyspiel.load_game("kuhn_poker")
    tabular_policy = policy.TabularPolicy(game)
    # Save policy as CSV
    output = os.path.join(tmpdir, 'policy.csv')
    policy_to_csv(game, tabular_policy, output)
    return output
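

# A small, hypothetical round-trip check (not part of the original tests) that
# uses the csv_policy helper above: write a tabular policy to CSV and read it
# back with tabular_policy_from_csv.
def test_csv_policy_round_trip(tmpdir):
    game = pyspiel.load_game("kuhn_poker")
    output = csv_policy(tmpdir)
    # Loading should succeed for the file produced by csv_policy.
    tabular_policy_from_csv(game, output)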


def run_agents(sess, env, agents, expl_policies_avg):
    agent_name = "nfsp"
    write_policy_at = [1e4, 1e5, 1e6, 3e6, 5e6]
    sess.run(tf.global_variables_initializer())
    exploit_idx = list()
    exploit_history = list()
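    # Main NFSP driver loop: every 10000 episodes, evaluate exploitability of
    # the agents' average policies and log per-agent losses; at the selected
    # episode counts, also dump the current policies to CSV.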
    for ep in range(FLAGS.episodes):
        if (ep + 1) % 10000 == 0:
            expl = exploitability.exploitability(env.game, expl_policies_avg)
            exploit_idx.append(ep)
            exploit_history.append(expl)
            with open("exploitabilities.txt", "a") as f:
                f.write(str(expl) + "\n")
            losses = [agent.loss for agent in agents]
            msg = "-" * 80 + "\n"
            msg += "{}: {}\n{}\n".format(ep + 1, expl, losses)
            logging.info("%s", msg)

        if ep in write_policy_at:
            for pid, agent in enumerate(agents):
                policy_to_csv(
                    env.game, expl_policies_avg,
                    f"policies/policy_" + agent_name + "_" +
                    datetime.now().strftime("%m-%d-%Y_%H-%M") + "_" +
                    str(pid + 1) + "_" + str(ep) + "episodes.csv")

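        # Play one full episode, stepping whichever agent is to act.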
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

        # Episode is over, step all agents with final info state.
        for agent in agents:
            agent.step(time_step)

    pickle.dump([exploit_idx, exploit_history],
                open(
                    FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) +
                    ".dat", "wb"))

    now = datetime.now()
    for pid, agent in enumerate(agents):
        policy_to_csv(
            env.game, expl_policies_avg,
            f"policies/policy_" + now.strftime("%m-%d-%Y_%H-%M") + "_" +
            agent_name + "_" + str(pid + 1) + "_+" + str(ep) + "episodes.csv")

    plt.plot(range(len(exploit_history)), exploit_history)
    plt.ylim(0.01, 1)
    plt.yticks([1, 0.1, 0.01])
    plt.yscale("log")
    plt.xscale("log")
    plt.show()


def test_play_tournament(tmpdir):
    game = pyspiel.load_game("kuhn_poker")
    for team in ["python", "ruby", "java"]:
        for player in ["p1", "p2"]:
            tabular_policy = policy.TabularPolicy(game)
            # Save policy as CSV
            output = os.path.join(tmpdir, f'{team}_{player}.csv')
            policy_to_csv(game, tabular_policy, output)
    ranking, results = play_tournament(game, str(tmpdir))
    assert len(list(ranking.keys())) == 3
    assert len(results) == 3 * 2 * 2


def rcfr_train(unused_arg):
    tf.enable_eager_execution()
    game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
    models = [
        rcfr.DeepRcfrModel(
            game,
            num_hidden_layers=1,
            num_hidden_units=64 if FLAGS.game == "leduc_poker" else 13,
            num_hidden_factors=1,
            use_skip_connections=True) for _ in range(game.num_players())
    ]
    patient = rcfr.RcfrSolver(game, models, False, True)
    exploit_history = list()
    exploit_idx = list()

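    # Model trainer passed to the RCFR solver: shuffle and batch the regret
    # data, then take Adam steps on a Huber loss between targets and
    # model predictions.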
    def _train(model, data):
        data = data.shuffle(1000)
        data = data.batch(12)
        #data = data.repeat(1)
        optimizer = tf.keras.optimizers.Adam(lr=0.005, amsgrad=True)
        for x, y in data:
            optimizer.minimize(
                lambda: tf.losses.huber_loss(y, model(x)),  # pylint: disable=cell-var-from-loop
                model.trainable_variables)

    agent_name = "rcfr"
    checkpoint = datetime.now()
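    # Run RCFR iterations; every 100 iterations, evaluate exploitability of the
    # current average policy.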
    for iteration in range(FLAGS.episodes):
        if (iteration % 100) == 0:
            delta = datetime.now() - checkpoint
            conv = pyspiel.exploitability(game, patient.average_policy())
            exploit_idx.append(iteration)
            exploit_history.append(conv)
            print(
                "[RCFR] Iteration {} exploitability {} - {} seconds since last checkpoint"
                .format(iteration, conv, delta.seconds))
            checkpoint = datetime.now()
        patient.evaluate_and_update_policy(_train)

    pickle.dump([exploit_idx, exploit_history],
                open(
                    FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) +
                    ".dat", "wb"))

    now = datetime.now()
    avg_policy = patient.average_policy()

    for pid in [1, 2]:
        policy_to_csv(
            game, avg_policy, "policies/policy_" +
            now.strftime("%m-%d-%Y_%H-%M") + "_" + agent_name + "_" +
            str(pid) + "_+" + str(FLAGS.episodes) + "episodes.csv")


def main(unused_argv):
    game = pyspiel.load_game("kuhn_poker")
    cfr_solver = cfr.CFRSolver(game)

    episodes = []
    exploits = []
    nashes = []

    # Train the agent for a specific amount of episodes
    for ep in range(FLAGS.num_train_episodes):
        print("Running episode {} of {}".format(ep, FLAGS.num_train_episodes))
        cfr_solver.evaluate_and_update_policy()
        avg_pol = cfr_solver.average_policy()

        # Calculate the exploitability and nash convergence
        expl = exploitability.exploitability(game, avg_pol)
        nash = exploitability.nash_conv(game, avg_pol)

        exploits.append(expl)
        nashes.append(nash)
        episodes.append(ep)

    # Get the average policy
    average_policy = cfr_solver.average_policy()
    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    cur_pol = cfr_solver.current_policy()

    # Plot the exploitability
    plt.plot(episodes, exploits, "-r", label="Exploitability")
    plt.xscale("log")
    plt.yscale("log")
    plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
    plt.legend(loc="upper right")
    plt.savefig("cfr_expl.png")
    plt.show()

    plt.figure()

    # Plot the nash convergence
    plt.plot(episodes, nashes, "-r", label="NashConv")
    plt.xscale("log")
    plt.yscale("log")
    plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
    plt.legend(loc="upper right")
    plt.savefig("cfr_nash.png")
    plt.show()

    print(average_policy)
    print(average_policy_values)
    policy_to_csv(game, average_policy, "./kuhn_policy.csv")


def neurd_train(unused_arg):
    tf.enable_eager_execution()

    game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})

    models = []
    for _ in range(game.num_players()):
        models.append(
            neurd.DeepNeurdModel(game,
                                 num_hidden_layers=1,
                                 num_hidden_units=13,
                                 num_hidden_factors=8,
                                 use_skip_connections=True,
                                 autoencode=False))
    solver = neurd.CounterfactualNeurdSolver(game, models)

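    # Trainer callback handed to the NeuRD solver: gradient steps over the
    # sampled data with the given batch size, step size and threshold.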
    def _train(model, data):
        neurd.train(model,
                    data,
                    batch_size=100,
                    step_size=1,
                    threshold=2,
                    autoencoder_loss=None)

    exploit_history = list()
    for ep in range(FLAGS.episodes):
        solver.evaluate_and_update_policy(_train)
        if ep % 100 == 0:
            conv = pyspiel.exploitability(game, solver.average_policy())
            exploit_history.append(conv)
            print("Iteration {} exploitability {}".format(ep, conv))

    now = datetime.now()
    avg_policy = solver.average_policy()
    agent_name = "neurd"
    for pid in [1, 2]:
        policy_to_csv(
            game, avg_policy,
            "policies/policy_" + now.strftime("%m-%d-%Y_%H-%M") + "_" +
            agent_name + "_" + str(pid) + "_+" + str(ep) + "episodes.csv")

    plt.plot(range(len(exploit_history)), exploit_history)
    plt.ylim(0.01, 1)
    plt.yticks([1, 0.1, 0.01])
    plt.yscale("log")
    plt.xscale("log")
    plt.show()


def main(_):
    game = "kuhn_poker"
    num_players = 2

    env_configs = {"players": num_players}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

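    # Build one policy-gradient agent per player inside a single TF session and
    # wrap them as a joint policy (expl_policies_avg) for exploitability
    # evaluation.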
    with tf.Session() as sess:
        # pylint: disable=g-complex-comprehension
        agents = [
            policy_gradient.PolicyGradient(sess,
                                           idx,
                                           info_state_size,
                                           num_actions,
                                           loss_str=FLAGS.loss_str,
                                           hidden_layers_sizes=(128, ))
            for idx in range(num_players)
        ]
        expl_policies_avg = PolicyGradientPolicies(env, agents)

        sess.run(tf.global_variables_initializer())
        for ep in range(FLAGS.num_episodes):

            if (ep + 1) % FLAGS.eval_every == 0:
                losses = [agent.loss for agent in agents]
                expl = exploitability.exploitability(env.game,
                                                     expl_policies_avg)
                msg = "-" * 80 + "\n"
                msg += "{}: {}\n{}\n".format(ep + 1, expl, losses)
                logging.info("%s", msg)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        for pid, agent in enumerate(agents):
            policy_to_csv(env.game, expl_policies_avg,
                          f"{FLAGS.modeldir}/test_p{pid+1}.csv")


def test_tabular_policy_to_csv(tmpdir):
    # Setup game and policy
    game = pyspiel.load_game("kuhn_poker")
    tabular_policy = policy.TabularPolicy(game)
    # Save policy as CSV
    output = os.path.join(tmpdir, 'policy.csv')
    policy_to_csv(game, tabular_policy, output)
    assert list(tmpdir.listdir()) == [output]
    # Check created CSV
    csv = pd.read_csv(output, index_col=0)
    # Get all states in the game at which players have to make decisions.
    states = get_all_states.get_all_states(game,
                                           depth_limit=-1,
                                           include_terminals=False,
                                           include_chance_states=False)
    assert set(csv.index.values) <= set(states.keys())
    assert len(csv.columns) == game.num_distinct_actions()


def test_callable_policy_to_csv(tmpdir):
    def _uniform_policy(state):
        actions = state.legal_actions()
        p = 1.0 / len(actions)
        return [(a, p) for a in actions]

    # Setup game and policy
    game = pyspiel.load_game("kuhn_poker")
    callable_policy = policy.PolicyFromCallable(game, _uniform_policy)
    # Save policy as CSV
    output = os.path.join(tmpdir, 'policy.csv')
    policy_to_csv(game, callable_policy, output)
    assert list(tmpdir.listdir()) == [output]
    # Check created CSV
    csv = pd.read_csv(output, index_col=0)
    # Get all states in the game at which players have to make decisions.
    states = get_all_states.get_all_states(game,
                                           depth_limit=-1,
                                           include_terminals=False,
                                           include_chance_states=False)
    assert set(csv.index.values) <= set(states.keys())


def runNFSP(hidden_layers_sizes, replay_buffer_capacity,
            reservoir_buffer_capacity, epsilon_start, epsilon_end,
            anticipatory_param):
    # Define data storage arrays
    episodes = []
    exploits = []

    # Initialize the game
    game = FLAGS.game
    num_players = FLAGS.num_players

    env_configs = {"players": num_players}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    kwargs = {
        "replay_buffer_capacity": replay_buffer_capacity,
        "epsilon_decay_duration": FLAGS.num_train_episodes,
        "epsilon_start": epsilon_start,
        "epsilon_end": epsilon_end,
    }

    # Start the TensorFlow session
    with tf.Session() as sess:
        # Initialize NFSP Agent
        agents = [
            nfsp.NFSP(sess, idx, info_state_size, num_actions,
                      hidden_layers_sizes, reservoir_buffer_capacity,
                      anticipatory_param, **kwargs)
            for idx in range(num_players)
        ]
        expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

        sess.run(tf.global_variables_initializer())
        for ep in range(FLAGS.num_train_episodes):
            # Evaluate Agents
            if ((ep + 1) % FLAGS.eval_every == 0) and ((ep + 1) >= 100):
                losses = [agent.loss for agent in agents]
                logging.info("Losses: %s", losses)
                expl = exploitability.exploitability(env.game,
                                                     expl_policies_avg)
                logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
                logging.info("_____________________________________________")

                episodes.append(ep + 1)
                exploits.append(expl)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        for pid, agent in enumerate(agents):
            policy_to_csv(env.game, expl_policies_avg,
                          f"{FLAGS.modeldir}/test_p{pid+1}.csv")
        play(agents, env)

    return episodes, exploits
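

# Hedged sketch (an assumption, not part of the original examples): these
# training entry points are typically launched through absl so that FLAGS are
# parsed before use, e.g.
#
#   from absl import app
#   if __name__ == "__main__":
#       app.run(main)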