Example 1
 def test_cfr_cce_dist_goofspiel(self):
     """Copy of the TestCCEDistCFRGoofSpiel in corr_dist_test.cc."""
     game = pyspiel.load_game(
         "turn_based_simultaneous_game(game=goofspiel(num_cards=3,points_order="
         "descending,returns_type=total_points))")
     for num_iterations in [1, 10, 100]:
         policies = []
         cfr_solver = cfr.CFRSolver(game)
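         # Snapshot the current policy after each CFR iteration; these
         # snapshots form the support of the correlation device below.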
         for _ in range(num_iterations):
             cfr_solver.evaluate_and_update_policy()
             policies.append(
                 policy.python_policy_to_pyspiel_policy(
                     cfr_solver.current_policy()))
         mu = pyspiel.uniform_correlation_device(policies)
         cce_dist1 = pyspiel.cce_dist(game, mu)
         print(
             "goofspiel, cce test num_iterations: {}, cce_dist: {}".format(
                 num_iterations, cce_dist1))
          # Assemble the same correlation device manually, as an example of
          # how to build non-uniform distributions and to test that the
          # Python bindings for lists of tuples work properly.
         uniform_prob = 1.0 / len(policies)
         mu2 = [(uniform_prob, policy) for policy in policies]
         cce_dist2 = pyspiel.cce_dist(game, mu2)
         self.assertAlmostEqual(cce_dist1, cce_dist2)
Example 2
def cfr_train(unused_arg):
    exploit_history = list()
    exploit_idx = list()

    tf.enable_eager_execution()
    game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
    agent_name = "cfr"
    cfr_solver = cfr.CFRSolver(game)
    checkpoint = datetime.now()
    for ep in range(FLAGS.episodes):
        cfr_solver.evaluate_and_update_policy()
        if ep % 100 == 0:
            delta = datetime.now() - checkpoint
            conv = exploitability.exploitability(game,
                                                 cfr_solver.average_policy())
            exploit_idx.append(ep)
            exploit_history.append(conv)
            print(
                "Iteration {} exploitability {} - {} seconds since last checkpoint"
                .format(ep, conv, delta.seconds))
            checkpoint = datetime.now()

    with open(
            FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) +
            ".dat", "wb") as f:
        pickle.dump([exploit_idx, exploit_history], f)

    now = datetime.now()
    policy = cfr_solver.average_policy()
    for pid in [1, 2]:
        # Write one CSV per player.
        policy_to_csv(
            game, policy,
            "policies/policy_" + now.strftime("%m-%d-%Y_%H-%M") + "_" +
            agent_name + "_" + str(pid) + "_" + str(ep) + "episodes.csv")
Example 3
def main(_):
    game = pyspiel.load_game_as_turn_based(game_)
    cfr_solver = cfr.CFRSolver(game)

    print("policy_initial:",
          cfr_solver.current_policy().action_probability_array)
    for i in range(FLAGS.iterations):

        if i % FLAGS.print_freq == 0:
            conv = exploitability.exploitability(game,
                                                 cfr_solver.average_policy())
            print("Iteration {} exploitability {}".format(i, conv))
            print("Iteration{}".format(i))
            print("policy_av:",
                  cfr_solver.average_policy().action_probability_array)
            print("policy_cr:",
                  cfr_solver.current_policy().action_probability_array)

        cfr_solver.evaluate_and_update_policy()
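        # Each write_csv call appends one player's probability row to the log.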
        write_csv(dir_ + game_ + "_" + algo_name + "_av.csv",
                  cfr_solver.average_policy().action_probability_array[0])
        write_csv(dir_ + game_ + "_" + algo_name + "_av.csv",
                  cfr_solver.average_policy().action_probability_array[1])
        write_csv(dir_ + game_ + "_" + algo_name + "_cr.csv",
                  cfr_solver.current_policy().action_probability_array[0])
        write_csv(dir_ + game_ + "_" + algo_name + "_cr.csv",
                  cfr_solver.current_policy().action_probability_array[1])
Example 4
 def test_cfr_on_turn_based_game_with_exploitability(self):
   """Check if CFR can be applied to the sequential game."""
   game = pyspiel.load_game(
       "python_dynamic_routing(max_num_time_step=5,time_step_length=1.0)")
   seq_game = pyspiel.convert_to_turn_based(game)
   cfr_solver = cfr.CFRSolver(seq_game)
   for _ in range(_NUM_ITERATION_CFR_TEST):
     cfr_solver.evaluate_and_update_policy()
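    # NashConv of the average policy; the test only checks that this runs.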
   exploitability.nash_conv(seq_game, cfr_solver.average_policy())
Example 5
def main(_):
  game = pyspiel.load_game(FLAGS.game,
                           {"players": pyspiel.GameParameter(FLAGS.players)})
  cfr_solver = cfr.CFRSolver(game)

  for i in range(FLAGS.iterations):
    cfr_solver.evaluate_and_update_policy()
    if i % FLAGS.print_freq == 0:
      conv = exploitability.exploitability(game, cfr_solver.average_policy())
      print("Iteration {} exploitability {}".format(i, conv))
Example 6
 def test_cfr_kuhn_poker(self):
   game = pyspiel.load_game("kuhn_poker")
   cfr_solver = cfr.CFRSolver(game)
   for _ in range(300):
     cfr_solver.evaluate_and_update_policy()
   average_policy = cfr_solver.average_policy()
   average_policy_values = expected_game_score.policy_value(
       game.new_initial_state(), [average_policy] * 2)
   # 1/18 is the Nash value. See https://en.wikipedia.org/wiki/Kuhn_poker
   np.testing.assert_allclose(
       average_policy_values, [-1 / 18, 1 / 18], atol=1e-3)
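
The average policy can also be queried state by state; a short sketch, assuming the game and average_policy from the test above:

# Hypothetical follow-up: action probabilities at the first decision node
# of Kuhn poker, after stepping through the chance deals.
state = game.new_initial_state()
while state.is_chance_node():
  # Take the first chance outcome deterministically, just for illustration.
  state.apply_action(state.legal_actions()[0])
print(average_policy.action_probabilities(state))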
Example 7
def CFR_Solving(game, iterations, save_every=0, save_prefix='base'):
    def save_cfr():
        policy = cfr_solver.average_policy()
        policy = dict(zip(policy.state_lookup, policy.action_probability_array))
        policy_handler.save_to_tabular_policy(game, policy, "policies/CFR/{}/{}".format(save_prefix, it))

    cfr_solver = cfr.CFRSolver(game)
    for it in range(iterations + 1):
        # Order matters: saving before the update makes the snapshot at
        # iteration `it` reflect exactly `it` completed updates.
        if save_every != 0 and it % save_every == 0:
            save_cfr()
        cfr_solver.evaluate_and_update_policy()
    save_cfr()
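
A plausible call site, assuming this module's imports and the policy_handler helper from the same repository:

# Hypothetical usage: 1000 iterations, snapshotting the policy every 100.
game = pyspiel.load_game("kuhn_poker")
CFR_Solving(game, iterations=1000, save_every=100, save_prefix='kuhn')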
Example 8
def main(unused_argv):
    game = pyspiel.load_game("kuhn_poker")
    cfr_solver = cfr.CFRSolver(game)

    episodes = []
    exploits = []
    nashes = []

    # Train the agent for a specific amount of episodes
    for ep in range(FLAGS.num_train_episodes):
        print("Running episode {} of {}".format(ep, FLAGS.num_train_episodes))
        cfr_solver.evaluate_and_update_policy()
        avg_pol = cfr_solver.average_policy()

        # Calculate the exploitability and nash convergence
        expl = exploitability.exploitability(game, avg_pol)
        nash = exploitability.nash_conv(game, avg_pol)

        exploits.append(expl)
        nashes.append(nash)
        episodes.append(ep)

    # Get the average policy
    average_policy = cfr_solver.average_policy()
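    # Expected returns for each player when both follow the average policy.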
    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    cur_pol = cfr_solver.current_policy()

    # Plot the exploitability
    plt.plot(episodes, exploits, "-r", label="Exploitability")
    plt.xscale("log")
    plt.yscale("log")
    plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
    plt.legend(loc="upper right")
    # Save before show(); some backends discard the figure once the
    # window opened by show() is closed.
    plt.savefig("cfr_expl.png")
    plt.show()

    plt.figure()

    # Plot the nash convergence
    plt.plot(episodes, nashes, "-r", label="NashConv")
    plt.xscale("log")
    plt.yscale("log")
    plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
    plt.legend(loc="upper right")
    plt.savefig("cfr_nash.png")
    plt.show()

    print(average_policy)
    print(average_policy_values)
    policy_to_csv(game, average_policy, "./kuhn_policy.csv")
Example 9
def main(_):
  game = pyspiel.load_game("kuhn_poker")

  cfr_solver = cfr.CFRSolver(game)
  iterations = 1000

  for i in range(iterations):
    # evaluate_and_update_policy() updates the solver's policies in place;
    # the Python CFRSolver does not return the game value.
    cfr_solver.evaluate_and_update_policy()
    print("Finished iteration {}".format(i))

  average_policy = cfr_solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  print("Computed player 0 value: {}".format(average_policy_values[0]))
  print("Expected player 0 value: {}".format(-1 / 18))
Example 10
    def test_cfr_cce_ce_dist_goofspiel(self):
        """Copy of the TestCCEDistCFRGoofSpiel in corr_dist_test.cc."""
        game = pyspiel.load_game(
            "turn_based_simultaneous_game(game=goofspiel(num_cards=3,points_order="
            "descending,returns_type=total_points))")
        for num_iterations in [1, 10, 100]:
            policies = []
            cfr_solver = cfr.CFRSolver(game)
            for _ in range(num_iterations):
                cfr_solver.evaluate_and_update_policy()
                policies.append(
                    policy.python_policy_to_pyspiel_policy(
                        cfr_solver.current_policy()))
            mu = pyspiel.uniform_correlation_device(policies)
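            # cce_dist returns the total distance together with per-player
            # deviation incentives and best-response policies.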
            cce_dist_info = pyspiel.cce_dist(game, mu)
            print(
                "goofspiel, cce test num_iters: {}, cce_dist: {}, per player: {}"
                .format(num_iterations, cce_dist_info.dist_value,
                        cce_dist_info.deviation_incentives))
            # Try converting one of the BR policies:
            _ = policy.pyspiel_policy_to_python_policy(
                game, cce_dist_info.best_response_policies[0])

            # Assemble the same correlation device manually, as an example of
            # how to build non-uniform distributions and to test that the
            # Python bindings for lists of tuples work properly.
            uniform_prob = 1.0 / len(policies)
            mu2 = [(uniform_prob, policy) for policy in policies]
            cce_dist_info2 = pyspiel.cce_dist(game, mu2)
            self.assertAlmostEqual(cce_dist_info2.dist_value,
                                   sum(cce_dist_info.deviation_incentives))
            # Also exercise the CEDist function. The exact determinization is
            # disabled here because it takes too long for a test.
            # ce_dist_info = pyspiel.ce_dist(game, pyspiel.determinize_corr_dev(mu))
            ce_dist_info = pyspiel.ce_dist(
                game, pyspiel.sampled_determinize_corr_dev(mu, 100))
            print(
                "goofspiel, ce test num_iters: {}, ce_dist: {}, per player: {}"
                .format(num_iterations, ce_dist_info.dist_value,
                        ce_dist_info.deviation_incentives))
            print("number of conditional best responses per player:")
            for p in range(game.num_players()):
                print("  player {}, num: {}".format(
                    p,
                    len(ce_dist_info.conditional_best_response_policies[p])))
Example 11
def counterfactual_regret_minimization(seq_game,
                                       number_of_iterations,
                                       compute_metrics=False):
    # freq_iteration_printing = number_of_iterations // 10
    cfr_solver = cfr.CFRSolver(seq_game)
    tick_time = time.time()
    # print("CFRSolver initialized.")
    for _ in range(number_of_iterations):
        cfr_solver.evaluate_and_update_policy()
        # if i % freq_iteration_printing == 0:
        #   print(f"Iteration {i}")
    timing = time.time() - tick_time
    # print("Finish.")
    if compute_metrics:
        nash_conv = exploitability.nash_conv(seq_game,
                                             cfr_solver.average_policy())
        return timing, cfr_solver.average_policy(), nash_conv
    return timing, cfr_solver.average_policy()
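
A possible call, assuming a sequential (turn-based) game:

# Hypothetical usage: 1000 CFR iterations on Kuhn poker, reporting NashConv.
seq_game = pyspiel.load_game("kuhn_poker")
timing, avg_policy, nash_conv = counterfactual_regret_minimization(
    seq_game, 1000, compute_metrics=True)
print("seconds: {}, nash_conv: {}".format(timing, nash_conv))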
Example 12
                print(f"saving to: {save_prefix + '_times.npy'}")
                np.save(save_prefix + '_times', np.array(times))
                print(f"saving to: {save_prefix + '_exps.npy'}")
                np.save(save_prefix + '_exps', np.array(exps))
                print(f"saving to: {save_prefix + '_episodes.npy'}")
                np.save(save_prefix + '_episodes', np.array(episodes))
                if algorithm == 'cfr':
                    cfr_infostates.append(solver.num_infostates_expanded)
                    print("Num infostates expanded (mil): ",
                          solver.num_infostates_expanded / 1e6)
                    print(f"saving to: {save_prefix + '_infostates.npy'}")
                    np.save(save_prefix + '_infostates',
                            np.array(cfr_infostates))

    if algorithm == 'cfr':
        solver = cfr.CFRSolver(game)
        run(solver, iterations)
    elif algorithm == 'xfp':
        solver = fictitious_play.XFPSolver(game)
        run(solver, iterations)
    elif algorithm == 'xdo':
        brs = []
        info_test = []
        for i in range(2):
            br_info = exploitability.best_response(
                game,
                cfr.CFRSolver(game).average_policy(), i)
            full_br_policy = _full_best_response_policy(
                br_info["best_response_action"])
            info_sets = br_info['info_sets']
            info_test.append(info_sets)