def main(_):
  """Trains a CFR-family solver, checkpoints it via pickle, then resumes.

  Trains for half the requested iterations, persists the solver to
  `<solver>_solver.pickle`, reloads it, and finishes training on the
  restored copy, printing exploitability at every step.
  """
  game = pyspiel.load_game(
      FLAGS.game,
      {"players": FLAGS.players},
  )

  # Pick the tabular solver implementation requested on the command line.
  if FLAGS.solver == "cfr":
    solver = pyspiel.CFRSolver(game)
  elif FLAGS.solver == "cfrplus":
    solver = pyspiel.CFRPlusSolver(game)
  elif FLAGS.solver == "cfrbr":
    solver = pyspiel.CFRBRSolver(game)

  half = int(FLAGS.iterations / 2)
  for step in range(half):
    solver.evaluate_and_update_policy()
    print("Iteration {} exploitability: {:.6f}".format(
        step, pyspiel.exploitability(game, solver.average_policy())))

  model_path = "{}_solver.pickle".format(FLAGS.solver)

  print("Persisting the model...")
  with open(model_path, "wb") as f:
    pickle.dump(solver, f, pickle.HIGHEST_PROTOCOL)

  print("Loading the model...")
  with open(model_path, "rb") as f:
    loaded_solver = pickle.load(f)
  print("Exploitability of the loaded model: {:.6f}".format(
      pyspiel.exploitability(game, loaded_solver.average_policy())))

  # Continue training from the restored state for the remaining iterations.
  for step in range(half):
    loaded_solver.evaluate_and_update_policy()
    print("Iteration {} exploitability: {:.6f}".format(
        half + step,
        pyspiel.exploitability(game, loaded_solver.average_policy())))
def main(_):
  """Trains an MCCFR solver, pickles it, and resumes training on the copy.

  Runs half the iterations, persists the solver, reloads it, and finishes
  training on the *restored* solver, printing exploitability throughout.
  """
  game = pyspiel.load_game(
      FLAGS.game,
      {"players": pyspiel.GameParameter(FLAGS.players)},
  )

  # Pick the Monte Carlo sampling scheme requested on the command line.
  if FLAGS.sampling == "external":
    solver = pyspiel.ExternalSamplingMCCFRSolver(
        game,
        avg_type=pyspiel.MCCFRAverageType.FULL,
    )
  elif FLAGS.sampling == "outcome":
    solver = pyspiel.OutcomeSamplingMCCFRSolver(game)

  for i in range(int(FLAGS.iterations / 2)):
    solver.run_iteration()
    print("Iteration {} exploitability: {:.6f}".format(
        i, pyspiel.exploitability(game, solver.average_policy())))

  print("Persisting the model...")
  with open(MODEL_FILE_NAME.format(FLAGS.sampling), "wb") as file:
    pickle.dump(solver, file, pickle.HIGHEST_PROTOCOL)

  print("Loading the model...")
  with open(MODEL_FILE_NAME.format(FLAGS.sampling), "rb") as file:
    loaded_solver = pickle.load(file)
  print("Exploitability of the loaded model: {:.6f}".format(
      pyspiel.exploitability(game, loaded_solver.average_policy())))

  # BUG FIX: the second half of training must run on the *restored* solver.
  # The original kept iterating (and reporting) `solver`, so the loaded
  # model was never trained and the printout did not describe it.
  for i in range(int(FLAGS.iterations / 2)):
    loaded_solver.run_iteration()
    print("Iteration {} exploitability: {:.6f}".format(
        int(FLAGS.iterations / 2) + i,
        pyspiel.exploitability(game, loaded_solver.average_policy())))
def main(_):
  """Trains per-player NeuRD models and periodically prints exploitability."""
  game = pyspiel.load_game(FLAGS.game,
                           {"players": pyspiel.GameParameter(FLAGS.players)})

  # One network per player.
  models = [
      neurd.DeepNeurdModel(
          game,
          num_hidden_layers=FLAGS.num_hidden_layers,
          num_hidden_units=FLAGS.num_hidden_units,
          num_hidden_factors=FLAGS.num_hidden_factors,
          use_skip_connections=FLAGS.use_skip_connections,
          autoencode=FLAGS.autoencode) for _ in range(game.num_players())
  ]
  solver = neurd.CounterfactualNeurdSolver(game, models)

  def _train(model, data):
    """Fits `model` to one iteration's data with the flag hyperparameters."""
    neurd.train(
        model,
        data,
        batch_size=FLAGS.batch_size,
        step_size=FLAGS.step_size,
        threshold=FLAGS.threshold,
        autoencoder_loss=(tf.compat.v1.losses.huber_loss
                          if FLAGS.autoencode else None))

  for iteration in range(FLAGS.iterations):
    solver.evaluate_and_update_policy(_train)
    if iteration % FLAGS.print_freq == 0:
      conv = pyspiel.exploitability(game, solver.average_policy())
      print("Iteration {} exploitability {}".format(iteration, conv))
def test_exploitability_uniform_random_cc(self):
  """Checks the exploitability of the uniform random policy using C++."""
  game = pyspiel.load_game("python_kuhn_poker")
  uniform = pyspiel.UniformRandomPolicy(game)
  # Exploitability is NashConv / num_players; for uniform play on Kuhn
  # poker the expected NashConv is 11/12.
  expected_nash_conv = 11 / 12
  self.assertAlmostEqual(
      pyspiel.exploitability(game, uniform), expected_nash_conv / 2)
def main(_):
  """Trains NeuRD with an optionally adaptive alpha regularizer."""
  tensorflow.random.set_random_seed(int(FLAGS.random_seed))
  game = pyspiel.load_game(FLAGS.game,
                           {"players": pyspiel.GameParameter(FLAGS.players)})

  # One network per player.
  models = [
      neurd.DeepNeurdModel(
          game,
          num_hidden_layers=FLAGS.num_hidden_layers,
          num_hidden_units=FLAGS.num_hidden_units,
          num_hidden_factors=FLAGS.num_hidden_factors,
          use_skip_connections=FLAGS.use_skip_connections,
          autoencode=FLAGS.autoencode) for _ in range(game.num_players())
  ]
  solver = neurd.CounterfactualNeurdSolver(game, FLAGS.alpha, models)

  def _train(model, data):
    """Fits `model` to one iteration's data with the flag hyperparameters."""
    neurd.train(
        model,
        data,
        batch_size=FLAGS.batch_size,
        step_size=FLAGS.step_size,
        alpha=FLAGS.alpha,
        threshold=FLAGS.threshold,
        autoencoder_loss=(tf.compat.v1.losses.huber_loss
                          if FLAGS.autoencode else None))

  # Sentinel exploitability; refreshed every `print_freq` iterations and fed
  # back into the adaptive-alpha update.
  conv = 100
  for i in range(FLAGS.iterations):
    if FLAGS.adaptive_alpha:
      # Pass the iteration index so the solver can adapt alpha over time.
      solver.evaluate_and_update_policy(
          _train,
          current_iteration=i,
          alpha=FLAGS.alpha,
          increase=FLAGS.increase,
          gamma=FLAGS.gamma,
          adaptive_policy=FLAGS.adaptive_policy,
          total_iteration=FLAGS.iterations,
          semi_percent=FLAGS.semi_percent,
          exploit_rate=FLAGS.exploit_rate,
          conv=conv,
          exp_exploit_rate=FLAGS.exp_exploit_rate)
    else:
      solver.evaluate_and_update_policy(_train, alpha=FLAGS.alpha)
    if i % FLAGS.print_freq == 0:
      conv = pyspiel.exploitability(game, solver.average_policy())
      print("Iteration {} exploitability {}".format(i, conv))
def rcfr_train(unused_arg):
  """Trains an RCFR agent, logging exploitability every 100 iterations.

  Side effects: writes the exploitability history to a pickled `.dat` file
  and one policy CSV per player under `policies/`.
  """
  tf.enable_eager_execution()
  game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})

  models = [
      rcfr.DeepRcfrModel(
          game,
          num_hidden_layers=1,
          # Leduc poker gets a wider network than the other games.
          num_hidden_units=64 if FLAGS.game == "leduc_poker" else 13,
          num_hidden_factors=1,
          use_skip_connections=True) for _ in range(game.num_players())
  ]
  patient = rcfr.RcfrSolver(game, models, False, True)

  exploit_history = list()
  exploit_idx = list()

  def _train(model, data):
    """Fits `model` to one iteration's regret targets."""
    data = data.shuffle(1000)
    data = data.batch(12)
    optimizer = tf.keras.optimizers.Adam(lr=0.005, amsgrad=True)
    for x, y in data:
      optimizer.minimize(
          lambda: tf.losses.huber_loss(y, model(x)),  # pylint: disable=cell-var-from-loop
          model.trainable_variables)

  agent_name = "rcfr"
  checkpoint = datetime.now()
  for iteration in range(FLAGS.episodes):
    if (iteration % 100) == 0:
      delta = datetime.now() - checkpoint
      conv = pyspiel.exploitability(game, patient.average_policy())
      exploit_idx.append(iteration)
      exploit_history.append(conv)
      print(
          "[RCFR] Iteration {} exploitability {} - {} seconds since last checkpoint"
          .format(iteration, conv, delta.seconds))
      checkpoint = datetime.now()
    patient.evaluate_and_update_policy(_train)

  # BUG FIX: close the history file deterministically; the original passed a
  # bare open(...) to pickle.dump and leaked the handle.
  history_path = (
      FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) + ".dat")
  with open(history_path, "wb") as history_file:
    pickle.dump([exploit_idx, exploit_history], history_file)

  now = datetime.now()
  policy = patient.average_policy()
  # BUG FIX: iterate over real player indices. The original looped over
  # [1, 2] and, with str(pid + 1), labelled the files "2" and "3" for a
  # two-player game; this writes labels "1" and "2".
  for pid in range(game.num_players()):
    policy_to_csv(
        game, policy, "policies/policy_" + now.strftime("%m-%d-%Y_%H-%M") +
        "_" + agent_name + "_" + str(pid + 1) + "_+" + str(FLAGS.episodes) +
        "episodes.csv")
def main(_):
  """Trains RCFR with either a reservoir-buffer or full-history solver."""
  game = pyspiel.load_game(FLAGS.game,
                           {"players": pyspiel.GameParameter(FLAGS.players)})

  # One network per player.
  models = [
      rcfr.DeepRcfrModel(
          game,
          num_hidden_layers=FLAGS.num_hidden_layers,
          num_hidden_units=FLAGS.num_hidden_units,
          num_hidden_factors=FLAGS.num_hidden_factors,
          use_skip_connections=FLAGS.use_skip_connections)
      for _ in range(game.num_players())
  ]

  # A positive buffer size selects reservoir sampling; otherwise keep the
  # full history.
  if FLAGS.buffer_size > 0:
    solver = rcfr.ReservoirRcfrSolver(
        game,
        models,
        FLAGS.buffer_size,
        truncate_negative=FLAGS.truncate_negative)
  else:
    solver = rcfr.RcfrSolver(
        game,
        models,
        truncate_negative=FLAGS.truncate_negative,
        bootstrap=FLAGS.bootstrap)

  def _train_fn(model, data):
    """Train `model` on `data`."""
    data = data.shuffle(FLAGS.batch_size * 10)
    data = data.batch(FLAGS.batch_size)
    data = data.repeat(FLAGS.num_epochs)

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.step_size, amsgrad=True)

    @tf.function
    def _train():
      for x, y in data:
        optimizer.minimize(
            lambda: tf.compat.v1.losses.huber_loss(y, model(x), delta=0.01),  # pylint: disable=cell-var-from-loop
            model.trainable_variables)

    _train()
    # End of _train_fn

  for iteration in range(FLAGS.iterations):
    solver.evaluate_and_update_policy(_train_fn)
    if iteration % FLAGS.print_freq == 0:
      conv = pyspiel.exploitability(game, solver.average_policy())
      print("Iteration {} exploitability {}".format(iteration, conv))
def neurd_train(unused_arg):
  """Trains NeuRD on a two-player game, saves policy CSVs, plots progress.

  Side effects: writes one policy CSV per player under `policies/` and
  shows a log-log exploitability plot.
  """
  tf.enable_eager_execution()
  game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})

  models = [
      neurd.DeepNeurdModel(
          game,
          num_hidden_layers=1,
          num_hidden_units=13,
          num_hidden_factors=8,
          use_skip_connections=True,
          autoencode=False) for _ in range(game.num_players())
  ]
  solver = neurd.CounterfactualNeurdSolver(game, models)

  def _train(model, data):
    """Fits `model` with fixed NeuRD hyperparameters (no autoencoder)."""
    neurd.train(
        model,
        data,
        batch_size=100,
        step_size=1,
        threshold=2,
        autoencoder_loss=None)

  exploit_history = list()
  for ep in range(FLAGS.episodes):
    solver.evaluate_and_update_policy(_train)
    if ep % 100 == 0:
      conv = pyspiel.exploitability(game, solver.average_policy())
      exploit_history.append(conv)
      print("Iteration {} exploitability {}".format(ep, conv))

  now = datetime.now()
  policy = solver.average_policy()
  agent_name = "neurd"
  # BUG FIX: loop over real player indices; the original iterated [1, 2]
  # and, with str(pid + 1), labelled the files "2" and "3" for a
  # two-player game.
  # NOTE(review): `ep` below is the *last* loop index (episodes - 1) —
  # confirm this, rather than FLAGS.episodes, is the intended label.
  for pid in range(game.num_players()):
    policy_to_csv(
        game, policy, "policies/policy_" + now.strftime("%m-%d-%Y_%H-%M") +
        "_" + agent_name + "_" + str(pid + 1) + "_+" + str(ep) +
        "episodes.csv")

  plt.plot(range(len(exploit_history)), exploit_history)
  plt.ylim(0.01, 1)
  plt.yticks([1, 0.1, 0.01])
  plt.yscale("log")
  plt.xscale("log")
  plt.show()
def run_iterations(game, solver, start_iteration=0):
  """Run iterations of MCCFR.

  Runs `FLAGS.iterations / 2` solver iterations, printing NashConv and
  exploitability for each, with printed indices offset by
  `start_iteration`.
  """
  half = int(FLAGS.iterations / 2)
  for offset in range(half):
    solver.run_iteration()
    policy = solver.average_policy()
    exploitability = pyspiel.exploitability(game, policy)

    # We also compute NashConv to highlight an important API feature: when
    # using Monte Carlo sampling, the policy may not have a table entry for
    # every info state. Therefore, when calling nash_conv, ensure the third
    # argument, "use_state_get_policy", is set to True.
    # See https://github.com/deepmind/open_spiel/issues/500
    nash_conv = pyspiel.nash_conv(game, policy, True)

    print("Iteration {} nashconv: {:.6f} exploitability: {:.6f}".format(
        start_iteration + offset, nash_conv, exploitability))
def main(_):
  """Runs an MCCFR solver, printing exploitability after every iteration."""
  game = pyspiel.load_game(
      FLAGS.game,
      {"players": pyspiel.GameParameter(FLAGS.players)},
  )

  # Pick the Monte Carlo sampling scheme requested on the command line.
  if FLAGS.sampling == "external":
    solver = pyspiel.ExternalSamplingMCCFRSolver(
        game,
        avg_type=pyspiel.MCCFRAverageType.FULL,
    )
  elif FLAGS.sampling == "outcome":
    solver = pyspiel.OutcomeSamplingMCCFRSolver(game)

  for iteration in range(FLAGS.iterations):
    solver.run_iteration()
    print("Iteration {} exploitability: {:.6f}".format(
        iteration,
        pyspiel.exploitability(game, solver.average_policy())))