def test_rcfr_with_buffer(self):
  buffer_size = 12
  num_epochs = 100
  num_iterations = 2
  models = [_new_model() for _ in range(_GAME.num_players())]

  patient = rcfr.ReservoirRcfrSolver(_GAME, models, buffer_size=buffer_size)

  def _train(model, data):
    data = torch.utils.data.DataLoader(
        data, batch_size=_BATCH_SIZE, shuffle=True)
    loss_fn = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, amsgrad=True)
    for _ in range(num_epochs):
      for x, y in data:
        optimizer.zero_grad()
        output = model(x)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()

  average_policy = patient.average_policy()
  self.assertGreater(pyspiel.nash_conv(_GAME, average_policy), 0.91)

  for _ in range(num_iterations):
    patient.evaluate_and_update_policy(_train)

  average_policy = patient.average_policy()
  self.assertLess(pyspiel.nash_conv(_GAME, average_policy), 0.91)
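# NOTE: The RCFR tests in this listing reference module-level fixtures
# (_GAME, _BATCH_SIZE, _new_model) defined elsewhere in the test files.
# Below is a minimal sketch of plausible definitions for the PyTorch variant;
# it assumes Kuhn poker and the DeepRcfrModel constructor from
# open_spiel.python.pytorch.rcfr, and the layer sizes are illustrative only.
import numpy as np
import torch
from torch import nn

import pyspiel
from open_spiel.python.pytorch import rcfr

_GAME = pyspiel.load_game("kuhn_poker")  # small, exactly solvable game
_BATCH_SIZE = 12


def _new_model():
  # Small regression net mapping sequence features to counterfactual regrets.
  return rcfr.DeepRcfrModel(
      _GAME,
      num_hidden_layers=1,
      num_hidden_units=13,
      num_hidden_factors=1,
      use_skip_connections=True)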
def test_cfr(self):
  root = rcfr.RootStateWrapper(_GAME.new_initial_state())
  num_half_iterations = 6

  cumulative_regrets = [np.zeros(n) for n in root.num_player_sequences]
  cumulative_reach_weights = [np.zeros(n) for n in root.num_player_sequences]

  average_profile = root.sequence_weights_to_tabular_profile(
      cumulative_reach_weights)
  self.assertGreater(pyspiel.nash_conv(_GAME, average_profile), 0.91)

  regret_player = 0
  for _ in range(num_half_iterations):
    reach_weights_player = 1 if regret_player == 0 else 0

    regrets, reach = root.counterfactual_regrets_and_reach_weights(
        regret_player, reach_weights_player, *rcfr.relu(cumulative_regrets))

    cumulative_regrets[regret_player] += regrets
    cumulative_reach_weights[reach_weights_player] += reach

    regret_player = reach_weights_player

  average_profile = root.sequence_weights_to_tabular_profile(
      cumulative_reach_weights)
  self.assertLess(pyspiel.nash_conv(_GAME, average_profile), 0.27)
def test_rcfr_with_buffer(self):
  buffer_size = 12
  num_epochs = 100
  num_iterations = 2
  models = [_new_model() for _ in range(_GAME.num_players())]

  patient = rcfr.ReservoirRcfrSolver(_GAME, models, buffer_size=buffer_size)

  def _train(model, data):
    data = data.shuffle(12)
    data = data.batch(12)
    data = data.repeat(num_epochs)

    optimizer = tf.keras.optimizers.Adam(lr=0.005, amsgrad=True)

    for x, y in data:
      optimizer.minimize(
          lambda: tf.losses.huber_loss(y, model(x)),  # pylint: disable=cell-var-from-loop
          model.trainable_variables)

  average_policy = patient.average_policy()
  self.assertGreater(pyspiel.nash_conv(_GAME, average_policy), 0.91)

  for _ in range(num_iterations):
    patient.evaluate_and_update_policy(_train)

  average_policy = patient.average_policy()
  self.assertLess(pyspiel.nash_conv(_GAME, average_policy), 0.91)
def test_rcfr_functions(self):
  models = [_new_model() for _ in range(_GAME.num_players())]
  root = rcfr.RootStateWrapper(_GAME.new_initial_state())

  num_half_iterations = 4
  num_epochs = 100

  cumulative_regrets = [np.zeros(n) for n in root.num_player_sequences]
  cumulative_reach_weights = [np.zeros(n) for n in root.num_player_sequences]

  average_profile = root.sequence_weights_to_tabular_profile(
      cumulative_reach_weights)
  self.assertGreater(pyspiel.nash_conv(_GAME, average_profile), 0.91)

  regret_player = 0
  sequence_weights = [
      model(root.sequence_features[player]).numpy()
      for player, model in enumerate(models)
  ]

  for _ in range(num_half_iterations):
    reach_weights_player = 1 if regret_player == 0 else 0

    sequence_weights[reach_weights_player] = models[reach_weights_player](
        root.sequence_features[reach_weights_player]).numpy()

    regrets, seq_probs = root.counterfactual_regrets_and_reach_weights(
        regret_player, reach_weights_player, *sequence_weights)

    cumulative_regrets[regret_player] += regrets
    cumulative_reach_weights[reach_weights_player] += seq_probs

    data = tf.data.Dataset.from_tensor_slices(
        (root.sequence_features[regret_player],
         tf.expand_dims(cumulative_regrets[regret_player], axis=1)))
    data = data.shuffle(12)
    data = data.batch(12)
    data = data.repeat(num_epochs)

    optimizer = tf.keras.optimizers.Adam(lr=0.005, amsgrad=True)

    for x, y in data:
      optimizer.minimize(
          lambda: tf.compat.v1.losses.huber_loss(y, models[regret_player](x)),  # pylint: disable=cell-var-from-loop
          models[regret_player].trainable_variables)

    regret_player = reach_weights_player

  average_profile = root.sequence_weights_to_tabular_profile(
      cumulative_reach_weights)
  self.assertLess(pyspiel.nash_conv(_GAME, average_profile), 0.91)
def test_rcfr_functions(self):
  models = [_new_model() for _ in range(_GAME.num_players())]
  root = rcfr.RootStateWrapper(_GAME.new_initial_state())

  num_half_iterations = 4
  num_epochs = 100

  cumulative_regrets = [np.zeros(n) for n in root.num_player_sequences]
  cumulative_reach_weights = [np.zeros(n) for n in root.num_player_sequences]

  average_profile = root.sequence_weights_to_tabular_profile(
      cumulative_reach_weights)
  self.assertGreater(pyspiel.nash_conv(_GAME, average_profile), 0.91)

  regret_player = 0
  sequence_weights = [
      model(root.sequence_features[player]).detach().numpy()
      for player, model in enumerate(models)
  ]

  for _ in range(num_half_iterations):
    reach_weights_player = 1 if regret_player == 0 else 0

    sequence_weights[reach_weights_player] = models[reach_weights_player](
        root.sequence_features[reach_weights_player]).detach().numpy()

    regrets, seq_probs = root.counterfactual_regrets_and_reach_weights(
        regret_player, reach_weights_player, *sequence_weights)

    cumulative_regrets[regret_player] += regrets
    cumulative_reach_weights[reach_weights_player] += seq_probs

    data = torch.utils.data.TensorDataset(
        root.sequence_features[regret_player],
        torch.unsqueeze(
            torch.Tensor(cumulative_regrets[regret_player]), dim=1))
    data = torch.utils.data.DataLoader(
        data, batch_size=_BATCH_SIZE, shuffle=True)

    loss_fn = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(
        models[regret_player].parameters(), lr=0.005, amsgrad=True)

    for _ in range(num_epochs):
      for x, y in data:
        optimizer.zero_grad()
        output = models[regret_player](x)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()

    regret_player = reach_weights_player

  average_profile = root.sequence_weights_to_tabular_profile(
      cumulative_reach_weights)
  self.assertLess(pyspiel.nash_conv(_GAME, average_profile), 0.91)
def test_python_same_as_cpp_for_multiplayer_uniform_random_nash_conv(
    self, game_name, num_players):
  game = pyspiel.load_game(game_name, {"players": num_players})

  # TabularPolicy defaults to being a uniform random policy.
  test_policy = policy.TabularPolicy(game)
  python_nash_conv = exploitability.nash_conv(game, test_policy)
  cpp_nash_conv = pyspiel.nash_conv(
      game, policy_utils.policy_to_dict(test_policy, game))
  self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
def test_cpp_python_cfr_kuhn(self):
  game = pyspiel.load_game("kuhn_poker")
  solver = pyspiel.CFRSolver(game)
  for _ in range(100):
    solver.evaluate_and_update_policy()

  pyspiel_average_policy = solver.tabular_average_policy()
  cpp_nash_conv = pyspiel.nash_conv(game, pyspiel_average_policy)

  python_policy = policy.pyspiel_policy_to_python_policy(
      game, pyspiel_average_policy)
  python_nash_conv = exploitability.nash_conv(game, python_policy)

  self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
def test_neurd(self):
  num_iterations = 2
  models = [_new_model() for _ in range(_GAME.num_players())]

  solver = neurd.CounterfactualNeurdSolver(_GAME, models)

  average_policy = solver.average_policy()
  self.assertGreater(pyspiel.nash_conv(_GAME, average_policy), 0.91)

  @tf.function
  def _train(model, data):
    neurd.train(
        model=model,
        data=data,
        batch_size=12,
        step_size=10.0,
        autoencoder_loss=tf.losses.huber_loss)

  for _ in range(num_iterations):
    solver.evaluate_and_update_policy(_train)

  average_policy = solver.average_policy()
  self.assertLess(pyspiel.nash_conv(_GAME, average_policy), 0.91)
def test_cpp_and_python_cfr_br(self, game, solver_cls,
                               expected_exploitability):
  solver = solver_cls(game)
  for step in range(5):
    solver.evaluate_and_update_policy()

    # We do not compare the policy directly as we do not have an easy way to
    # convert one to the other, so we use the exploitability as a proxy.
    avg_policy = solver.average_policy()
    if solver_cls == pyspiel.CFRBRSolver:
      exploitability_ = pyspiel.nash_conv(game, avg_policy)
    else:
      exploitability_ = exploitability.nash_conv(game, avg_policy)
    self.assertEqual(expected_exploitability[step], exploitability_)
def test_cpp_algorithms_identical_to_python_algorithm(self, game, cpp_class,
                                                      python_class):
  cpp_solver = cpp_class(game)
  python_solver = python_class(game)

  for _ in range(5):
    cpp_solver.evaluate_and_update_policy()
    python_solver.evaluate_and_update_policy()

  cpp_avg_policy = cpp_solver.average_policy()
  python_avg_policy = python_solver.average_policy()

  # We do not compare the policy directly as we do not have an easy way to
  # convert one to the other, so we use the exploitability as a proxy.
  cpp_expl = pyspiel.nash_conv(game, cpp_avg_policy)
  python_expl = exploitability.nash_conv(game, python_avg_policy)
  self.assertEqual(cpp_expl, python_expl)

  # We also check the current policy, to verify it gives the same results too.
  cpp_current_policy = cpp_solver.current_policy()
  python_current_policy = python_solver.current_policy()
  cpp_expl = pyspiel.nash_conv(game, cpp_current_policy)
  python_expl = exploitability.nash_conv(game, python_current_policy)
  self.assertEqual(cpp_expl, python_expl)
def run_iterations(game, solver, start_iteration=0):
  """Run iterations of MCCFR."""
  for i in range(int(FLAGS.iterations / 2)):
    solver.run_iteration()
    policy = solver.average_policy()
    exploitability = pyspiel.exploitability(game, policy)

    # We also compute NashConv to highlight an important API feature: when
    # using Monte Carlo sampling, the policy may not have a table entry for
    # every info state. Therefore, when calling nash_conv, ensure the third
    # argument, "use_state_get_policy", is set to True.
    # See https://github.com/deepmind/open_spiel/issues/500
    nash_conv = pyspiel.nash_conv(game, policy, True)

    print("Iteration {} nashconv: {:.6f} exploitability: {:.6f}".format(
        start_iteration + i, nash_conv, exploitability))
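# A hedged sketch of how run_iterations might be driven: build a game and a
# Monte Carlo CFR solver from pyspiel, run the first half of the iterations,
# then continue from where the counter left off. The solver choice, game, and
# the main_sketch name are assumptions for illustration, not the original
# example's main function.
def main_sketch():
  game = pyspiel.load_game("kuhn_poker")
  solver = pyspiel.OutcomeSamplingMCCFRSolver(game)
  # First half of FLAGS.iterations.
  run_iterations(game, solver)
  # Second half, with the iteration counter offset for logging.
  run_iterations(game, solver, start_iteration=int(FLAGS.iterations / 2))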
def test_matching_pennies_3p(self):
  game = pyspiel.load_game_as_turn_based('matching_pennies_3p')
  deep_cfr_solver = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(16, 8),
      advantage_network_layers=(32, 16),
      num_iterations=2,
      num_traversals=2,
      learning_rate=1e-3,
      batch_size_advantage=None,
      batch_size_strategy=None,
      memory_capacity=1e7)
  deep_cfr_solver.solve()
  conv = pyspiel.nash_conv(
      game,
      policy.python_policy_to_pyspiel_policy(
          policy.tabular_policy_from_callable(
              game, deep_cfr_solver.action_probabilities)))
  logging.info('Deep CFR in Matching Pennies 3p. NashConv: %.2f', conv)
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name)
  deep_cfr_solver = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(32, 32),
      advantage_network_layers=(16, 16),
      num_iterations=FLAGS.num_iterations,
      num_traversals=FLAGS.num_traversals,
      learning_rate=1e-3,
      batch_size_advantage=None,
      batch_size_strategy=None,
      memory_capacity=int(1e7))
  _, advantage_losses, policy_loss = deep_cfr_solver.solve()
  for player, losses in six.iteritems(advantage_losses):
    logging.info("Advantage for player %d: %s", player,
                 losses[:2] + ["..."] + losses[-2:])
    logging.info("Advantage Buffer Size for player %s: '%s'", player,
                 len(deep_cfr_solver.advantage_buffers[player]))
  logging.info("Strategy Buffer Size: '%s'",
               len(deep_cfr_solver.strategy_buffer))
  logging.info("Final policy loss: '%s'", policy_loss)

  average_policy = policy.tabular_policy_from_callable(
      game, deep_cfr_solver.action_probabilities)
  pyspiel_policy = policy.python_policy_to_pyspiel_policy(average_policy)
  conv = pyspiel.nash_conv(game, pyspiel_policy)
  logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)

  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  logging.info("Computed player 0 value: %.2f (expected: %.2f).",
               average_policy_values[0], -1 / 18)
  logging.info("Computed player 1 value: %.2f (expected: %.2f).",
               average_policy_values[1], 1 / 18)