def test_deep_cfr_runs(self, game_name):
  # Smoke test: tiny networks and very few iterations/traversals, only checks
  # that solve() runs end to end on the given game.
  game = pyspiel.load_game(game_name)
  deep_cfr_solver = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(8, 4),
      advantage_network_layers=(4, 2),
      num_iterations=2,
      num_traversals=2,
      learning_rate=1e-3,
      batch_size_advantage=8,
      batch_size_strategy=8,
      memory_capacity=1e7)
  deep_cfr_solver.solve()
def test_matching_pennies_3p(self):
  # We don't expect Deep CFR to necessarily converge on 3-player games, but
  # it's nonetheless interesting to see this result.
  game = pyspiel.load_game_as_turn_based('matching_pennies_3p')
  deep_cfr_solver = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(16, 8),
      advantage_network_layers=(32, 16),
      num_iterations=2,
      num_traversals=2,
      learning_rate=1e-3,
      batch_size_advantage=8,
      batch_size_strategy=8,
      memory_capacity=1e7)
  deep_cfr_solver.solve()
  conv = exploitability.nash_conv(
      game,
      policy.tabular_policy_from_callable(
          game, deep_cfr_solver.action_probabilities))
  print('Deep CFR in Matching Pennies 3p. NashConv: {}'.format(conv))
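# A minimal sketch (assuming the same pyspiel/deep_cfr imports used by the
# tests above) of how the learned average policy can be queried directly for a
# single information state once solve() has finished. The helper name
# example_query_average_policy is illustrative, not part of the library.
def example_query_average_policy():
  game = pyspiel.load_game('kuhn_poker')
  solver = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(8, 4),
      advantage_network_layers=(4, 2),
      num_iterations=2,
      num_traversals=2,
      learning_rate=1e-3,
      batch_size_advantage=8,
      batch_size_strategy=8,
      memory_capacity=1e7)
  solver.solve()
  state = game.new_initial_state()
  while state.is_chance_node():
    # Resolve chance nodes (the card deals) with the first listed outcome.
    state.apply_action(state.chance_outcomes()[0][0])
  # action_probabilities returns an {action: probability} dict for the state,
  # which is the same callable wrapped by tabular_policy_from_callable above.
  print(solver.action_probabilities(state))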
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name)
  deep_cfr_solver = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(64, 64, 64),
      advantage_network_layers=(64, 64, 64),
      num_iterations=FLAGS.num_iterations,
      num_traversals=FLAGS.num_traversals,
      learning_rate=1e-3,
      batch_size_advantage=2048,
      batch_size_strategy=2048,
      memory_capacity=1e7,
      policy_network_train_steps=5000,
      advantage_network_train_steps=750,
      reinitialize_advantage_networks=True)
  # solve() returns the trained policy network, the per-player advantage
  # losses, and the final policy loss.
  _, advantage_losses, policy_loss = deep_cfr_solver.solve()
  for player, losses in advantage_losses.items():
    logging.info("Advantage for player %d: %s", player,
                 losses[:2] + ["..."] + losses[-2:])
    logging.info("Advantage Buffer Size for player %s: '%s'", player,
                 len(deep_cfr_solver.advantage_buffers[player]))
  logging.info("Strategy Buffer Size: '%s'",
               len(deep_cfr_solver.strategy_buffer))
  logging.info("Final policy loss: '%s'", policy_loss)
  average_policy = policy.tabular_policy_from_callable(
      game, deep_cfr_solver.action_probabilities)
  conv = exploitability.nash_conv(game, average_policy)
  logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  print("Computed player 0 value: {}".format(average_policy_values[0]))
  print("Computed player 1 value: {}".format(average_policy_values[1]))
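# main() above is driven by absl flags and modules imported at the top of the
# example. A minimal sketch of the assumed preamble and entry point follows;
# the flag defaults and help strings are illustrative assumptions, not taken
# from the original file.
from absl import app
from absl import flags
from absl import logging

from open_spiel.python import policy
from open_spiel.python.algorithms import deep_cfr
from open_spiel.python.algorithms import exploitability
from open_spiel.python.algorithms import expected_game_score
import pyspiel

FLAGS = flags.FLAGS

flags.DEFINE_string("game_name", "kuhn_poker", "Name of the game to solve.")
flags.DEFINE_integer("num_iterations", 100, "Number of Deep CFR iterations.")
flags.DEFINE_integer("num_traversals", 40,
                     "Number of game traversals per iteration.")

if __name__ == "__main__":
  app.run(main)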