def cfr(game, num_iters=10000):
    """Run the CFR loop on ``game`` and return the average strategy.

    Each iteration calls ``cfr_recursive`` once per player (1 and 2) from
    ``game.game.root``, then recomputes the average strategy from the
    accumulated action counts.

    Every 100 iterations the average strategy is compared with the snapshot
    taken at the previous checkpoint; if ``compare_strategies`` reports a
    distance below 1e-5 the function returns early. Every 1000 iterations
    the exploitability of the current (not average) strategy is printed.

    Args:
        game: Object exposing a ``game`` attribute that provides ``root``
            and ``complete_strategy_randomly``; it is also passed through
            unchanged to ``cfr_recursive``.
        num_iters: Maximum number of iterations to run.

    Returns:
        The most recent result of ``compute_average_strategy``, or ``None``
        when ``num_iters`` is 0 because the loop body never runs.
    """
    # regrets maps each information set to a dictionary from the actions
    # available there to the counterfactual regret for not playing that
    # action. Information sets encode the player, so one dictionary
    # suffices for both players.
    regrets = {}

    # action_counts has the same shape: information set -> {action: count}.
    action_counts = {}

    # strategy_t is the strategy at time t; strategy_t_1 is the strategy at
    # time t + 1, which cfr_recursive fills in.
    strategy_t = {}
    strategy_t_1 = {}

    average_strategy = None
    average_strategy_snapshot = None

    for t in range(num_iters):
        for i in (1, 2):
            cfr_recursive(game, game.game.root, i, t, 1.0, 1.0, regrets,
                          action_counts, strategy_t, strategy_t_1)

        # Checkpoint. average_strategy here is the one computed at the end
        # of the previous iteration, so it is still None when t == 0.
        if (t % 100 == 0) and (average_strategy is not None):
            print("t: {}".format(t))
            if average_strategy_snapshot is not None:
                snapshot_distance = compare_strategies(
                    average_strategy, average_strategy_snapshot)
                print("Distance between strategies (t - 100): {}".format(
                    snapshot_distance))

                # A small distance between this checkpoint and the previous
                # one is used as the convergence criterion; it is hopefully
                # sufficient, not a proof of convergence.
                if snapshot_distance < 1e-5:
                    return average_strategy

            average_strategy_snapshot = average_strategy.copy()

        average_strategy = compute_average_strategy(action_counts)

        # Advance time: strategy_t becomes a copy of strategy_t_1 so that
        # the next round of updates to strategy_t_1 does not rebind it.
        # NOTE(review): .copy() is shallow; if cfr_recursive mutates nested
        # per-information-set containers in place they would be shared
        # between the two dictionaries -- confirm against cfr_recursive.
        strategy_t = strategy_t_1.copy()

        if t % 1000 == 0:
            # Also report the best-response value against the current
            # strategy, after filling in any missing information sets.
            complete_strategy = game.game.complete_strategy_randomly(
                strategy_t)
            exploitability = best_response.compute_exploitability(
                game.game, complete_strategy)
            print("Exploitability: {}".format(exploitability))

    return average_strategy
def test_strategy_always_raises_on_leduc():
    """Exploitability of the constant-action-2 strategy on a 3-card game.

    Both players use ``constant_action(..., 2)``, which the original notes
    describe as "always raises".
    """
    leduc_game = leduc.Leduc.create_game(3)

    # Build player 1's strategy, then merge player 2's into the same dict.
    always_raise = example_strategy.constant_action(leduc_game, 1, 2)
    always_raise.update(example_strategy.constant_action(leduc_game, 2, 2))

    measured = best_response.compute_exploitability(leduc_game, always_raise)

    # Regression value: expected only because it is consistently computed.
    # NOTE(review): this constant is ~1e-10 away from 11.6, i.e. right at
    # the tolerance below -- confirm the reference value has enough digits.
    reference = 11.5999999999
    tolerance = 1e-10
    assert abs(measured - reference) < tolerance
def test_strategy_always_calls_on_leduc():
    """Exploitability of the constant-action-1 strategy on a 3-card game.

    Both players use ``constant_action(..., 1)``, which the original notes
    describe as "always calls".
    """
    leduc_game = leduc.Leduc.create_game(3)

    # Build player 1's strategy, then merge player 2's into the same dict.
    always_call = example_strategy.constant_action(leduc_game, 1, 1)
    always_call.update(example_strategy.constant_action(leduc_game, 2, 1))

    measured = best_response.compute_exploitability(leduc_game, always_call)

    # Regression value: expected only because it is consistently computed.
    reference = 4.266666666666
    tolerance = 1e-10
    assert abs(measured - reference) < tolerance
def test_strategy_random_on_leduc():
    """Exploitability of a seeded random strategy on a 3-card game.

    NumPy's global generator is seeded with 2 after the game is created and
    before either player's strategy is drawn, so the result is repeatable.
    """
    leduc_game = leduc.Leduc.create_game(3)

    # Fix the seed so the random strategies are reproducible.
    np.random.seed(2)

    # Draw player 1's random strategy first, then merge in player 2's; the
    # order matters because both draws follow the seed call above.
    random_profile = example_strategy.random_strategy(leduc_game, 1)
    random_profile.update(example_strategy.random_strategy(leduc_game, 2))

    measured = best_response.compute_exploitability(
        leduc_game, random_profile)

    # Regression value: expected only because it is consistently computed.
    reference = 5.297170632756803
    tolerance = 1e-10
    assert abs(measured - reference) < tolerance