Ejemplo n.º 1
0
def cfr(game, num_iters=10000):
    """Run counterfactual regret minimisation and return the average strategy.

    Every iteration calls ``cfr_recursive`` from the game root once for each
    player (1 and 2) and then recomputes the average strategy from the
    accumulated action counts.  Every 100 iterations the average strategy is
    compared with the snapshot taken 100 iterations earlier; the function
    returns early once that distance falls below 1e-5.  Every 1000 iterations
    the exploitability of the current (not the average) strategy is printed.

    Args:
        game: Object whose ``game`` attribute provides ``root`` and
            ``complete_strategy_randomly`` and is accepted by
            ``best_response.compute_exploitability``.
        num_iters: Maximum number of iterations to run.

    Returns:
        The value last returned by ``compute_average_strategy``, or ``None``
        when no iteration is executed (e.g. ``num_iters == 0``).
    """
    # regrets is a dictionary where the keys are the information sets and
    # values are dictionaries from actions available in that information set
    # to the counterfactual regret for not playing that action in that
    # information set.  Since information sets encode the player, we only
    # require one dictionary.
    regrets = {}

    # Similarly, action_counts is a dictionary with keys the information sets
    # and values dictionaries from actions to action counts.
    action_counts = {}

    # strategy_t holds the strategy at time t; strategy_t_1 holds the strategy
    # at time t + 1 and is filled in by cfr_recursive.
    strategy_t = {}
    strategy_t_1 = {}

    average_strategy = None
    average_strategy_snapshot = None

    for t in range(num_iters):
        for player in (1, 2):
            # The two 1.0 arguments are presumably the initial reach
            # probabilities -- confirm against cfr_recursive's signature.
            cfr_recursive(game, game.game.root, player, t, 1.0, 1.0, regrets,
                          action_counts, strategy_t, strategy_t_1)

        # average_strategy still refers to the value computed at the end of
        # the previous iteration here, so nothing is reported at t == 0.
        if (t % 100 == 0) and (average_strategy is not None):
            print("t: {}".format(t))
            if average_strategy_snapshot is not None:
                snapshot_distance = compare_strategies(
                    average_strategy, average_strategy_snapshot)
                print("Distance between strategies (t - 100): {}".format(
                    snapshot_distance))

                # A small distance between the average strategy now and the
                # one from 100 iterations ago is used as the convergence
                # criterion.
                if snapshot_distance < 1e-5:
                    return average_strategy

            average_strategy_snapshot = average_strategy.copy()
        average_strategy = compute_average_strategy(action_counts)

        # Advance the strategy.  A copy is taken because cfr_recursive keeps
        # writing into strategy_t_1 while strategy_t must stay fixed for the
        # next iteration.
        strategy_t = strategy_t_1.copy()

        if t % 1000 == 0:
            # Report how exploitable the current strategy is once it has been
            # completed by complete_strategy_randomly.
            complete_strategy = game.game.complete_strategy_randomly(strategy_t)
            exploitability = best_response.compute_exploitability(
                game.game, complete_strategy)
            print("Exploitability: {}".format(exploitability))

    return average_strategy
Ejemplo n.º 2
0
def test_strategy_always_raises_on_leduc():
    """Regression test for the exploitability of the always-raise profile.

    Uses the Leduc game built by ``leduc.Leduc.create_game(3)``.
    """
    leduc_game = leduc.Leduc.create_game(3)

    # Combine the constant strategies of player 1 and player 2 into a single
    # profile; action 2 is the one this test treats as "raise".
    always_raise = example_strategy.constant_action(leduc_game, 1, 2)
    second_player_part = example_strategy.constant_action(leduc_game, 2, 2)
    always_raise.update(second_player_part)

    measured = best_response.compute_exploitability(leduc_game, always_raise)

    # The reference is not derived analytically: it pins the value that the
    # implementation has consistently produced.
    reference = 11.5999999999
    tolerance = 1e-10
    deviation = abs(measured - reference)
    assert deviation < tolerance
Ejemplo n.º 3
0
def test_strategy_always_calls_on_leduc():
    """Regression test for the exploitability of the always-call profile.

    Uses the Leduc game built by ``leduc.Leduc.create_game(3)``.
    """
    leduc_game = leduc.Leduc.create_game(3)

    # Combine the constant strategies of player 1 and player 2 into a single
    # profile; action 1 is the one this test treats as "call".
    always_call = example_strategy.constant_action(leduc_game, 1, 1)
    second_player_part = example_strategy.constant_action(leduc_game, 2, 1)
    always_call.update(second_player_part)

    measured = best_response.compute_exploitability(leduc_game, always_call)

    # The reference is not derived analytically: it pins the value that the
    # implementation has consistently produced.
    reference = 4.266666666666
    tolerance = 1e-10
    deviation = abs(measured - reference)
    assert deviation < tolerance
Ejemplo n.º 4
0
def test_strategy_random_on_leduc():
    """Regression test for the exploitability of a seeded random profile.

    Uses the Leduc game built by ``leduc.Leduc.create_game(3)``.
    """
    leduc_game = leduc.Leduc.create_game(3)

    # Fix NumPy's global seed before any strategy is generated so the expected
    # value below stays reproducible.
    np.random.seed(2)

    # Generate a random strategy for player 1 first and player 2 second, then
    # merge them into one profile.  The generation order is kept as-is because
    # the reference value below was recorded with it.
    random_profile = example_strategy.random_strategy(leduc_game, 1)
    second_player_part = example_strategy.random_strategy(leduc_game, 2)
    random_profile.update(second_player_part)

    measured = best_response.compute_exploitability(leduc_game, random_profile)

    # The reference is not derived analytically: it pins the value that the
    # implementation has consistently produced for this seed.
    reference = 5.297170632756803
    tolerance = 1e-10
    deviation = abs(measured - reference)
    assert deviation < tolerance