Example #1
    def test_sample_chance_action(self):

        # Check we get an exception if the node isn't player 0.
        node = extensive_game.ExtensiveGameNode(player=1,
                                                action_list=(1, 2, 3),
                                                hidden_from={3})
        with self.assertRaises(ValueError):
            cfr_game.sample_chance_action(node)

        # Check we can sample a chance action.
        node = extensive_game.ExtensiveGameNode(player=0,
                                                action_list=(1, 2, 3),
                                                hidden_from={3})
Example #2
def cfr_traverse(game: extensive_game.ExtensiveGame, action_indexer: neural_game.ActionIndexer,
                 info_set_vectoriser: neural_game.InfoSetVectoriser,
                 node: extensive_game.ExtensiveGameNode, player: int,
                 network1: RegretPredictor, network2: RegretPredictor,
                 advantage_memory1: buffer.Reservoir, advantage_memory2: buffer.Reservoir,
                 strategy_memory: buffer.Reservoir, t: int):
    """

    Args:
        game: ExtensiveGame.
        action_indexer: ActionIndexer. This maps actions to indices, so that we can use neural networks.
        info_set_vectoriser: InfoSetVectoriser. This maps information sets to vectors, so we can use neural networks.
        node: ExtensiveGameNode. The current node.
        player: int. The traversing player. Either 1 or 2.
        network1: RegretPredictor. The network for player 1.
        network2: RegretPredictor. The network for player 2.
        advantage_memory1: Reservoir. The advantage memory for player 1.
        advantage_memory2: Reservoir. The advantage memory for player 2.
        strategy_memory: Reservoir. The strategy memory (for both players).
        t: int. The current iteration of deep cfr.

    Returns:
        The sampled counterfactual value of this node for the traversing player.

    """
    if is_terminal(node):
        return payoffs(node)[player]
    elif which_player(node) == 0:
        # Chance player
        a = sample_chance_action(node)
        return cfr_traverse(game, action_indexer, info_set_vectoriser, node.children[a], player,
                            network1, network2,
                            advantage_memory1, advantage_memory2, strategy_memory, t)
    elif which_player(node) == player:
        # It's the traversing player's turn.
        state_vector = info_set_vectoriser.get_vector(game.get_info_set_id(node))
        values = dict()
        for action in get_available_actions(node):
            child = node.children[action]
            values[action] = cfr_traverse(game, action_indexer, info_set_vectoriser, child, player,
                                          network1, network2,
                                          advantage_memory1, advantage_memory2, strategy_memory, t)
            assert values[action] is not None, "Shouldn't be None! node was: {}".format(node)
        info_set_regrets = dict()

        # Compute the player's strategy
        network = network1 if player == 1 else network2
        if t == 1:
            # This is the equivalent of initialising the network so it starts with all zeroes.
            info_set_strategy = extensive_game.ActionFloat.initialise_uniform(action_indexer.actions)
        else:
            info_set_strategy = network.compute_action_probs(state_vector, action_indexer)

        sampled_counterfactual_value = sum(
            info_set_strategy[action] * values[action]
            for action in get_available_actions(node)
        )
        for action in get_available_actions(node):
            info_set_regrets[action] = values[action] - sampled_counterfactual_value

        info_set_id = game.info_set_ids[node]
        advantage_memory = advantage_memory1 if player == 1 else advantage_memory2
        advantage_memory.append(AdvantageMemoryElement(info_set_id, t, info_set_regrets))

        # In traverser infosets, the value passed back up is the weighted average of all action values,
        # where action a’s weight is info_set_strategy[a]
        return sampled_counterfactual_value
    else:
        # It's the other player's turn.
        state_vector = info_set_vectoriser.get_vector(game.get_info_set_id(node))

        # Compute the other player's strategy
        other_player = 1 if player == 2 else 2
        network = network1 if other_player == 1 else network2
        if t == 1:
            # This is the equivalent of initialising the network so it starts with all zeroes.
            info_set_strategy = extensive_game.ActionFloat.initialise_uniform(action_indexer.actions)
        else:
            info_set_strategy = network.compute_action_probs(state_vector, action_indexer)

        info_set_id = game.info_set_ids[node]
        strategy_memory.append(StrategyMemoryElement(info_set_id, t, info_set_strategy))

        action = sample_action(info_set_strategy, available_actions=get_available_actions(node))
        return cfr_traverse(game, action_indexer, info_set_vectoriser, node.children[action], player,
                            network1, network2, advantage_memory1, advantage_memory2, strategy_memory, t)
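
A minimal usage sketch (not part of the example above): how cfr_traverse might be driven from a deep CFR outer loop. It assumes the game exposes its root node as game.root_node; num_iterations and traversals_per_iteration are illustrative parameters, and the retraining of the advantage networks between iterations is omitted.

def deep_cfr_outer_loop_sketch(game, action_indexer, info_set_vectoriser,
                               network1, network2,
                               advantage_memory1, advantage_memory2, strategy_memory,
                               num_iterations=10, traversals_per_iteration=100):
    # Alternate the traversing player and run several traversals per iteration,
    # filling the advantage and strategy memories via cfr_traverse above.
    for t in range(1, num_iterations + 1):
        for player in [1, 2]:
            for _ in range(traversals_per_iteration):
                cfr_traverse(game, action_indexer, info_set_vectoriser,
                             game.root_node, player,  # assumption: root node attribute
                             network1, network2,
                             advantage_memory1, advantage_memory2, strategy_memory, t)
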
Example #3
def cfr_recursive(game,
                  node,
                  i,
                  t,
                  pi_c,
                  pi_1,
                  pi_2,
                  regrets,
                  action_counts,
                  strategy_t,
                  strategy_t_1,
                  use_chance_sampling=False):
    # If the node is terminal, just return the payoffs
    if is_terminal(node):
        return payoffs(node)[i]
    # If the next player is chance, then sample the chance action
    elif which_player(node) == 0:
        if use_chance_sampling:
            a = sample_chance_action(node)
            return cfr_recursive(game,
                                 node.children[a],
                                 i,
                                 t,
                                 pi_c,
                                 pi_1,
                                 pi_2,
                                 regrets,
                                 action_counts,
                                 strategy_t,
                                 strategy_t_1,
                                 use_chance_sampling=use_chance_sampling)
        else:
            value = 0
            for a, cp in node.chance_probs.items():
                value += cp * cfr_recursive(
                    game,
                    node.children[a],
                    i,
                    t,
                    cp * pi_c,
                    pi_1,
                    pi_2,
                    regrets,
                    action_counts,
                    strategy_t,
                    strategy_t_1,
                    use_chance_sampling=use_chance_sampling)
            return value

    # Get the information set
    information_set = get_information_set(game, node)

    # Get the player to play and initialise values
    player = which_player(node)
    value = 0
    available_actions = get_available_actions(node)
    values_Itoa = {a: 0 for a in available_actions}

    # Initialise strategy_t[information_set] to the uniform strategy if not already set.
    if information_set not in strategy_t:
        strategy_t[information_set] = {
            a: 1.0 / float(len(available_actions))
            for a in available_actions
        }

    # Compute the counterfactual value of this information set by computing the counterfactual
    # value of the information sets where the player plays each available action and taking
    # the expected value (by weighting by the strategy).
    for a in available_actions:
        if player == 1:
            values_Itoa[a] = cfr_recursive(
                game,
                node.children[a],
                i,
                t,
                pi_c,
                strategy_t[information_set][a] * pi_1,
                pi_2,
                regrets,
                action_counts,
                strategy_t,
                strategy_t_1,
                use_chance_sampling=use_chance_sampling)
        else:
            values_Itoa[a] = cfr_recursive(
                game,
                node.children[a],
                i,
                t,
                pi_c,
                pi_1,
                strategy_t[information_set][a] * pi_2,
                regrets,
                action_counts,
                strategy_t,
                strategy_t_1,
                use_chance_sampling=use_chance_sampling)
        value += strategy_t[information_set][a] * values_Itoa[a]

    # Update regrets now that we have computed the counterfactual value of the
    # information set as well as the counterfactual values of playing each
    # action in the information set. First initialise regrets with this
    # information set if necessary.
    if information_set not in regrets:
        regrets[information_set] = {ad: 0.0 for ad in available_actions}
    if player == i:
        pi_minus_i = pi_c * pi_1 if i == 2 else pi_c * pi_2
        pi_i = pi_1 if i == 1 else pi_2
        if information_set not in action_counts:
            action_counts[information_set] = {ad: 0.0 for ad in available_actions}
        for a in available_actions:
            regrets[information_set][a] += (values_Itoa[a] - value) * pi_minus_i
            action_counts[information_set][a] += pi_c * pi_i * strategy_t[information_set][a]

        # Update strategy t plus 1
        strategy_t_1[information_set] = compute_regret_matching(
            regrets[information_set])

    # Return the value
    return value
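
Example #3 updates the next strategy via compute_regret_matching, which is not shown above. A minimal sketch of standard regret matching over the plain action -> regret dictionaries used here (compute_regret_matching_sketch is an illustrative name, not the repository's implementation):

def compute_regret_matching_sketch(info_set_regrets):
    # Play each action in proportion to its positive cumulative regret; if no action
    # has positive regret, fall back to the uniform strategy.
    positive = {a: max(r, 0.0) for a, r in info_set_regrets.items()}
    total = sum(positive.values())
    if total > 0.0:
        return {a: r / total for a, r in positive.items()}
    return {a: 1.0 / len(info_set_regrets) for a in info_set_regrets}
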
Example #4
def external_sampling_cfr_recursive(
    game: extensive_game.ExtensiveGame,
    node: extensive_game.ExtensiveGameNode,
    player: int,
    regrets: Dict,
    strategy_t: extensive_game.Strategy,
    strategy_t_1: extensive_game.Strategy,
    cfr_state: cfr_util.CFRState,
):
    """
    Computes the 'expected player utility' sum_{z in Q and Z_I} pi_i^sigma (z[I], z) u_i(z). Samples the actions of
    chance nodes and the nodes of the other players. Accumulates the immediate sampled counterfactual regret:

    rtilde(I, a) = sum_{z in Q and Z_I} u_i(z) (pi_i^sigma(z[I]a, z) - pi_i^sigma(z[I], z)).

    Args:
        game: ExtensiveGame.
        node: ExtensiveGameNode. The current node.
        player: int. The traversing player.
        regrets: Dict. The cumulative regrets for each information set.
        strategy_t: the strategy used at time t. We don't update this one.
        strategy_t_1: the strategy to use at time t + 1. We update this one in this function call.
        cfr_state: general state about CFR progress.

    Returns:
        expected_player_utility
    """
    cfr_state.node_touched()
    if node.player == -1:
        # Terminal node. Just return the utility to the player.
        return node.utility[player]
    elif node.player == 0:
        # Chance player. We sample an action and then return the expected utility for that action.
        a = cfr_game.sample_chance_action(node)
        return external_sampling_cfr_recursive(
            game,
            node.children[a],
            player,
            regrets,
            strategy_t,
            strategy_t_1,
            cfr_state,
        )
    elif node.player == player:
        # Return sum_{z in Q and Z_I} pi_i^sigma (z[I], z) u_i(z)

        expected_utilities = dict()
        action_probs = dict()
        information_set = cfr_game.get_information_set(game, node)
        expected_utility = 0.0
        if information_set not in strategy_t.get_info_sets():
            strategy_t.set_uniform_action_probs(information_set,
                                                list(node.children.keys()))

        immediate_regrets = dict()
        for action, child in node.children.items():
            expected_utilities[action] = external_sampling_cfr_recursive(
                game, child, player, regrets, strategy_t, strategy_t_1,
                cfr_state)
            action_probs[action] = strategy_t[information_set][action]

            expected_utility += action_probs[action] * expected_utilities[action]

        for action in node.children:
            immediate_regrets[action] = expected_utilities[action] - expected_utility

        if information_set not in regrets:
            regrets[information_set] = extensive_game.ActionFloat(
                immediate_regrets)
        else:
            regrets[information_set] = extensive_game.ActionFloat.sum(
                regrets[information_set],
                extensive_game.ActionFloat(immediate_regrets))

        # Update the strategy for the next iteration
        strategy_t_1[information_set] = cfr_util.compute_regret_matching(
            regrets[information_set])

        return expected_utility
    else:
        # It is the other player. Sample an action and return the value.
        information_set = cfr_game.get_information_set(game, node)
        if information_set not in strategy_t.get_info_sets():
            strategy_t.set_uniform_action_probs(information_set,
                                                list(node.children.keys()))

        a = cfr_game.sample_action(strategy_t[information_set])
        return external_sampling_cfr_recursive(
            game,
            node.children[a],
            player,
            regrets,
            strategy_t,
            strategy_t_1,
            cfr_state,
        )
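
Example #4 samples the other player's action with cfr_game.sample_action, which is not shown. A minimal sketch of such a sampler over an action -> probability mapping (sample_action_sketch is an illustrative name, not the repository's function):

import random

def sample_action_sketch(action_probs):
    # Draw one action according to the given action -> probability mapping.
    actions = list(action_probs)
    weights = [action_probs[a] for a in actions]
    return random.choices(actions, weights=weights, k=1)[0]
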
Example #5
def cfr_recursive(game, node, i, t, pi_c, pi_1, pi_2, regrets: typing.Dict[typing.Any, ActionFloat],
                  action_counts, strategy_t, strategy_t_1, cfr_state: cfr_util.CFRState,
                  use_chance_sampling=False, weight=1.0):
    cfr_state.node_touched()
    # If the node is terminal, just return the payoffs
    if is_terminal(node):
        return payoffs(node)[i]
    # If the next player is chance, then sample the chance action
    elif which_player(node) == 0:
        if use_chance_sampling:
            a = sample_chance_action(node)
            return cfr_recursive(
                game, node.children[a], i, t, pi_c, pi_1, pi_2,
                regrets, action_counts, strategy_t, strategy_t_1,
                cfr_state,
                use_chance_sampling=use_chance_sampling,
                weight=weight,
            )
        else:
            value = 0
            for a, cp in node.chance_probs.items():
                value += cp * cfr_recursive(
                    game, node.children[a], i, t, cp * pi_c, pi_1, pi_2,
                    regrets, action_counts, strategy_t, strategy_t_1,
                    cfr_state,
                    use_chance_sampling=use_chance_sampling,
                    weight=weight,
                )
            return value

    # Get the information set
    information_set = get_information_set(game, node)

    # Get the player to play and initialise values
    player = which_player(node)
    value = 0
    available_actions = get_available_actions(node)
    values_Itoa = {a: 0 for a in available_actions}

    # Initialise strategy_t[information_set] to the uniform strategy if not already set.
    if information_set not in strategy_t.get_info_sets():
        strategy_t.set_uniform_action_probs(information_set, available_actions)

    # Compute the counterfactual value of this information set by computing the counterfactual
    # value of the information sets where the player plays each available action and taking
    # the expected value (by weighting by the strategy).
    for a in available_actions:
        if player == 1:
            values_Itoa[a] = cfr_recursive(
                game, node.children[a], i, t, pi_c,
                strategy_t.get_action_probs(information_set)[a] * pi_1, pi_2,
                regrets, action_counts, strategy_t, strategy_t_1,
                cfr_state,
                use_chance_sampling=use_chance_sampling,
                weight=weight,
            )
        else:
            values_Itoa[a] = cfr_recursive(
                game, node.children[a], i, t, pi_c,
                pi_1, strategy_t[information_set][a] * pi_2,
                regrets, action_counts, strategy_t, strategy_t_1,
                cfr_state,
                use_chance_sampling=use_chance_sampling,
                weight=weight
            )
        value += strategy_t[information_set][a] * values_Itoa[a]

    # Update regrets now that we have computed the counterfactual value of the
    # information set as well as the counterfactual values of playing each
    # action in the information set. First initialise regrets with this
    # information set if necessary.
    if information_set not in regrets:
        regrets[information_set] = ActionFloat.initialise_zero(available_actions)
    if player == i:
        if information_set not in action_counts:
            action_counts[information_set] = ActionFloat.initialise_zero(available_actions)

        action_counts_to_add = {a: 0.0 for a in available_actions}
        regrets_to_add = {a: 0.0 for a in available_actions}
        for a in available_actions:
            pi_minus_i = pi_c * pi_1 if i == 2 else pi_c * pi_2
            pi_i = pi_1 if i == 1 else pi_2
            regrets_to_add[a] = weight * (values_Itoa[a] - value) * pi_minus_i
            # action_counts_to_add[a] = pi_c * pi_i * strategy_t[information_set][a]
            action_counts_to_add[a] = weight * pi_i * strategy_t[information_set][a]

        # Update the regrets and action counts.
        regrets[information_set] = ActionFloat.sum(regrets[information_set], ActionFloat(regrets_to_add))
        action_counts[information_set] = ActionFloat.sum(
            action_counts[information_set],
            ActionFloat(action_counts_to_add)
        )

        # Update strategy t plus 1
        strategy_t_1[information_set] = cfr_util.compute_regret_matching(regrets[information_set])

    # Return the value
    return value
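
The action_counts accumulated in Examples #3 and #5 are what the average strategy is recovered from, and CFR's convergence guarantee applies to that average rather than to the final strategy_t. A minimal sketch that normalises plain action -> count dictionaries, as used in Example #3 (the function name is illustrative):

def compute_average_strategy_sketch(action_counts):
    # Normalise each information set's accumulated action counts into a probability
    # distribution, falling back to uniform if no mass was accumulated.
    average_strategy = {}
    for information_set, counts in action_counts.items():
        total = sum(counts.values())
        if total > 0.0:
            average_strategy[information_set] = {a: c / total for a, c in counts.items()}
        else:
            average_strategy[information_set] = {a: 1.0 / len(counts) for a in counts}
    return average_strategy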