Example no. 1
def cfr_traverse(game: extensive_game.ExtensiveGame, action_indexer: neural_game.ActionIndexer,
                 info_set_vectoriser: neural_game.InfoSetVectoriser,
                 node: extensive_game.ExtensiveGameNode, player: int,
                 network1: RegretPredictor, network2: RegretPredictor,
                 advantage_memory1: buffer.Reservoir, advantage_memory2: buffer.Reservoir,
                 strategy_memory: buffer.Reservoir, t: int):
    """

    Args:
        game: ExtensiveGame.
        action_indexer: ActionIndexer. This maps actions to indices, so that we can use neural networks.
        info_set_vectoriser: InfoSetVectoriser. This maps information sets to vectors, so we can use neural networks.
        node: ExtensiveGameNode. The current node.
        player: int. The traversing player. Either 1 or 2.
        network1: RegretPredictor. The network for player 1.
        network2: RegretPredictor. The network for player 2.
        advantage_memory1: Reservoir. The advantage memory for player 1.
        advantage_memory2: Reservoir. The advantage memory for player 2.
        strategy_memory: Reservoir. The strategy memory (for both players).
        t: int. The current iteration of deep cfr.

    Returns:
        float. The sampled counterfactual value of the node for the traversing player, i.e. the
        strategy-weighted average of the sampled action values.

    """
    if is_terminal(node):
        return payoffs(node)[player]
    elif which_player(node) == 0:
        # Chance player
        a = sample_chance_action(node)
        return cfr_traverse(game, action_indexer, info_set_vectoriser, node.children[a], player,
                            network1, network2,
                            advantage_memory1, advantage_memory2, strategy_memory, t)
    elif which_player(node) == player:
        # It's the traversing player's turn.
        state_vector = info_set_vectoriser.get_vector(game.get_info_set_id(node))
        values = dict()
        for action in get_available_actions(node):
            child = node.children[action]
            values[action] = cfr_traverse(game, action_indexer, info_set_vectoriser, child, player,
                                          network1, network2,
                                          advantage_memory1, advantage_memory2, strategy_memory, t)
            assert values[action] is not None, "Shouldn't be None! node was: {}".format(node)
        info_set_regrets = dict()

        # Compute the player's strategy
        network = network1 if player == 1 else network2
        if t == 1:
            # On the first iteration the network is untrained, so play the uniform strategy (equivalent
            # to regret matching over all-zero regrets).
            info_set_strategy = extensive_game.ActionFloat.initialise_uniform(action_indexer.actions)
        else:
            info_set_strategy = network.compute_action_probs(state_vector, action_indexer)

        sampled_counterfactual_value = sum(
            info_set_strategy[action] * values[action] for action in get_available_actions(node))
        for action in get_available_actions(node):
            info_set_regrets[action] = values[action] - sampled_counterfactual_value

        info_set_id = game.info_set_ids[node]
        advantage_memory = advantage_memory1 if player == 1 else advantage_memory2
        advantage_memory.append(AdvantageMemoryElement(info_set_id, t, info_set_regrets))

        # In traverser infosets, the value passed back up is the weighted average of all action values,
        # where action a’s weight is info_set_strategy[a]
        return sampled_counterfactual_value
    else:
        # It's the other player's turn.
        state_vector = info_set_vectoriser.get_vector(game.get_info_set_id(node))

        # Compute the other player's strategy
        other_player = 1 if player == 2 else 2
        network = network1 if other_player == 1 else network2
        if t == 1:
            # On the first iteration the network is untrained, so play the uniform strategy (equivalent
            # to regret matching over all-zero regrets).
            info_set_strategy = extensive_game.ActionFloat.initialise_uniform(action_indexer.actions)
        else:
            info_set_strategy = network.compute_action_probs(state_vector, action_indexer)

        info_set_id = game.info_set_ids[node]
        strategy_memory.append(StrategyMemoryElement(info_set_id, t, info_set_strategy))

        action = sample_action(info_set_strategy, available_actions=get_available_actions(node))
        return cfr_traverse(game, action_indexer, info_set_vectoriser, node.children[action], player,
                            network1, network2, advantage_memory1, advantage_memory2, strategy_memory, t)
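
To make the advantage computation in the traverser branch above concrete, here is a minimal, self-contained sketch: given per-action sampled values and the current strategy at an information set, it computes the sampled counterfactual value and the instantaneous advantages that would be written to the advantage memory. Plain dicts and the hypothetical traverser_advantages helper stand in for the ActionFloat, Reservoir and AdvantageMemoryElement classes used in the example.

# A minimal sketch of the advantage computation at a traverser infoset in cfr_traverse above.
# Plain dicts stand in for the ActionFloat / Reservoir / AdvantageMemoryElement classes.
from typing import Dict


def traverser_advantages(values: Dict[str, float], strategy: Dict[str, float]) -> Dict[str, float]:
    """Returns each action's instantaneous advantage: its value minus the node's strategy value."""
    # The sampled counterfactual value is the strategy-weighted average of the action values.
    node_value = sum(strategy[a] * values[a] for a in values)
    return {a: values[a] - node_value for a in values}


if __name__ == "__main__":
    values = {"fold": -1.0, "call": 0.5, "raise": 2.0}
    strategy = {"fold": 0.25, "call": 0.5, "raise": 0.25}
    # node_value = 0.5, so the advantages are {'fold': -1.5, 'call': 0.0, 'raise': 1.5}.
    print(traverser_advantages(values, strategy))
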
Example no. 2
def cfr_recursive(game,
                  node,
                  i,
                  t,
                  pi_c,
                  pi_1,
                  pi_2,
                  regrets,
                  action_counts,
                  strategy_t,
                  strategy_t_1,
                  use_chance_sampling=False):
    # If the node is terminal, just return the payoffs
    if is_terminal(node):
        return payoffs(node)[i]
    # If the next player is chance, then sample the chance action
    elif which_player(node) == 0:
        if use_chance_sampling:
            a = sample_chance_action(node)
            return cfr_recursive(game,
                                 node.children[a],
                                 i,
                                 t,
                                 pi_c,
                                 pi_1,
                                 pi_2,
                                 regrets,
                                 action_counts,
                                 strategy_t,
                                 strategy_t_1,
                                 use_chance_sampling=use_chance_sampling)
        else:
            value = 0
            for a, cp in node.chance_probs.items():
                value += cp * cfr_recursive(
                    game,
                    node.children[a],
                    i,
                    t,
                    cp * pi_c,
                    pi_1,
                    pi_2,
                    regrets,
                    action_counts,
                    strategy_t,
                    strategy_t_1,
                    use_chance_sampling=use_chance_sampling)
            return value

    # Get the information set
    information_set = get_information_set(game, node)

    # Get the player to play and initialise values
    player = which_player(node)
    value = 0
    available_actions = get_available_actions(node)
    values_Itoa = {a: 0 for a in available_actions}

    # Initialise strategy_t[information_set] to the uniform strategy if this information set hasn't been seen yet.
    if information_set not in strategy_t:
        strategy_t[information_set] = {
            a: 1.0 / float(len(available_actions))
            for a in available_actions
        }

    # Compute the counterfactual value of this information set by computing the counterfactual
    # value of the information sets where the player plays each available action and taking
    # the expected value (by weighting by the strategy).
    for a in available_actions:
        if player == 1:
            values_Itoa[a] = cfr_recursive(
                game,
                node.children[a],
                i,
                t,
                pi_c,
                strategy_t[information_set][a] * pi_1,
                pi_2,
                regrets,
                action_counts,
                strategy_t,
                strategy_t_1,
                use_chance_sampling=use_chance_sampling)
        else:
            values_Itoa[a] = cfr_recursive(
                game,
                node.children[a],
                i,
                t,
                pi_c,
                pi_1,
                strategy_t[information_set][a] * pi_2,
                regrets,
                action_counts,
                strategy_t,
                strategy_t_1,
                use_chance_sampling=use_chance_sampling)
        value += strategy_t[information_set][a] * values_Itoa[a]

    # Update regrets now that we have computed the counterfactual value of the
    # information set as well as the counterfactual values of playing each
    # action in the information set. First initialise regrets with this
    # information set if necessary.
    if information_set not in regrets:
        regrets[information_set] = {ad: 0.0 for ad in available_actions}
    if player == i:
        if information_set not in action_counts:
            action_counts[information_set] = {ad: 0.0 for ad in available_actions}
        pi_i = pi_1 if i == 1 else pi_2
        pi_minus_i = pi_c * pi_2 if i == 1 else pi_c * pi_1
        for a in available_actions:
            regrets[information_set][a] += (values_Itoa[a] - value) * pi_minus_i
            action_counts[information_set][a] += pi_c * pi_i * strategy_t[information_set][a]

        # Update strategy t plus 1
        strategy_t_1[information_set] = compute_regret_matching(
            regrets[information_set])

    # Return the value
    return value
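
The strategy_t_1 update above relies on compute_regret_matching, whose definition is not part of this example. The sketch below is a standard regret-matching implementation under the assumption that it maps one information set's cumulative regrets to action probabilities: positive regrets are normalised, and the uniform strategy is used when no regret is positive.

# A standard regret-matching sketch, assuming compute_regret_matching maps a dict of cumulative
# regrets for one information set to a dict of action probabilities.
from typing import Dict


def regret_matching_sketch(regrets: Dict[str, float]) -> Dict[str, float]:
    # Keep only the positive part of each regret.
    positive = {a: max(r, 0.0) for a, r in regrets.items()}
    total = sum(positive.values())
    if total > 0.0:
        # Play each action in proportion to its positive regret.
        return {a: r / total for a, r in positive.items()}
    # No positive regret: fall back to the uniform strategy.
    return {a: 1.0 / len(regrets) for a in regrets}


if __name__ == "__main__":
    # Prints {'fold': 0.0, 'call': 0.25, 'raise': 0.75}.
    print(regret_matching_sketch({"fold": -2.0, "call": 1.0, "raise": 3.0}))
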
Example no. 3
def compute_node_regret_recursive(
    game: extensive_game.ExtensiveGame,
    node: extensive_game.ExtensiveGameNode,
    strategy: extensive_game.Strategy,
    node_regrets: collections.defaultdict,
    pi_1: float,
    pi_2: float,
    pi_c: float,
):
    """
    Computes the immediate counterfactual regret at each node for each player. This is defined as:
        regret_i(sigma, h, a) = u_i(sigma, ha) - u_i(sigma, h),
    where h is a player i node and u_i(sigma, h) is the counterfactual value to player i of being in node h,
    given that the strategy profile is sigma. Formally,
        u_i(sigma, h) = pi_{-i}^sigma(h) \sum_{z in Z_h} pi^sigma(h, z) v_i(z),
    where
        pi_{-i}^sigma(h) is the product of the reach probabilities of h for all players other than i
        (including chance), and v_i(z) is the utility to player i of the terminal node z.

    Args:
        game: ExtensiveGame.
        node: ExtensiveGameNode.
        strategy: strategy for both players.
        node_regrets: defaultdict mapping each node to a dictionary mapping each action to the immediate
            counterfactual regret, for the player to act at that node, of not playing that action.
        pi_1: float. The reach probability of the node for player 1.
        pi_2: float. The reach probability of the node for player 2.
        pi_c: float. The reach probability of the node for the chance player.

    Returns:
        v1: the expected utility sum_{z in Z_h} u_1(z) pi^sigma(h, z).
        v2: the expected utility sum_{z in Z_h} u_2(z) pi^sigma(h, z).
    """
    node_player = cfr_game.which_player(node)
    if cfr_game.is_terminal(node):
        return node.utility[1], node.utility[2]
    elif node_player in [1, 2]:
        v1 = 0.0
        v2 = 0.0
        information_set = cfr_game.get_information_set(game, node)
        values_1 = dict()
        values_2 = dict()
        for action, child in node.children.items():
            pi_1_new = pi_1 * strategy[information_set][action] if node_player == 1 else pi_1
            pi_2_new = pi_2 * strategy[information_set][action] if node_player == 2 else pi_2
            values_1[action], values_2[action] = compute_node_regret_recursive(
                game,
                child,
                strategy,
                node_regrets,
                pi_1_new,
                pi_2_new,
                pi_c,
            )
            action_prob = strategy[information_set][action]
            v1 += action_prob * values_1[action]
            v2 += action_prob * values_2[action]

        # Compute the immediate regret for the player in the node h for not playing each action.
        for action in node.children.keys():
            if node_player == 1:
                node_regrets[node][action] = pi_c * pi_2 * (values_1[action] - v1)
            elif node_player == 2:
                node_regrets[node][action] = pi_c * pi_1 * (values_2[action] - v2)

        return v1, v2
    elif node_player == 0:
        # Chance player.
        v1 = 0.0
        v2 = 0.0
        for action, child in node.children.items():
            chance_prob = node.chance_probs[action]
            v1a, v2a = compute_node_regret_recursive(game, child, strategy,
                                                     node_regrets, pi_1, pi_2,
                                                     pi_c * chance_prob)
            v1 += chance_prob * v1a
            v2 += chance_prob * v2a

        return v1, v2
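
As a sanity check on the regret update above, here is a toy calculation for a single player 1 node with two actions. The values, strategy and reach probabilities are made up for illustration, and plain dicts stand in for the game and strategy objects.

# Toy check of the node-regret update for a player 1 node with two actions.
values_1 = {"left": 1.0, "right": -1.0}       # values_1[a]: player 1's expected utility after action a
strategy_p1 = {"left": 0.25, "right": 0.75}   # player 1's strategy at this node's information set
pi_2, pi_c = 0.5, 1.0                          # opponent and chance reach probabilities of the node

# v1 is the strategy-weighted value of the node, as computed in the loop over children above.
v1 = sum(strategy_p1[a] * values_1[a] for a in values_1)          # 0.25*1.0 + 0.75*(-1.0) = -0.5

# Immediate counterfactual regret of each action: (action value minus node value), weighted by the
# reach probability of everyone except player 1 (chance and player 2).
node_regrets_at_node = {a: pi_c * pi_2 * (values_1[a] - v1) for a in values_1}
print(node_regrets_at_node)  # {'left': 0.75, 'right': -0.25}
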
Example no. 4
def cfr_recursive(game, node, i, t, pi_c, pi_1, pi_2, regrets: typing.Dict[typing.Any, ActionFloat],
                  action_counts, strategy_t, strategy_t_1, cfr_state: cfr_util.CFRState,
                  use_chance_sampling=False, weight=1.0):
    cfr_state.node_touched()
    # If the node is terminal, just return the payoffs
    if is_terminal(node):
        return payoffs(node)[i]
    # If the next player is chance, then sample the chance action
    elif which_player(node) == 0:
        if use_chance_sampling:
            a = sample_chance_action(node)
            return cfr_recursive(
                game, node.children[a], i, t, pi_c, pi_1, pi_2,
                regrets, action_counts, strategy_t, strategy_t_1,
                cfr_state,
                use_chance_sampling=use_chance_sampling,
                weight=weight,
            )
        else:
            value = 0
            for a, cp in node.chance_probs.items():
                value += cp * cfr_recursive(
                    game, node.children[a], i, t, cp * pi_c, pi_1, pi_2,
                    regrets, action_counts, strategy_t, strategy_t_1,
                    cfr_state,
                    use_chance_sampling=use_chance_sampling,
                    weight=weight,
                )
            return value

    # Get the information set
    information_set = get_information_set(game, node)

    # Get the player to play and initialise values
    player = which_player(node)
    value = 0
    available_actions = get_available_actions(node)
    values_Itoa = {a: 0 for a in available_actions}

    # Initialise strategy_t[information_set] to the uniform strategy if this information set hasn't been seen yet.
    if information_set not in strategy_t.get_info_sets():
        strategy_t.set_uniform_action_probs(information_set, available_actions)

    # Compute the counterfactual value of this information set by computing the counterfactual
    # value of the information sets where the player plays each available action and taking
    # the expected value (by weighting by the strategy).
    for a in available_actions:
        if player == 1:
            values_Itoa[a] = cfr_recursive(
                game, node.children[a], i, t, pi_c,
                strategy_t[information_set][a] * pi_1, pi_2,
                regrets, action_counts, strategy_t, strategy_t_1,
                cfr_state,
                use_chance_sampling=use_chance_sampling,
                weight=weight,
            )
        else:
            values_Itoa[a] = cfr_recursive(
                game, node.children[a], i, t, pi_c,
                pi_1, strategy_t[information_set][a] * pi_2,
                regrets, action_counts, strategy_t, strategy_t_1,
                cfr_state,
                use_chance_sampling=use_chance_sampling,
                weight=weight
            )
        value += strategy_t[information_set][a] * values_Itoa[a]

    # Update regrets now that we have computed the counterfactual value of the
    # information set as well as the counterfactual values of playing each
    # action in the information set. First initialise regrets with this
    # information set if necessary.
    if information_set not in regrets:
        regrets[information_set] = ActionFloat.initialise_zero(available_actions)
    if player == i:
        if information_set not in action_counts:
            action_counts[information_set] = ActionFloat.initialise_zero(available_actions)

        action_counts_to_add = {a: 0.0 for a in available_actions}
        regrets_to_add = {a: 0.0 for a in available_actions}
        for a in available_actions:
            pi_minus_i = pi_c * pi_1 if i == 2 else pi_c * pi_2
            pi_i = pi_1 if i == 1 else pi_2
            regrets_to_add[a] = weight * (values_Itoa[a] - value) * pi_minus_i
            # action_counts_to_add[a] = pi_c * pi_i * strategy_t[information_set][a]
            action_counts_to_add[a] = weight * pi_i * strategy_t[information_set][a]

        # Update the regrets and action counts.
        regrets[information_set] = ActionFloat.sum(regrets[information_set], ActionFloat(regrets_to_add))
        action_counts[information_set] = ActionFloat.sum(
            action_counts[information_set],
            ActionFloat(action_counts_to_add)
        )

        # Update strategy t plus 1
        strategy_t_1[information_set] = cfr_util.compute_regret_matching(regrets[information_set])

    # Return the value
    return value
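
The weight argument above scales both the regret increments and the action-count (average strategy) increments contributed by the current iteration; weight=1.0 gives plain accumulation, while a caller could, for example, pass weight=t on iteration t to obtain a linear-CFR style weighting. Below is a small sketch of the accumulation step with plain dicts standing in for ActionFloat; the accumulate helper is hypothetical and simply mirrors what ActionFloat.sum is used for here.

# Sketch of the weighted regret accumulation, with plain dicts standing in for ActionFloat.
from typing import Dict


def accumulate(totals: Dict[str, float], delta: Dict[str, float]) -> Dict[str, float]:
    """Adds delta into totals key by key, as ActionFloat.sum does for the regret tables."""
    return {a: totals.get(a, 0.0) + delta.get(a, 0.0) for a in sorted(set(totals) | set(delta))}


if __name__ == "__main__":
    regrets = {"call": 1.0, "raise": -0.5}      # cumulative regrets so far
    values_Itoa = {"call": 0.5, "raise": 1.0}   # counterfactual values of each action this iteration
    value = 0.75                                # counterfactual value of the information set
    weight, pi_minus_i = 2.0, 0.25              # iteration weight and opponent/chance reach probability
    delta = {a: weight * (values_Itoa[a] - value) * pi_minus_i for a in values_Itoa}
    print(accumulate(regrets, delta))  # {'call': 0.875, 'raise': -0.375}
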
Example no. 5
def compute_expected_utility_recursive(
    game: extensive_game.ExtensiveGame,
    node: extensive_game.ExtensiveGameNode,
    sigma_1: extensive_game.Strategy,
    sigma_2: extensive_game.Strategy,
    expected_utility_1: Dict[extensive_game.ExtensiveGameNode, float],
    expected_utility_2: Dict[extensive_game.ExtensiveGameNode, float],
    pi_1: float,
    pi_2: float,
    pi_c: float,
):
    """
    Computes the expected utility of the given node for each player. This is defined as
        v_i(sigma, h) = sum_{z in Z_h} u_i(z) pi^sigma(h, z),
    where Z_h is the set of terminal nodes with h as a prefix, and pi^sigma(h, z) is the product of all
    probabilities in the strategy profile sigma on the route from h to z.

    Args:
        game: the game.
        node: the current node.
        sigma_1: player 1 strategy.
        sigma_2: player 2 strategy.
        expected_utility_1: dictionary mapping player 1 nodes to their utility. We fill this in.
        expected_utility_2: dictionary mapping player 2 nodes to their utility. We fill this in.
        pi_1: the reach probability of the node for player 1.
        pi_2: the reach probability of the node for player 2.
        pi_c: the reach probability of the node for the chance player.

    Returns:
        v_1: float. The expected utility of the given node to player 1.
        v_2: float. The expected utility of the given node to player 2.
    """
    node_player = cfr_game.which_player(node)
    if cfr_game.is_terminal(node):
        return node.utility[1], node.utility[2]
    elif node_player in [1, 2]:
        v1 = 0.0
        v2 = 0.0
        information_set = cfr_game.get_information_set(game, node)
        for action, child in node.children.items():
            pi_1_new = pi_1 * sigma_1[information_set][action] if node_player == 1 else pi_1
            pi_2_new = pi_2 * sigma_2[information_set][action] if node_player == 2 else pi_2
            v1a, v2a = compute_expected_utility_recursive(
                game,
                child,
                sigma_1,
                sigma_2,
                expected_utility_1,
                expected_utility_2,
                pi_1_new,
                pi_2_new,
                pi_c,
            )
            action_prob = (sigma_1[information_set][action] if node_player == 1
                           else sigma_2[information_set][action])
            v1 += action_prob * v1a
            v2 += action_prob * v2a

        if node_player == 1:
            expected_utility_1[node] += v1
        elif node_player == 2:
            expected_utility_2[node] += v2

        return v1, v2
    elif node_player == 0:
        # Chance player.
        v1 = 0.0
        v2 = 0.0
        for action, child in node.children.items():
            chance_prob = node.chance_probs[action]
            v1a, v2a = compute_expected_utility_recursive(
                game, child, sigma_1, sigma_2, expected_utility_1,
                expected_utility_2, pi_1, pi_2, pi_c * chance_prob)
            v1 += chance_prob * v1a
            v2 += chance_prob * v2a

        return v1, v2
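
To illustrate the quantity being computed, here is a tiny worked example of v_i(sigma, h) = sum_{z in Z_h} u_i(z) * pi^sigma(h, z) at a single player 1 decision node whose children are both terminal. The utilities and strategy are hypothetical; note that the reach probabilities pi_1, pi_2 and pi_c threaded through the recursion do not enter the node's value itself, they are only carried along for callers that need them.

# Worked example of the expected utility of a node under a strategy profile.
terminal_utilities = {            # (u_1(z), u_2(z)) for the terminal node reached by each action
    "bet": (2.0, -2.0),
    "check": (-1.0, 1.0),
}
sigma_1_here = {"bet": 0.75, "check": 0.25}   # player 1's strategy at this information set

# v_i(sigma, h): weight each terminal utility by the probability of reaching it from h.
v1 = sum(sigma_1_here[a] * terminal_utilities[a][0] for a in terminal_utilities)
v2 = sum(sigma_1_here[a] * terminal_utilities[a][1] for a in terminal_utilities)
print(v1, v2)  # 1.25 -1.25
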