Code example #1

# Imports needed by this snippet; GameInterface and GameRollout are assumed
# to be defined elsewhere in the project.
from collections import Counter

import torch

def traverse(
    game: GameInterface, policy_network, metrics: Counter, level: int,
) -> GameRollout:
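    """Play the game to completion, sampling each move from policy_network
    (or uniformly over the legal actions when no network is given), and
    return the visited decision points packed into a GameRollout:
    states, actions, legal-action masks, acting players, payoffs,
    distance-to-payoff and the policy used at each step."""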
    if game.terminal():
        gr = GameRollout(
            torch.zeros((level, game.feature_dim()), dtype=torch.float),  # States
            torch.zeros((level, 1), dtype=torch.long),  # Actions
            torch.zeros(
                (level, game.action_dim()), dtype=torch.long
            ),  # Possible Actions
            torch.zeros((level, 1), dtype=torch.long),  # Player to act
            game.payoffs().float().repeat((level, 1)),  # Payoffs
            torch.arange(level - 1, -1, -1, dtype=torch.float).unsqueeze(
                1
            ),  # Distance to payoff
            torch.zeros((level, game.action_dim()), dtype=torch.float),  # Policy
        )
        return gr

    features = torch.zeros((game.feature_dim(),), dtype=torch.float)
    game.populate_features(features)
    player_to_act = game.get_player_to_act()
    possible_actions = game.get_one_hot_actions(True)
    num_choices = possible_actions.sum()
    metrics.update({"possible_actions_" + str(possible_actions.sum()): 1})
    has_a_choice = num_choices > 1
    if policy_network is None or not has_a_choice:
        # Fall back to a uniform distribution over the legal actions.
        strategy = possible_actions.float()
    else:
        strategy = policy_network(
            features.unsqueeze(0), possible_actions.unsqueeze(0)
        )[0][0]
        # The network must assign zero probability to illegal actions.
        assert (strategy * (1 - possible_actions)).sum() == 0

    action_dist = torch.distributions.Categorical(strategy)
    if action_dist.probs.min() < 0 or action_dist.probs.max() == 0:
        print("Invalid action dist:", action_dist.probs)
    if strategy.min() < 0 or strategy.max() == 0:
        print("Invalid strategy:", strategy)

    metrics.update({"visit_level_" + str(level): 1})
    metrics["visit"] += 1
    if metrics["visit"] % 100000 == 0:
        print("Visits", metrics["visit"])

    action_taken = int(action_dist.sample().item())
    game.act(player_to_act, action_taken)
    if has_a_choice:
        result = traverse(game, policy_network, metrics, level + 1)
        # Record this decision point in the rollout returned by the subtree.
        result.states[level] = features
        result.actions[level] = action_taken
        result.player_to_act[level] = player_to_act
        result.possible_actions[level] = possible_actions
        result.policy[level] = strategy
    else:
        # Don't advance the level; skip this non-choice node.
        result = traverse(game, policy_network, metrics, level)
    return result
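
A minimal driver sketch (not part of the original code) for the rollout collector above. It assumes some concrete GameInterface implementation is available behind a factory; `make_game` and `collect_rollouts` are placeholder names introduced here for illustration.

from collections import Counter

def collect_rollouts(make_game, policy_network, num_rollouts: int):
    # Hypothetical driver: run traverse() from a fresh game each time and
    # gather the resulting GameRollout objects plus the shared metrics.
    metrics: Counter = Counter()
    rollouts = []
    for _ in range(num_rollouts):
        rollouts.append(traverse(make_game(), policy_network, metrics, level=0))
    return rollouts, metrics

# e.g. rollouts, metrics = collect_rollouts(make_game, policy_network=None, num_rollouts=100)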
Code example #2

# Imports needed by this snippet; GameInterface, FullyConnectedForward and
# ExpandableTensorSet are assumed to be defined elsewhere in the project.
import random
from collections import Counter
from typing import List, Optional

import torch

def traverse(
    game: GameInterface,
    player_to_train: int,
    regretModels: List[Optional[FullyConnectedForward]],
    playerRegret: ExpandableTensorSet,
    strategyModels: List[Optional[FullyConnectedForward]],
    strategyData: ExpandableTensorSet,
    metrics: Counter,
    level: int,
    first_pass: bool,
    branch_factor_estimate: float,
) -> torch.Tensor:
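    """One traversal for neural external-sampling MCCFR.

    At decision points owned by player_to_train (subject to a depth-dependent
    sampling test), a subset of the legal actions is explored recursively and
    the resulting instantaneous regrets are appended to playerRegret; at any
    node with a real choice, the strategy played is appended to strategyData
    on the first pass. Everywhere else a single action is sampled from the
    regret-matched strategy. Returns the expected payoff vector (one entry
    per player) of the subtree rooted at the current state."""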
    if game.terminal():
        return game.payoffs()

    features = torch.zeros((game.feature_dim(), ), dtype=torch.float)
    game.populate_features(features)

    player_to_act = game.get_player_to_act()
    model = regretModels[player_to_act]
    possible_actions = game.get_one_hot_actions(True)
    num_choices = possible_actions.sum()
    # Running mean of the branching factor along the path from the root.
    branch_factor_estimate = (branch_factor_estimate * level +
                              float(num_choices)) / (level + 1.0)
    metrics.update({"possible_actions_" + str(int(num_choices)): 1})
    has_a_choice = num_choices > 1
    if model is None:
        # No regret network yet (first iteration): play uniformly over the
        # legal actions and skip active sampling.
        strategy = possible_actions.float()
        active_sampling_chances = None
    else:
        assert strategyModels[player_to_act] is not None
        # Regret matching: clamp the predicted regrets to a small positive
        # floor and mask out illegal actions; Categorical renormalizes below.
        model_regrets = model.forward_cache(features.unsqueeze(0))[0]
        model_probs = model_regrets.clamp(min=1e-3) * possible_actions.float()
        strategy = model_probs
        # The average-strategy network drives the active-sampling test below.
        active_sampling_chances = (
            strategyModels[player_to_act]  # type: ignore
            .forward(features.unsqueeze(0))[0]
            .clamp(min=1e-3) * possible_actions.float()
        )
        active_sampling_chances_sum = float(active_sampling_chances.sum().item())

    action_dist = torch.distributions.Categorical(strategy)
    if action_dist.probs.min() < 0 or action_dist.probs.max() == 0:
        print("Invalid action dist:", action_dist.probs)
    if strategy.min() < 0 or strategy.max() == 0:
        print("Invalid strategy:", strategy)

    # Always recurse near the root; deeper in the tree, sample this node with
    # a probability that decays with depth and the estimated branching factor.
    if level < 2:
        chance_to_sample = 1.0
    else:
        chance_to_sample = 1.0 - 1.0 / (
            100.0 ** (1.0 / (level ** branch_factor_estimate)))
    do_sample = random.random() < chance_to_sample

    if has_a_choice and first_pass:
        strategyData.append((
            features.unsqueeze(0),
            possible_actions.unsqueeze(0),
            action_dist.probs.unsqueeze(0),
        ))

    metrics.update({"visit_level_" + str(level): 1})
    metrics["visit"] += 1
    if metrics["visit"] % 100000 == 0:
        print("Visits", metrics["visit"])

    can_traverse = player_to_train == player_to_act
    if can_traverse and has_a_choice and do_sample:
        # print("PASSED",level,chance_to_sample)
        metrics.update({"sample_level_" + str(level): 1})
        metrics["sample"] += 1
        if metrics["sample_level_" + str(level)] % 10000 == 0:
            print(
                "Samples",
                metrics["sample"],
                metrics["sample_level_" + str(level)],
                level,
                chance_to_sample,
            )

        # Evaluate (a sampled subset of) the legal actions, each in its own
        # clone of the game.
        payoff_for_action = torch.zeros(
            (possible_actions.size(0), game.num_players), dtype=torch.float)
        chosen_actions = torch.zeros_like(possible_actions)
        enum_actions = list(enumerate(possible_actions))
        random.shuffle(enum_actions)
        num_chosen = 0
        for i, a in enum_actions:
            if a == 0:
                continue
            g = game.clone()
            g.act(player_to_act, i)

            # Active sampling: https://papers.nips.cc/paper/4569-efficient-monte-carlo-counterfactual-regret-minimization-in-games-with-many-player-actions.pdf
            EPSILON = 0.05
            BONUS = 1e-6
            THRESHOLD = 1
            if active_sampling_chances is None:
                # Do Outcome sampling for the first iteration
                as_pass = num_chosen == 0
            else:
                as_pass = random.random() < float(
                    ((BONUS + THRESHOLD * active_sampling_chances[i]) /
                     (BONUS + active_sampling_chances_sum)).item())
            if level == 0:
                # Do external sampling at the game tree root
                as_pass = True
            # NOTE: the leading "True or" short-circuits this test, so every
            # legal action is traversed (pure external sampling); remove it to
            # re-enable the epsilon-greedy active-sampling gate above.
            if True or i == 0 or random.random() < EPSILON or as_pass:
                value = traverse(
                    g,
                    player_to_train,
                    regretModels,
                    playerRegret,
                    strategyModels,
                    strategyData,
                    metrics,
                    level + 1,
                    first_pass and num_chosen == 0,
                    branch_factor_estimate,
                )
                payoff_for_action[i] = value
                chosen_actions[i] = 1.0
                num_chosen += 1
        # Renormalize the strategy over the actions that were actually
        # traversed, then take the expectation of their payoff vectors.
        weighted_action_dist = torch.distributions.Categorical(
            action_dist.probs * chosen_actions.float())
        assert payoff_for_action.size(0) == weighted_action_dist.probs.size(0)
        expected_utility = payoff_for_action * weighted_action_dist.probs.unsqueeze(1)
        assert expected_utility.size() == payoff_for_action.size()
        expected_utility_over_all_actions = expected_utility.sum(dim=0)
        # Instantaneous regret for each traversed action of the acting player:
        # the action's payoff minus the expected payoff under the strategy.
        playerRegret.append((
            features.unsqueeze(0),
            chosen_actions.unsqueeze(0),
            (payoff_for_action[:, player_to_act] -
             expected_utility_over_all_actions[player_to_act]).unsqueeze(0),
        ))
        assert expected_utility_over_all_actions.size() == (game.num_players,), \
            str(expected_utility_over_all_actions.size())
        return expected_utility_over_all_actions
    else:
        # Not a sampled decision of the traversing player: play one action
        # from the current strategy and continue down the tree.
        game.act(player_to_act, int(action_dist.sample().item()))
        return traverse(
            game,
            player_to_train,
            regretModels,
            playerRegret,
            strategyModels,
            strategyData,
            metrics,
            level + int(can_traverse),
            first_pass,
            branch_factor_estimate,
        )
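
Similarly, a sketch of an outer CFR iteration that could drive this traversal. Everything here is illustrative: `make_game`, `regret_nets`, `strategy_nets`, `regret_buffers` and `strategy_buffer` are placeholder names, and the ExpandableTensorSet buffers and networks are assumed to be constructed elsewhere.

from collections import Counter

def run_iteration(make_game, regret_nets, strategy_nets,
                  regret_buffers, strategy_buffer,
                  traversals_per_player: int) -> Counter:
    # Hypothetical outer loop: each player takes a turn as the traversing
    # (regret-collecting) player, starting every traversal from a fresh game.
    metrics: Counter = Counter()
    num_players = make_game().num_players
    for player in range(num_players):
        for _ in range(traversals_per_player):
            traverse(
                make_game(),
                player,                   # player_to_train
                regret_nets,              # List[Optional[FullyConnectedForward]]
                regret_buffers[player],   # ExpandableTensorSet of regret samples
                strategy_nets,
                strategy_buffer,          # ExpandableTensorSet of strategy samples
                metrics,
                level=0,
                first_pass=True,
                branch_factor_estimate=1.0,
            )
    return metrics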