Code example #1
  def test_rcfr_with_buffer(self):
    buffer_size = 12
    num_epochs = 100
    num_iterations = 2
    models = [_new_model() for _ in range(_GAME.num_players())]

    patient = rcfr.ReservoirRcfrSolver(_GAME, models, buffer_size=buffer_size)

    def _train(model, data):
      data = torch.utils.data.DataLoader(
          data, batch_size=_BATCH_SIZE, shuffle=True)

      loss_fn = nn.SmoothL1Loss()
      optimizer = torch.optim.Adam(model.parameters(), lr=0.005, amsgrad=True)
      for _ in range(num_epochs):
        for x, y in data:
          optimizer.zero_grad()
          output = model(x)
          loss = loss_fn(output, y)
          loss.backward()
          optimizer.step()

    average_policy = patient.average_policy()
    self.assertGreater(pyspiel.nash_conv(_GAME, average_policy), 0.91)

    for _ in range(num_iterations):
      patient.evaluate_and_update_policy(_train)

    average_policy = patient.average_policy()
    self.assertLess(pyspiel.nash_conv(_GAME, average_policy), 0.91)
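
The rcfr test snippets in this section reference module-level fixtures (_GAME, _BATCH_SIZE, _new_model) that are defined elsewhere in rcfr_test.py and not shown on this page. The sketch below is an assumed reconstruction of that setup for the PyTorch variant; the game choice and the DeepRcfrModel hyperparameters are illustrative guesses, not copied from the project.

# Assumed module-level setup for the PyTorch rcfr_test.py snippets.
# The game and the DeepRcfrModel arguments below are illustrative only.
import pyspiel
import torch
from torch import nn

from open_spiel.python.pytorch import rcfr

_GAME = pyspiel.load_game("kuhn_poker")
_BATCH_SIZE = 12


def _new_model():
  # A small sequence-weight network; the real test may use different sizes.
  return rcfr.DeepRcfrModel(
      _GAME,
      num_hidden_layers=1,
      num_hidden_units=13,
      num_hidden_factors=1,
      use_skip_connections=True)
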
Code example #2
  def test_cfr(self):
    root = rcfr.RootStateWrapper(_GAME.new_initial_state())
    num_half_iterations = 6

    cumulative_regrets = [np.zeros(n) for n in root.num_player_sequences]
    cumulative_reach_weights = [np.zeros(n) for n in root.num_player_sequences]

    average_profile = root.sequence_weights_to_tabular_profile(
        cumulative_reach_weights)
    self.assertGreater(pyspiel.nash_conv(_GAME, average_profile), 0.91)

    regret_player = 0
    for _ in range(num_half_iterations):
      reach_weights_player = 1 if regret_player == 0 else 0

      regrets, reach = root.counterfactual_regrets_and_reach_weights(
          regret_player, reach_weights_player, *rcfr.relu(cumulative_regrets))

      cumulative_regrets[regret_player] += regrets
      cumulative_reach_weights[reach_weights_player] += reach

      regret_player = reach_weights_player

    average_profile = root.sequence_weights_to_tabular_profile(
        cumulative_reach_weights)
    self.assertLess(pyspiel.nash_conv(_GAME, average_profile), 0.27)
Code example #3
  def test_rcfr_with_buffer(self):
    buffer_size = 12
    num_epochs = 100
    num_iterations = 2
    models = [_new_model() for _ in range(_GAME.num_players())]

    patient = rcfr.ReservoirRcfrSolver(_GAME, models, buffer_size=buffer_size)

    def _train(model, data):
      data = data.shuffle(12)
      data = data.batch(12)
      data = data.repeat(num_epochs)

      optimizer = tf.keras.optimizers.Adam(lr=0.005, amsgrad=True)

      for x, y in data:
        optimizer.minimize(
            lambda: tf.losses.huber_loss(y, model(x)),  # pylint: disable=cell-var-from-loop
            model.trainable_variables)

    average_policy = patient.average_policy()
    self.assertGreater(pyspiel.nash_conv(_GAME, average_policy), 0.91)

    for _ in range(num_iterations):
      patient.evaluate_and_update_policy(_train)

    average_policy = patient.average_policy()
    self.assertLess(pyspiel.nash_conv(_GAME, average_policy), 0.91)
Code example #4
File: rcfr_test.py  Project: parton69/open_spiel
    def test_rcfr_functions(self):
        models = [_new_model() for _ in range(_GAME.num_players())]
        root = rcfr.RootStateWrapper(_GAME.new_initial_state())

        num_half_iterations = 4
        num_epochs = 100

        cumulative_regrets = [np.zeros(n) for n in root.num_player_sequences]
        cumulative_reach_weights = [
            np.zeros(n) for n in root.num_player_sequences
        ]

        average_profile = root.sequence_weights_to_tabular_profile(
            cumulative_reach_weights)
        self.assertGreater(pyspiel.nash_conv(_GAME, average_profile), 0.91)

        regret_player = 0
        sequence_weights = [
            model(root.sequence_features[player]).numpy()
            for player, model in enumerate(models)
        ]

        for _ in range(num_half_iterations):
            reach_weights_player = 1 if regret_player == 0 else 0

            sequence_weights[reach_weights_player] = models[
                reach_weights_player](
                    root.sequence_features[reach_weights_player]).numpy()

            regrets, seq_probs = root.counterfactual_regrets_and_reach_weights(
                regret_player, reach_weights_player, *sequence_weights)

            cumulative_regrets[regret_player] += regrets
            cumulative_reach_weights[reach_weights_player] += seq_probs

            data = tf.data.Dataset.from_tensor_slices(
                (root.sequence_features[regret_player],
                 tf.expand_dims(cumulative_regrets[regret_player], axis=1)))
            data = data.shuffle(12)
            data = data.batch(12)
            data = data.repeat(num_epochs)

            optimizer = tf.keras.optimizers.Adam(lr=0.005, amsgrad=True)

            for x, y in data:
                optimizer.minimize(
                    lambda: tf.compat.v1.losses.huber_loss(
                        y, models[regret_player](x)),  # pylint: disable=cell-var-from-loop
                    models[regret_player].trainable_variables)

            regret_player = reach_weights_player

        average_profile = root.sequence_weights_to_tabular_profile(
            cumulative_reach_weights)

        self.assertLess(pyspiel.nash_conv(_GAME, average_profile), 0.91)
Code example #5
  def test_rcfr_functions(self):
    models = [_new_model() for _ in range(_GAME.num_players())]
    root = rcfr.RootStateWrapper(_GAME.new_initial_state())

    num_half_iterations = 4
    num_epochs = 100

    cumulative_regrets = [np.zeros(n) for n in root.num_player_sequences]
    cumulative_reach_weights = [np.zeros(n) for n in root.num_player_sequences]

    average_profile = root.sequence_weights_to_tabular_profile(
        cumulative_reach_weights)
    self.assertGreater(pyspiel.nash_conv(_GAME, average_profile), 0.91)

    regret_player = 0
    sequence_weights = [
        model(root.sequence_features[player]).detach().numpy()
        for player, model in enumerate(models)
    ]

    for _ in range(num_half_iterations):
      reach_weights_player = 1 if regret_player == 0 else 0

      sequence_weights[reach_weights_player] = models[reach_weights_player](
          root.sequence_features[reach_weights_player]).detach().numpy()

      regrets, seq_probs = root.counterfactual_regrets_and_reach_weights(
          regret_player, reach_weights_player, *sequence_weights)

      cumulative_regrets[regret_player] += regrets
      cumulative_reach_weights[reach_weights_player] += seq_probs

      data = torch.utils.data.TensorDataset(
          root.sequence_features[regret_player],
          torch.unsqueeze(
              torch.Tensor(cumulative_regrets[regret_player]), axis=1))
      data = torch.utils.data.DataLoader(
          data, batch_size=_BATCH_SIZE, shuffle=True)

      loss_fn = nn.SmoothL1Loss()
      optimizer = torch.optim.Adam(
          models[regret_player].parameters(), lr=0.005, amsgrad=True)
      for _ in range(num_epochs):
        for x, y in data:
          optimizer.zero_grad()
          output = models[regret_player](x)
          loss = loss_fn(output, y)
          loss.backward()
          optimizer.step()

      regret_player = reach_weights_player

    average_profile = root.sequence_weights_to_tabular_profile(
        cumulative_reach_weights)
    self.assertLess(pyspiel.nash_conv(_GAME, average_profile), 0.91)
Code example #6
    def test_python_same_as_cpp_for_multiplayer_uniform_random_nash_conv(
            self, game_name, num_players):
        game = pyspiel.load_game(game_name, {"players": num_players})

        # TabularPolicy defaults to being a uniform random policy.
        test_policy = policy.TabularPolicy(game)
        python_nash_conv = exploitability.nash_conv(game, test_policy)
        cpp_nash_conv = pyspiel.nash_conv(
            game, policy_utils.policy_to_dict(test_policy, game))
        self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
Code example #7
 def test_cpp_python_cfr_kuhn(self):
     game = pyspiel.load_game("kuhn_poker")
     solver = pyspiel.CFRSolver(game)
     for _ in range(100):
         solver.evaluate_and_update_policy()
     pyspiel_average_policy = solver.tabular_average_policy()
     cpp_nash_conv = pyspiel.nash_conv(game, pyspiel_average_policy)
     python_policy = policy.pyspiel_policy_to_python_policy(
         game, pyspiel_average_policy)
     python_nash_conv = exploitability.nash_conv(game, python_policy)
     self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
Code example #8
    def test_neurd(self):
        num_iterations = 2
        models = [_new_model() for _ in range(_GAME.num_players())]

        solver = neurd.CounterfactualNeurdSolver(_GAME, models)

        average_policy = solver.average_policy()
        self.assertGreater(pyspiel.nash_conv(_GAME, average_policy), 0.91)

        @tf.function
        def _train(model, data):
            neurd.train(model=model,
                        data=data,
                        batch_size=12,
                        step_size=10.0,
                        autoencoder_loss=tf.losses.huber_loss)

        for _ in range(num_iterations):
            solver.evaluate_and_update_policy(_train)

        average_policy = solver.average_policy()
        self.assertLess(pyspiel.nash_conv(_GAME, average_policy), 0.91)
Code example #9
  def test_cpp_and_python_cfr_br(self, game, solver_cls,
                                 expected_exploitability):
    solver = solver_cls(game)
    for step in range(5):
      solver.evaluate_and_update_policy()

      # We do not compare the policy directly as we do not have an easy way to
      # convert one to the other, so we use the exploitability as a proxy.
      avg_policy = solver.average_policy()
      if solver_cls == pyspiel.CFRBRSolver:
        exploitability_ = pyspiel.nash_conv(game, avg_policy)
      else:
        exploitability_ = exploitability.nash_conv(game, avg_policy)

      self.assertEqual(expected_exploitability[step], exploitability_)
Code example #10
    def test_cpp_algorithms_identical_to_python_algorithm(
            self, game, cpp_class, python_class):
        cpp_solver = cpp_class(game)
        python_solver = python_class(game)

        for _ in range(5):
            cpp_solver.evaluate_and_update_policy()
            python_solver.evaluate_and_update_policy()

            cpp_avg_policy = cpp_solver.average_policy()
            python_avg_policy = python_solver.average_policy()

            # We do not compare the policy directly as we do not have an easy way to
            # convert one to the other, so we use the exploitability as a proxy.
            cpp_expl = pyspiel.nash_conv(game, cpp_avg_policy)
            python_expl = exploitability.nash_conv(game, python_avg_policy)
            self.assertEqual(cpp_expl, python_expl)
        # Then we also check the CurrentPolicy, just to check it is giving the same
        # results too
        cpp_current_policy = cpp_solver.current_policy()
        python_current_policy = python_solver.current_policy()
        cpp_expl = pyspiel.nash_conv(game, cpp_current_policy)
        python_expl = exploitability.nash_conv(game, python_current_policy)
        self.assertEqual(cpp_expl, python_expl)
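
The parameterized decorator that supplies game, cpp_class, and python_class to the test above is not shown on this page. A plausible way to wire it up, assuming absl's parameterized test support and OpenSpiel's Python cfr module (the specific game/solver pairs are illustrative):

# Hypothetical parameterization for the C++/Python consistency test above.
from absl.testing import parameterized
import pyspiel
from open_spiel.python.algorithms import cfr


class CFRConsistencyTest(parameterized.TestCase):

    @parameterized.parameters(
        (pyspiel.load_game("kuhn_poker"), pyspiel.CFRSolver, cfr.CFRSolver),
        (pyspiel.load_game("kuhn_poker"), pyspiel.CFRPlusSolver,
         cfr.CFRPlusSolver),
    )
    def test_cpp_algorithms_identical_to_python_algorithm(
            self, game, cpp_class, python_class):
        ...  # body as in code example #10 above
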
Code example #11
def run_iterations(game, solver, start_iteration=0):
    """Run iterations of MCCFR."""
    for i in range(int(FLAGS.iterations / 2)):
        solver.run_iteration()
        policy = solver.average_policy()
        exploitability = pyspiel.exploitability(game, policy)

        # We also compute NashConv to highlight an important API feature: when
        # using Monte Carlo sampling, the policy may not have a table entry for
        # every info state. Therefore, when calling nash_conv, ensure the third
        # argument, "use_state_get_policy", is set to True.
        # See https://github.com/deepmind/open_spiel/issues/500
        nash_conv = pyspiel.nash_conv(game, policy, True)

        print("Iteration {} nashconv: {:.6f} exploitability: {:.6f}".format(
            start_iteration + i, nash_conv, exploitability))
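
run_iterations above assumes an absl flag named "iterations" and an MCCFR solver constructed by the caller. A minimal driver, assuming OpenSpiel's outcome-sampling MCCFR binding (the flag default and game choice are illustrative):

# Hypothetical driver for run_iterations above.
from absl import app
from absl import flags

import pyspiel

flags.DEFINE_integer("iterations", 1000, "Total number of MCCFR iterations.")
FLAGS = flags.FLAGS


def main(_):
    game = pyspiel.load_game("kuhn_poker")
    solver = pyspiel.OutcomeSamplingMCCFRSolver(game)
    # Each call runs FLAGS.iterations / 2 iterations, so two calls cover the
    # full budget; start_iteration keeps the printed iteration index monotone.
    run_iterations(game, solver)
    run_iterations(game, solver, start_iteration=FLAGS.iterations // 2)


if __name__ == "__main__":
    app.run(main)
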
Code example #12
 def test_matching_pennies_3p(self):
     game = pyspiel.load_game_as_turn_based('matching_pennies_3p')
     deep_cfr_solver = deep_cfr.DeepCFRSolver(game,
                                              policy_network_layers=(16, 8),
                                              advantage_network_layers=(32,
                                                                        16),
                                              num_iterations=2,
                                              num_traversals=2,
                                              learning_rate=1e-3,
                                              batch_size_advantage=None,
                                              batch_size_strategy=None,
                                              memory_capacity=1e7)
     deep_cfr_solver.solve()
     conv = pyspiel.nash_conv(
         game,
         policy.python_policy_to_pyspiel_policy(
             policy.tabular_policy_from_callable(
                 game, deep_cfr_solver.action_probabilities)))
     logging.info('Deep CFR in Matching Pennies 3p. NashConv: %.2f', conv)
Code example #13
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)

    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        game,
        policy_network_layers=(32, 32),
        advantage_network_layers=(16, 16),
        num_iterations=FLAGS.num_iterations,
        num_traversals=FLAGS.num_traversals,
        learning_rate=1e-3,
        batch_size_advantage=None,
        batch_size_strategy=None,
        memory_capacity=int(1e7))

    _, advantage_losses, policy_loss = deep_cfr_solver.solve()
    for player, losses in six.iteritems(advantage_losses):
        logging.info("Advantage for player %d: %s", player,
                     losses[:2] + ["..."] + losses[-2:])
        logging.info("Advantage Buffer Size for player %s: '%s'", player,
                     len(deep_cfr_solver.advantage_buffers[player]))
    logging.info("Strategy Buffer Size: '%s'",
                 len(deep_cfr_solver.strategy_buffer))
    logging.info("Final policy loss: '%s'", policy_loss)

    average_policy = policy.tabular_policy_from_callable(
        game, deep_cfr_solver.action_probabilities)
    pyspiel_policy = policy.python_policy_to_pyspiel_policy(average_policy)
    conv = pyspiel.nash_conv(game, pyspiel_policy)
    logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)

    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    logging.info("Computed player 0 value: %.2f (expected: %.2f).",
                 average_policy_values[0], -1 / 18)
    logging.info("Computed player 1 value: %.2f (expected: %.2f).",
                 average_policy_values[1], 1 / 18)
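
The main function above reads FLAGS.game_name, FLAGS.num_iterations, and FLAGS.num_traversals, whose definitions are not shown on this page. A minimal set of definitions plus the entry point (default values are illustrative):

# Assumed flag definitions and entry point for the Deep CFR example above.
from absl import app
from absl import flags

FLAGS = flags.FLAGS

flags.DEFINE_string("game_name", "kuhn_poker", "Name of the game.")
flags.DEFINE_integer("num_iterations", 100, "Number of Deep CFR iterations.")
flags.DEFINE_integer("num_traversals", 40,
                     "Number of game traversals per iteration.")

if __name__ == "__main__":
    app.run(main)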