Code example #1
    def test_braess_paradox(self):
        """Test that Braess paradox can be reproduced with the mean field game."""
        mfg_game = pyspiel.load_game("python_mfg_dynamic_routing", {
            "time_step_length": 0.05,
            "max_num_time_step": 100
        })

        class NashEquilibriumBraess(policy.Policy):
            def action_probabilities(self, state, player_id=None):
                legal_actions = state.legal_actions()
                if not legal_actions:
                    return {dynamic_routing_utils.NO_POSSIBLE_ACTION: 1.0}
                elif len(legal_actions) == 1:
                    return {legal_actions[0]: 1.0}
                else:
                    if legal_actions[0] == 2:
                        return {2: 0.75, 3: 0.25}
                    elif legal_actions[0] == 4:
                        return {4: 2 / 3, 5: 1 / 3}
                raise ValueError(f"{legal_actions} is not correct.")

        ne_policy = NashEquilibriumBraess(mfg_game, 1)
        self.assertEqual(
            -policy_value.PolicyValue(
                mfg_game, distribution.DistributionPolicy(mfg_game, ne_policy),
                ne_policy).value(mfg_game.new_initial_state()), 3.75)
        self.assertEqual(
            nash_conv.NashConv(mfg_game, ne_policy).nash_conv(), 0.0)

        class SocialOptimumBraess(policy.Policy):
            def action_probabilities(self, state, player_id=None):
                legal_actions = state.legal_actions()
                if not legal_actions:
                    return {dynamic_routing_utils.NO_POSSIBLE_ACTION: 1.0}
                elif len(legal_actions) == 1:
                    return {legal_actions[0]: 1.0}
                else:
                    if legal_actions[0] == 2:
                        return {2: 0.5, 3: 0.5}
                    elif legal_actions[0] == 4:
                        return {5: 1.0}
                raise ValueError(f"{legal_actions} is not correct.")

        so_policy = SocialOptimumBraess(mfg_game, 1)
        self.assertEqual(
            -policy_value.PolicyValue(
                mfg_game, distribution.DistributionPolicy(mfg_game, so_policy),
                so_policy).value(mfg_game.new_initial_state()), 3.5)
        self.assertEqual(
            nash_conv.NashConv(mfg_game, so_policy).nash_conv(), 0.75)
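The snippets on this page are extracted from OpenSpiel test and example files, so their import headers are omitted. Below is a minimal sketch of the imports the mean-field-game snippets assume, based on the module layout of the open_spiel Python package; verify the paths against your installed version, and note that the DQN examples additionally need a backend-specific dqn module (e.g. from the JAX subpackage).

# Assumed import paths for the snippets on this page; module names follow the
# open_spiel Python package layout and should be checked against your version.
# Some examples alias these modules, e.g. nash_conv as nash_conv_module.
import numpy as np
import pyspiel

from open_spiel.python import policy
from open_spiel.python import rl_environment
from open_spiel.python.games import dynamic_routing_utils
from open_spiel.python.mfg import value
from open_spiel.python.mfg.algorithms import best_response_value
from open_spiel.python.mfg.algorithms import distribution
from open_spiel.python.mfg.algorithms import fictitious_play
from open_spiel.python.mfg.algorithms import greedy_policy
from open_spiel.python.mfg.algorithms import mirror_descent
from open_spiel.python.mfg.algorithms import nash_conv
from open_spiel.python.mfg.algorithms import policy_value
from open_spiel.python.mfg.games import crowd_modelling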
Code example #2
    def test_online_mirror_descent(self):
        """Test that online mirror descent can be used on this game."""
        mfg_game = pyspiel.load_game("python_mfg_dynamic_routing")
        omd = mirror_descent.MirrorDescent(mfg_game)
        for _ in range(_NUMBER_OF_ITERATIONS_TESTS):
            omd.iteration()
        nash_conv.NashConv(mfg_game, omd.get_policy())
Code example #3
    def test_python_game(self):
        """Checks if the NashConv is consistent through time."""
        game = crowd_modelling.MFGCrowdModellingGame()
        uniform_policy = policy.UniformRandomPolicy(game)
        nash_conv_fp = nash_conv.NashConv(game, uniform_policy)

        self.assertAlmostEqual(nash_conv_fp.nash_conv(), 3.1700299751054217)
Code example #4
    def test_cpp_game(self):
        """Checks if the NashConv is consistent through time."""
        game = pyspiel.load_game("mfg_crowd_modelling")
        uniform_policy = policy.UniformRandomPolicy(game)
        nash_conv_fp = nash_conv.NashConv(game, uniform_policy)

        self.assertAlmostEqual(nash_conv_fp.nash_conv(), 3.1700299751054217)
Code example #5
    def test_dqn_fp_python_game(self):
        """Checks if fictitious play with DQN-based value function works."""
        game = crowd_modelling.MFGCrowdModellingGame()
        dfp = fictitious_play.FictitiousPlay(game)

        uniform_policy = policy.UniformRandomPolicy(game)
        dist = distribution.DistributionPolicy(game, uniform_policy)
        envs = [
            rl_environment.Environment(game,
                                       mfg_distribution=dist,
                                       mfg_population=p)
            for p in range(game.num_players())
        ]
        dqn_agent = dqn.DQN(
            0,
            state_representation_size=envs[0].observation_spec()["info_state"]
            [0],
            num_actions=envs[0].action_spec()["num_actions"],
            hidden_layers_sizes=[256, 128, 64],
            replay_buffer_capacity=100,
            batch_size=5,
            epsilon_start=0.02,
            epsilon_end=0.01)

        for _ in range(10):
            dfp.iteration(rl_br_agent=dqn_agent)

        dfp_policy = dfp.get_policy()
        nash_conv_dfp = nash_conv.NashConv(game, dfp_policy)

        self.assertAlmostEqual(nash_conv_dfp.nash_conv(), 1.0558451955622807)
Code example #6
    def test_fp_cpp_game(self):
        """Checks if mirror descent works."""
        game = pyspiel.load_game("mfg_crowd_modelling")
        md = mirror_descent.MirrorDescent(game)
        for _ in range(10):
            md.iteration()
        md_policy = md.get_policy()
        nash_conv_md = nash_conv.NashConv(game, md_policy)

        self.assertAlmostEqual(nash_conv_md.nash_conv(), 2.2730324915546056)
Code example #7
    def test_fp_python_game(self):
        """Checks if mirror descent works."""
        game = crowd_modelling.MFGCrowdModellingGame()
        md = mirror_descent.MirrorDescent(game)
        for _ in range(10):
            md.iteration()
        md_policy = md.get_policy()
        nash_conv_md = nash_conv.NashConv(game, md_policy)

        self.assertAlmostEqual(nash_conv_md.nash_conv(), 2.2730324915546056)
Code example #8
  def test_fp(self, name):
    """Checks if mirror descent works."""
    game = pyspiel.load_game(name)
    md = mirror_descent.MirrorDescent(game, value.TabularValueFunction(game))
    for _ in range(10):
      md.iteration()
    md_policy = md.get_policy()
    nash_conv_md = nash_conv.NashConv(game, md_policy)

    self.assertAlmostEqual(nash_conv_md.nash_conv(), 2.2730324915546056)
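Example #8 receives the game name as a test parameter. A hedged sketch of how such a test could be wired up with absl's parameterized runner is shown below; the parameter labels and registered game ids are assumptions based on the OpenSpiel game registry, not part of the snippet above.

# Hedged sketch: drive the parameterized test above with absl's test runner.
# The registered game ids are assumptions; check pyspiel.registered_names().
from absl.testing import absltest
from absl.testing import parameterized

import pyspiel
from open_spiel.python.mfg import value
from open_spiel.python.mfg.algorithms import mirror_descent
from open_spiel.python.mfg.algorithms import nash_conv


class MirrorDescentTest(parameterized.TestCase):

  @parameterized.named_parameters(("python", "python_mfg_crowd_modelling"),
                                  ("cpp", "mfg_crowd_modelling"))
  def test_fp(self, name):
    """Checks that mirror descent runs on the named game."""
    game = pyspiel.load_game(name)
    md = mirror_descent.MirrorDescent(game, value.TabularValueFunction(game))
    for _ in range(3):
      md.iteration()
    nash_conv.NashConv(game, md.get_policy())


if __name__ == "__main__":
  absltest.main()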
Code example #9
    def test_fp_cpp_game(self):
        """Checks if fictitious play works."""
        game = pyspiel.load_game("mfg_crowd_modelling")
        fp = fictitious_play.FictitiousPlay(game)
        for _ in range(10):
            fp.iteration()
        fp_policy = fp.get_policy()
        nash_conv_fp = nash_conv.NashConv(game, fp_policy)

        self.assertAlmostEqual(nash_conv_fp.nash_conv(), 0.9908032626911343)
Code example #10
    def test_fp_python_game(self):
        """Checks if fictitious play works."""
        game = crowd_modelling.MFGCrowdModellingGame()
        fp = fictitious_play.FictitiousPlay(game)
        for _ in range(10):
            fp.iteration()
        fp_policy = fp.get_policy()
        nash_conv_fp = nash_conv.NashConv(game, fp_policy)

        self.assertAlmostEqual(nash_conv_fp.nash_conv(), 0.9908032626911343)
Code example #11
    def test_online_mirror_descent_convergence(self):
        """Test that online mirror descent converges to equilibrium in default game."""
        mfg_game = pyspiel.load_game("python_mfg_dynamic_routing", {
            "time_step_length": 0.05,
            "max_num_time_step": 100
        })
        omd = mirror_descent.MirrorDescent(mfg_game, lr=1)
        for _ in range(50):
            omd.iteration()
        self.assertAlmostEqual(
            nash_conv.NashConv(mfg_game, omd.get_policy()).nash_conv(), 0)
Code example #12
def online_mirror_descent_sioux_falls(mfg_game,
                                      number_of_iterations,
                                      md_p=None):
    nash_conv_dict = {}
    md = md_p if md_p else mirror_descent.MirrorDescent(mfg_game)
    tick_time = time.time()
    for i in range(number_of_iterations):
        md.iteration()
        md_policy = md.get_policy()
        nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
        nash_conv_dict[i] = nash_conv_md.nash_conv()
        print(f"Iteration {i}, Nash conv: {nash_conv_md.nash_conv()}, "
              f"time: {time.time() - tick_time}")
    timing = time.time() - tick_time
    md_policy = md.get_policy()
    distribution_mfg = distribution_module.DistributionPolicy(
        mfg_game, md_policy)
    policy_value_ = policy_value.PolicyValue(mfg_game, distribution_mfg,
                                             md_policy).value(
                                                 mfg_game.new_initial_state())
    nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
    return timing, md_policy, nash_conv_md, policy_value_, md, nash_conv_dict
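A hypothetical call to the helper above follows. The original experiment loads a Sioux Falls routing network, which is not shown on this page, so the default dynamic routing game stands in for it here purely for illustration.

# Hypothetical usage of online_mirror_descent_sioux_falls; the real Sioux
# Falls network construction is not shown here, so the default game is used.
game = pyspiel.load_game("python_mfg_dynamic_routing")
(timing, md_policy, nash_conv_md, policy_value_, md,
 nash_conv_dict) = online_mirror_descent_sioux_falls(game, 5)
print(f"NashConv after 5 iterations: {nash_conv_md.nash_conv()} "
      f"(wall time: {timing:.1f}s)")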
Code example #13
def mean_field_fictitious_play(mfg_game,
                               number_of_iterations,
                               compute_metrics=False):
    fp = mean_field_fictitious_play_module.FictitiousPlay(mfg_game)
    tick_time = time.time()
    for _ in range(number_of_iterations):
        fp.iteration()
    timing = time.time() - tick_time
    fp_policy = fp.get_policy()
    # print('learning done')
    if compute_metrics:
        distribution_mfg = distribution_module.DistributionPolicy(
            mfg_game, fp_policy)
        # print('distribution done')
        policy_value_ = policy_value.PolicyValue(
            mfg_game, distribution_mfg,
            fp_policy).value(mfg_game.new_initial_state())
        nash_conv_fp = nash_conv_module.NashConv(mfg_game, fp_policy)
        return timing, fp_policy, nash_conv_fp, policy_value_
    return timing, fp_policy
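A hypothetical call illustrating the two return shapes of the helper above; the game name is an assumption.

# Hypothetical usage of mean_field_fictitious_play; with compute_metrics=True
# the helper also returns the NashConv object and the policy value.
game = pyspiel.load_game("mfg_crowd_modelling")
timing, fp_policy = mean_field_fictitious_play(game, number_of_iterations=5)
timing, fp_policy, nash_conv_fp, policy_value_ = mean_field_fictitious_play(
    game, number_of_iterations=5, compute_metrics=True)
print("NashConv:", nash_conv_fp.nash_conv(), "policy value:", policy_value_)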
Code example #14
def online_mirror_descent(mfg_game,
                          number_of_iterations,
                          compute_metrics=False,
                          return_policy=False,
                          md_p=None):
    md = md_p if md_p else mirror_descent.MirrorDescent(mfg_game)
    tick_time = time.time()
    for _ in range(number_of_iterations):
        md.iteration()
    timing = time.time() - tick_time
    md_policy = md.get_policy()
    if compute_metrics:
        distribution_mfg = distribution_module.DistributionPolicy(
            mfg_game, md_policy)
        # print('distribution done')
        policy_value_ = policy_value.PolicyValue(
            mfg_game, distribution_mfg,
            md_policy).value(mfg_game.new_initial_state())
        nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
        if return_policy:
            return timing, md_policy, nash_conv_md, policy_value_, md
        return timing, md_policy, nash_conv_md, policy_value_
    return timing, md_policy
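Because the helper above accepts an existing MirrorDescent instance through md_p (and returns it when return_policy=True), iterations can be resumed across calls. A minimal sketch of that pattern follows, with the game name assumed.

# Hypothetical warm-start pattern for online_mirror_descent: reuse the
# MirrorDescent instance returned by a first call to continue optimizing.
game = pyspiel.load_game("mfg_crowd_modelling")
_, _, _, _, md = online_mirror_descent(
    game, number_of_iterations=5, compute_metrics=True, return_policy=True)
# Continue for five more iterations from the same internal state.
timing, md_policy, nash_conv_md, policy_value_ = online_mirror_descent(
    game, number_of_iterations=5, compute_metrics=True, md_p=md)
print("NashConv after 10 total iterations:", nash_conv_md.nash_conv())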
Code example #15
def main(argv: Sequence[str]) -> None:
  # TODO(perolat): move to an example directory.
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  mfg_game = pyspiel.load_game(FLAGS.game, GAME_SETTINGS.get(FLAGS.game, {}))
  mfg_state = mfg_game.new_initial_state()
  print('Playing a single arbitrary trajectory')
  while not mfg_state.is_terminal():
    print('State obs string:', mfg_state.observation_string(0))
    if mfg_state.current_player() == pyspiel.PlayerId.CHANCE:
      action_list, prob_list = zip(*mfg_state.chance_outcomes())
      action = np.random.choice(action_list, p=prob_list)
      mfg_state.apply_action(action)
    elif mfg_state.current_player() == pyspiel.PlayerId.MEAN_FIELD:
      dist_to_register = mfg_state.distribution_support()
      n_states = len(dist_to_register)
      dist = [1.0 / n_states for _ in range(n_states)]
      mfg_state.update_distribution(dist)
    else:
      legal_list = mfg_state.legal_actions()
      action = np.random.choice(legal_list)
      mfg_state.apply_action(action)

  print('compute nashconv')
  uniform_policy = policy.UniformRandomPolicy(mfg_game)
  nash_conv_fp = nash_conv.NashConv(mfg_game, uniform_policy)
  print('Nashconv:', nash_conv_fp.nash_conv())

  print('compute distribution')
  mfg_dist = distribution.DistributionPolicy(mfg_game, uniform_policy)
  br_value = best_response_value.BestResponse(
      mfg_game, mfg_dist, value.TabularValueFunction(mfg_game))
  py_value = policy_value.PolicyValue(mfg_game, mfg_dist, uniform_policy,
                                      value.TabularValueFunction(mfg_game))
  print(
      'Value of a best response policy to a uniform policy '
      '(computed with best_response_value)',
      br_value(mfg_game.new_initial_state()))
  print('Value of the uniform policy:', py_value(mfg_game.new_initial_state()))
  greedy_pi = greedy_policy.GreedyPolicy(mfg_game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(mfg_game, mfg_dist, greedy_pi,
                                        value.TabularValueFunction(mfg_game))
  print(
      'Value of a best response policy to a uniform policy (computed at the '
      'value of the greedy policy of the best response value)',
      pybr_value(mfg_game.new_initial_state()))
  print('merge')
  merged_pi = fictitious_play.MergedPolicy(
      mfg_game, list(range(mfg_game.num_players())),
      [uniform_policy, greedy_pi],
      [mfg_dist, distribution.DistributionPolicy(mfg_game, greedy_pi)],
      [0.5, 0.5])

  merged_pi_value = policy_value.PolicyValue(
      mfg_game, mfg_dist, merged_pi, value.TabularValueFunction(mfg_game))
  print(br_value(mfg_game.new_initial_state()))
  print(py_value(mfg_game.new_initial_state()))
  print(merged_pi_value(mfg_game.new_initial_state()))
  print((br_value(mfg_game.new_initial_state()) +
         py_value(mfg_game.new_initial_state())) / 2)
  print('fp')
  fp = fictitious_play.FictitiousPlay(mfg_game)
  for j in range(100):
    print('Iteration', j, 'of fictitious play')
    fp.iteration()
    fp_policy = fp.get_policy()
    nash_conv_fp = nash_conv.NashConv(mfg_game, fp_policy)
    print('Nashconv of the current FP policy', nash_conv_fp.nash_conv())
  print('md')
  md = mirror_descent.MirrorDescent(mfg_game,
                                    value.TabularValueFunction(mfg_game))
  for j in range(10):
    print('Iteration', j, 'of mirror descent')
    md.iteration()
    md_policy = md.get_policy()
    nash_conv_md = nash_conv.NashConv(mfg_game, md_policy)
    print('Nashconv of the current MD policy', nash_conv_md.nash_conv())
Code example #16
File: mfg_dqn_jax.py, Project: sarahperrin/open_spiel
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name,
                             GAME_SETTINGS.get(FLAGS.game_name, {}))
    uniform_policy = policy.UniformRandomPolicy(game)
    mfg_dist = distribution.DistributionPolicy(game, uniform_policy)

    envs = [
        rl_environment.Environment(game,
                                   mfg_distribution=mfg_dist,
                                   mfg_population=p)
        for p in range(game.num_players())
    ]
    info_state_size = envs[0].observation_spec()["info_state"][0]
    num_actions = envs[0].action_spec()["num_actions"]

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
        "batch_size": FLAGS.batch_size,
        "learn_every": FLAGS.learn_every,
        "learning_rate": FLAGS.rl_learning_rate,
        "optimizer_str": FLAGS.optimizer_str,
        "loss_str": FLAGS.loss_str,
        "update_target_network_every": FLAGS.update_target_network_every,
        "discount_factor": FLAGS.discount_factor,
        "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
        "epsilon_start": FLAGS.epsilon_start,
        "epsilon_end": FLAGS.epsilon_end,
    }

    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(idx, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs) for idx in range(game.num_players())
    ]
    joint_avg_policy = rl_agent_policy.JointRLAgentPolicy(
        game, {idx: agent
               for idx, agent in enumerate(agents)}, envs[0].use_observation)
    if FLAGS.use_checkpoints:
        for agent in agents:
            if agent.has_checkpoint(FLAGS.checkpoint_dir):
                agent.restore(FLAGS.checkpoint_dir)

    # Metrics writer will also log the metrics to stderr.
    just_logging = FLAGS.logdir is None or jax.host_id() > 0
    writer = metric_writers.create_default_writer(FLAGS.logdir,
                                                  just_logging=just_logging)

    # Save the parameters.
    writer.write_hparams(kwargs)

    for ep in range(1, FLAGS.num_train_episodes + 1):
        if ep % FLAGS.eval_every == 0:
            writer.write_scalars(
                ep, {
                    f"agent{i}/loss": float(agent.loss)
                    for i, agent in enumerate(agents)
                })

            initial_states = game.new_initial_states()

            # Exact best response to uniform.
            nash_conv_obj = nash_conv.NashConv(game, uniform_policy)
            writer.write_scalars(
                ep, {
                    f"exact_br/{state}": value
                    for state, value in zip(initial_states,
                                            nash_conv_obj.br_values())
                })

            # DQN best response to uniform.
            pi_value = policy_value.PolicyValue(game, mfg_dist,
                                                joint_avg_policy)
            writer.write_scalars(
                ep, {
                    f"dqn_br/{state}": pi_value.eval_state(state)
                    for state in initial_states
                })

            if FLAGS.use_checkpoints:
                for agent in agents:
                    agent.save(FLAGS.checkpoint_dir)

        for p in range(game.num_players()):
            time_step = envs[p].reset()
            while not time_step.last():
                agent_output = agents[p].step(time_step)
                action_list = [agent_output.action]
                time_step = envs[p].step(action_list)

            # Episode is over, step all agents with final info state.
            agents[p].step(time_step)

    # Make sure all values were written.
    writer.flush()
Code example #17
File: mfg_dqn_jax.py, Project: dmorrill10/open_spiel
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name,
                             GAME_SETTINGS.get(FLAGS.game_name, {}))
    uniform_policy = policy.UniformRandomPolicy(game)
    mfg_dist = distribution.DistributionPolicy(game, uniform_policy)

    envs = [
        rl_environment.Environment(game,
                                   mfg_distribution=mfg_dist,
                                   mfg_population=p)
        for p in range(game.num_players())
    ]
    info_state_size = envs[0].observation_spec()["info_state"][0]
    num_actions = envs[0].action_spec()["num_actions"]

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
        "batch_size": FLAGS.batch_size,
        "learn_every": FLAGS.learn_every,
        "learning_rate": FLAGS.rl_learning_rate,
        "optimizer_str": FLAGS.optimizer_str,
        "loss_str": FLAGS.loss_str,
        "update_target_network_every": FLAGS.update_target_network_every,
        "discount_factor": FLAGS.discount_factor,
        "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
        "epsilon_start": FLAGS.epsilon_start,
        "epsilon_end": FLAGS.epsilon_end,
    }

    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(idx, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs) for idx in range(game.num_players())
    ]
    joint_avg_policy = DQNPolicies(envs, agents)
    if FLAGS.use_checkpoints:
        for agent in agents:
            if agent.has_checkpoint(FLAGS.checkpoint_dir):
                agent.restore(FLAGS.checkpoint_dir)

    for ep in range(FLAGS.num_train_episodes):
        if (ep + 1) % FLAGS.eval_every == 0:
            losses = [agent.loss for agent in agents]
            logging.info("Losses: %s", losses)
            nash_conv_obj = nash_conv.NashConv(game, uniform_policy)
            print(
                str(ep + 1) + " Exact Best Response to Uniform " +
                str(nash_conv_obj.br_values()))
            pi_value = policy_value.PolicyValue(game, mfg_dist,
                                                joint_avg_policy)
            print(
                str(ep + 1) + " DQN Best Response to Uniform " + str([
                    pi_value.eval_state(state)
                    for state in game.new_initial_states()
                ]))
            if FLAGS.use_checkpoints:
                for agent in agents:
                    agent.save(FLAGS.checkpoint_dir)
            logging.info("_____________________________________________")

        for p in range(game.num_players()):
            time_step = envs[p].reset()
            while not time_step.last():
                agent_output = agents[p].step(time_step)
                action_list = [agent_output.action]
                time_step = envs[p].step(action_list)

            # Episode is over, step all agents with final info state.
            agents[p].step(time_step)
Code example #18
def main(argv: Sequence[str]) -> None:
    # TODO(perolat): move to an example directory.
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    game_settings = {
        'only_distribution_reward': True,
        'forbidden_states': '[0|0;0|1]',
        'initial_distribution': '[0|2;0|3]',
        'initial_distribution_value': '[0.5;0.5]',
    }
    mfg_game = pyspiel.load_game(FLAGS.game, game_settings)
    mfg_state = mfg_game.new_initial_state()
    while not mfg_state.is_terminal():
        print(mfg_state.observation_string(0))
        if mfg_state.current_player() == pyspiel.PlayerId.CHANCE:
            action_list, prob_list = zip(*mfg_state.chance_outcomes())
            action = np.random.choice(action_list, p=prob_list)
            mfg_state.apply_action(action)
        elif mfg_state.current_player() == pyspiel.PlayerId.MEAN_FIELD:
            dist_to_register = mfg_state.distribution_support()
            n_states = len(dist_to_register)
            dist = [1.0 / n_states for _ in range(n_states)]
            mfg_state.update_distribution(dist)
        else:
            legal_list = mfg_state.legal_actions()
            action = np.random.choice(legal_list)
            mfg_state.apply_action(action)

    print('compute nashconv')
    uniform_policy = policy.UniformRandomPolicy(mfg_game)
    nash_conv_fp = nash_conv.NashConv(mfg_game, uniform_policy)
    print(nash_conv_fp.nash_conv())

    print('compute distribution')
    mfg_dist = distribution.DistributionPolicy(mfg_game, uniform_policy)
    br_value = best_response_value.BestResponse(mfg_game, mfg_dist)
    py_value = policy_value.PolicyValue(mfg_game, mfg_dist, uniform_policy)
    print(br_value(mfg_game.new_initial_state()))
    print(py_value(mfg_game.new_initial_state()))
    greedy_pi = greedy_policy.GreedyPolicy(mfg_game, None, br_value)
    greedy_pi = greedy_pi.to_tabular()
    pybr_value = policy_value.PolicyValue(mfg_game, mfg_dist, greedy_pi)
    print(pybr_value(mfg_game.new_initial_state()))
    print('merge')
    merged_pi = fictitious_play.MergedPolicy(
        mfg_game, list(range(mfg_game.num_players())),
        [uniform_policy, greedy_pi],
        [mfg_dist,
         distribution.DistributionPolicy(mfg_game, greedy_pi)], [0.5, 0.5])

    merged_pi_value = policy_value.PolicyValue(mfg_game, mfg_dist, merged_pi)
    print(br_value(mfg_game.new_initial_state()))
    print(py_value(mfg_game.new_initial_state()))
    print(merged_pi_value(mfg_game.new_initial_state()))
    print((br_value(mfg_game.new_initial_state()) +
           py_value(mfg_game.new_initial_state())) / 2)
    print('fp')
    fp = fictitious_play.FictitiousPlay(mfg_game)
    for j in range(100):
        print(j)
        fp.iteration()
        fp_policy = fp.get_policy()
        nash_conv_fp = nash_conv.NashConv(mfg_game, fp_policy)
        print(nash_conv_fp.nash_conv())