Example #1
  def iteration(self, rl_br_agent=None, learning_rate=None):
    """Returns a new `TabularPolicy` equivalent to this policy.

    Args:
      rl_br_agent: An instance of the RL approximation method to use to compute
        the best response value for each iteration. If none provided, the exact
        value is computed.
      learning_rate: The learning rate.
    """
    self._fp_step += 1

    distrib = distribution.DistributionPolicy(self._game, self._policy)

    if rl_br_agent:
      joint_avg_policy = rl_agent_policy.RLAgentPolicy(
          self._game, rl_br_agent, rl_br_agent.player_id, use_observation=True)
      br_value = policy_value.PolicyValue(self._game, distrib, joint_avg_policy)
    else:
      br_value = best_response_value.BestResponse(
          self._game, distrib, value.TabularValueFunction(self._game))

    greedy_pi = greedy_policy.GreedyPolicy(self._game, None, br_value)
    greedy_pi = greedy_pi.to_tabular(states=self._states)
    distrib_greedy = distribution.DistributionPolicy(self._game, greedy_pi)

    weight = learning_rate if learning_rate else 1.0 / (self._fp_step + 1)

    self._policy = MergedPolicy(
        self._game, list(range(self._game.num_players())),
        [self._policy, greedy_pi], [distrib, distrib_greedy],
        [1.0 - weight, weight]).to_tabular(states=self._states)
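
A minimal driving loop for the iteration above, mirroring the tests further down but with the exact best response (no rl_br_agent). The import paths are assumed to follow the usual open_spiel.python.mfg layout:

from open_spiel.python.mfg.algorithms import fictitious_play
from open_spiel.python.mfg.algorithms import nash_conv
from open_spiel.python.mfg.games import crowd_modelling

game = crowd_modelling.MFGCrowdModellingGame()
fp = fictitious_play.FictitiousPlay(game)
for _ in range(10):
  fp.iteration()  # Exact best response, since no rl_br_agent is passed.
fp_policy = fp.get_policy()
exploitability = nash_conv.NashConv(game, fp_policy).nash_conv()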
Example #2
  def test_rl_environment(self, game_name):
    """Check that the RL environment runs for a few trajectories."""
    game = pyspiel.load_game(game_name)
    uniform_policy = policy.UniformRandomPolicy(game)
    mfg_dist = distribution.DistributionPolicy(game, uniform_policy)

    envs = [
        rl_environment.Environment(
            game, mfg_distribution=mfg_dist, mfg_population=p)
        for p in range(game.num_players())
    ]
    for p, env in enumerate(envs):
      for _ in range(FLAGS.rl_env_simulations):
        time_step = env.reset()
        while not time_step.last():
          a = random.choice(time_step.observations['legal_actions'][p])
          time_step = env.step([a])

    env = envs[0]
    self.assertEqual(env.mfg_distribution, mfg_dist)

    # Update the distribution.
    new_mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
    env.update_mfg_distribution(new_mfg_dist)
    self.assertEqual(env.mfg_distribution, new_mfg_dist)
Example #3
    def test_braess_paradox(self):
        """Test that Braess paradox can be reproduced with the mean field game."""
        mfg_game = pyspiel.load_game("python_mfg_dynamic_routing", {
            "time_step_length": 0.05,
            "max_num_time_step": 100
        })

        class NashEquilibriumBraess(policy.Policy):
            def action_probabilities(self, state, player_id=None):
                legal_actions = state.legal_actions()
                if not legal_actions:
                    return {dynamic_routing_utils.NO_POSSIBLE_ACTION: 1.0}
                elif len(legal_actions) == 1:
                    return {legal_actions[0]: 1.0}
                else:
                    if legal_actions[0] == 2:
                        return {2: 0.75, 3: 0.25}
                    elif legal_actions[0] == 4:
                        return {4: 2 / 3, 5: 1 / 3}
                raise ValueError(f"{legal_actions} is not correct.")

        ne_policy = NashEquilibriumBraess(mfg_game, 1)
        self.assertEqual(
            -policy_value.PolicyValue(
                mfg_game, distribution.DistributionPolicy(mfg_game, ne_policy),
                ne_policy).value(mfg_game.new_initial_state()), 3.75)
        self.assertEqual(
            nash_conv.NashConv(mfg_game, ne_policy).nash_conv(), 0.0)

        class SocialOptimumBraess(policy.Policy):
            def action_probabilities(self, state, player_id=None):
                legal_actions = state.legal_actions()
                if not legal_actions:
                    return {dynamic_routing_utils.NO_POSSIBLE_ACTION: 1.0}
                elif len(legal_actions) == 1:
                    return {legal_actions[0]: 1.0}
                else:
                    if legal_actions[0] == 2:
                        return {2: 0.5, 3: 0.5}
                    elif legal_actions[0] == 4:
                        return {5: 1.0}
                raise ValueError(f"{legal_actions} is not correct.")

        so_policy = SocialOptimumBraess(mfg_game, 1)
        self.assertEqual(
            -policy_value.PolicyValue(
                mfg_game, distribution.DistributionPolicy(mfg_game, so_policy),
                so_policy).value(mfg_game.new_initial_state()), 3.5)
        self.assertEqual(
            nash_conv.NashConv(mfg_game, so_policy).nash_conv(), 0.75)
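
The evaluate-then-check pattern used twice above recurs throughout this listing; a small sketch of a helper that bundles it (the name evaluate_policy is hypothetical, the module aliases are those used in the surrounding examples):

def evaluate_policy(mfg_game, pi):
  """Returns the expected return of `pi` and its exploitability (NashConv)."""
  mu = distribution.DistributionPolicy(mfg_game, pi)
  ret = policy_value.PolicyValue(
      mfg_game, mu, pi).value(mfg_game.new_initial_state())
  # Depending on the version, nash_conv() returns a scalar or one value per
  # population (compare Example #6 and Example #21).
  expl = nash_conv.NashConv(mfg_game, pi).nash_conv()
  return ret, expl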
Example #4
    def test_dqn_fp_python_game(self):
        """Checks if fictitious play with DQN-based value function works."""
        game = crowd_modelling.MFGCrowdModellingGame()
        dfp = fictitious_play.FictitiousPlay(game)

        uniform_policy = policy.UniformRandomPolicy(game)
        dist = distribution.DistributionPolicy(game, uniform_policy)
        envs = [
            rl_environment.Environment(game,
                                       mfg_distribution=dist,
                                       mfg_population=p)
            for p in range(game.num_players())
        ]
        dqn_agent = dqn.DQN(
            0,
            state_representation_size=envs[0].observation_spec()["info_state"]
            [0],
            num_actions=envs[0].action_spec()["num_actions"],
            hidden_layers_sizes=[256, 128, 64],
            replay_buffer_capacity=100,
            batch_size=5,
            epsilon_start=0.02,
            epsilon_end=0.01)

        for _ in range(10):
            dfp.iteration(rl_br_agent=dqn_agent)

        dfp_policy = dfp.get_policy()
        nash_conv_dfp = nash_conv.NashConv(game, dfp_policy)

        self.assertAlmostEqual(nash_conv_dfp.nash_conv(), 1.0558451955622807)
Example #5
    def __init__(self, game, policy: policy_std.Policy, root_state=None):
        """Initializes the nash conv.

        Args:
          game: The game to analyze.
          policy: A `policy.Policy` object.
          root_state: The state of the game at which to start. If `None`, the
            game root state is used.
        """
        self._game = game
        self._policy = policy
        if root_state is None:
            self._root_states = game.new_initial_states()
        else:
            self._root_states = [root_state]
        self._distrib = distribution.DistributionPolicy(self._game,
                                                        self._policy,
                                                        root_state=root_state)
        self._pi_value = policy_value.PolicyValue(self._game,
                                                  self._distrib,
                                                  self._policy,
                                                  value.TabularValueFunction(
                                                      self._game),
                                                  root_state=root_state)
        self._br_value = best_response_value.BestResponse(
            self._game,
            self._distrib,
            value.TabularValueFunction(self._game),
            root_state=root_state)
Example #6
 def nash_conv(self):
     """Returns the nash conv."""
     distrib = distribution.DistributionPolicy(self._game, self._policy)
     pi_value = policy_value.PolicyValue(self._game, distrib, self._policy)
     br_value = best_response_value.BestResponse(self._game, distrib)
     return (br_value.eval_state(self._game.new_initial_state()) -
             pi_value.eval_state(self._game.new_initial_state()))
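
In symbols, with mu_pi the mean field induced by pi and s0 the initial state, the quantity returned above is the exploitability

    NashConv(pi) = V_{BR(mu_pi)}(s0) - V_{pi, mu_pi}(s0)

i.e. how much a single deviating player can gain by best responding to the population distribution induced by pi.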
Example #7
    def __init__(self,
                 game,
                 state_value: Optional[value.ValueFunction] = None,
                 lr=0.01,
                 root_state=None):
        """Initializes mirror descent.

        Args:
          game: The game.
          state_value: A state value function. Defaults to
            TabularValueFunction.
          lr: The learning rate of mirror descent.
          root_state: The state of the game at which to start. If `None`, the
            game root state is used.
        """
        self._game = game
        if root_state is None:
            self._root_states = game.new_initial_states()
        else:
            self._root_states = [root_state]
        self._policy = policy_std.UniformRandomPolicy(game)
        self._distribution = distribution.DistributionPolicy(
            game, self._policy)
        self._md_step = 0
        self._lr = lr

        self._state_value = (state_value if state_value else
                             value.TabularValueFunction(game))
        self._cumulative_state_value = value.TabularValueFunction(game)
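
A short usage sketch of this constructor; the explicit TabularValueFunction and lr=0.05 are arbitrary illustrative choices, and iteration()/get_policy() are used as in the mirror descent examples further down (module aliases as in the surrounding examples):

game = pyspiel.load_game("mfg_crowd_modelling")
md = mirror_descent.MirrorDescent(
    game, value.TabularValueFunction(game), lr=0.05)
for _ in range(5):
  md.iteration()
md_policy = md.get_policy()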
Example #8
  def iteration(self):
    """Returns a new `TabularPolicy` equivalent to this policy."""
    self._fp_step += 1

    distrib = distribution.DistributionPolicy(self._game, self._policy)
    br_value = best_response_value.BestResponse(self._game, distrib)

    greedy_pi = greedy_policy.GreedyPolicy(self._game, None, br_value)
    greedy_pi = greedy_pi.to_tabular()
    distrib_greedy = distribution.DistributionPolicy(self._game, greedy_pi)

    self._policy = MergedPolicy(
        self._game, list(range(self._game.num_players())),
        [self._policy, greedy_pi], [distrib, distrib_greedy],
        [1.0*self._fp_step/(self._fp_step+1), 1.0/(self._fp_step+1)]
        ).to_tabular()
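
With k = self._fp_step after the increment, these weights implement the classical uniform fictitious play average, each component being additionally weighted state-wise by its own induced distribution (which is presumably why distrib and distrib_greedy are passed to MergedPolicy):

    pi_{k+1} = k/(k+1) * pi_k + 1/(k+1) * BR(mu_{pi_k})

This matches the default weight 1.0 / (self._fp_step + 1) used in Example #1.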
Example #9
 def test_state_support_outside_distrib(self):
     game = pyspiel.load_game(
         "mfg_crowd_modelling_2d", {
             "initial_distribution": "[0|0]",
             "initial_distribution_value": "[1.]",
         })
     uniform_policy = policy.UniformRandomPolicy(game)
     _ = distribution.DistributionPolicy(game, uniform_policy)
Example #10
 def test_cpp_game(self):
     """Checks if the value of a policy computation works."""
     game = pyspiel.load_game("mfg_crowd_modelling")
     uniform_policy = policy.UniformRandomPolicy(game)
     dist = distribution.DistributionPolicy(game, uniform_policy)
     py_value = policy_value.PolicyValue(game, dist, uniform_policy)
     py_val = py_value(game.new_initial_state())
     self.assertAlmostEqual(py_val, 29.92843602293449)
Example #11
 def test_python_game(self):
     """Checks if the value of a policy computation works."""
     game = crowd_modelling.MFGCrowdModellingGame()
     uniform_policy = policy.UniformRandomPolicy(game)
     dist = distribution.DistributionPolicy(game, uniform_policy)
     br_value = best_response_value.BestResponse(game, dist)
     br_val = br_value(game.new_initial_state())
     self.assertAlmostEqual(br_val, 30.029387484327486)
Example #12
 def test_multi_pop(self):
     game = pyspiel.load_game("python_mfg_predator_prey")
     self.assertEqual(game.num_players(), 3)
     uniform_policy = policy.UniformRandomPolicy(game)
     dist = distribution.DistributionPolicy(game, uniform_policy)
     for pop in range(3):
         self.assertAlmostEqual(
             dist.value(game.new_initial_state_for_population(pop)), 1.)
Example #13
 def test_cpp_game(self):
     """Checks if the value of a policy computation works."""
     game = pyspiel.load_game("mfg_crowd_modelling")
     uniform_policy = policy.UniformRandomPolicy(game)
     dist = distribution.DistributionPolicy(game, uniform_policy)
     br_value = best_response_value.BestResponse(game, dist)
     br_val = br_value(game.new_initial_state())
     self.assertAlmostEqual(br_val, 33.09846599803991)
Example #14
 def test_python_game(self):
   """Checks if the value of a policy computation works."""
   game = crowd_modelling.MFGCrowdModellingGame()
   uniform_policy = policy.UniformRandomPolicy(game)
   dist = distribution.DistributionPolicy(game, uniform_policy)
   py_value = policy_value.PolicyValue(game, dist, uniform_policy)
   py_val = py_value(game.new_initial_state())
   self.assertAlmostEqual(py_val, 27.215850929940448)
Example #15
 def test_best_response(self, name):
   """Checks if the value of a policy computation works."""
   game = pyspiel.load_game(name)
   uniform_policy = policy.UniformRandomPolicy(game)
   dist = distribution.DistributionPolicy(game, uniform_policy)
   br_value = best_response_value.BestResponse(
       game, dist, value.TabularValueFunction(game))
   br_val = br_value(game.new_initial_state())
   self.assertAlmostEqual(br_val, 30.029387484327486)
Example #16
 def iteration(self):
   """an iteration of Mirror Descent."""
   self._md_step += 1
   self._state_value = collections.defaultdict(float)
   for state in self._root_states:
     self.eval_state(state)
   self._policy = ProjectedPolicy(self._game,
                                  list(range(self._game.num_players())),
                                  self._cumulative_state_value)
   self._distribution = distribution.DistributionPolicy(
       self._game, self._policy)
Example #17
    def test_average(self):
        """Test the average of policies.

        Here we test that the average of values is the value of the average
        policy.
        """
        game = crowd_modelling.MFGCrowdModellingGame()
        uniform_policy = policy.UniformRandomPolicy(game)
        mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
        br_value = best_response_value.BestResponse(game, mfg_dist)
        py_value = policy_value.PolicyValue(game, mfg_dist, uniform_policy)
        greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
        greedy_pi = greedy_pi.to_tabular()
        merged_pi = fictitious_play.MergedPolicy(
            game, list(range(game.num_players())), [uniform_policy, greedy_pi],
            [mfg_dist,
             distribution.DistributionPolicy(game, greedy_pi)], [0.5, 0.5])
        merged_pi_value = policy_value.PolicyValue(game, mfg_dist, merged_pi)

        self.assertAlmostEqual(merged_pi_value(game.new_initial_state()),
                               (br_value(game.new_initial_state()) +
                                py_value(game.new_initial_state())) / 2)
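
The asserted equality is not generic (values are not linear in the policy in general); it relies on MergedPolicy mixing the two policies weighted by their own distributions, so that, with the mean field held fixed at mfg_dist, the induced state-action occupancy is the 50/50 mixture and hence

    V_{merged, mu}(s0) = 0.5 * V_{uniform, mu}(s0) + 0.5 * V_{greedy, mu}(s0)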
Example #18
 def iteration(self, learning_rate=None):
     """an iteration of Mirror Descent."""
     self._md_step += 1
     # TODO(sertan): Fix me.
     self._state_value = value.TabularValueFunction(self._game)
     for state in self._root_states:
         self.eval_state(state,
                         learning_rate if learning_rate else self._lr)
     self._policy = ProjectedPolicy(self._game,
                                    list(range(self._game.num_players())),
                                    self._cumulative_state_value)
     self._distribution = distribution.DistributionPolicy(
         self._game, self._policy)
Example #19
def mean_field_uniform_policy(mfg_game,
                              number_of_iterations,
                              compute_metrics=False):
    del number_of_iterations
    uniform_policy = policy_module.UniformRandomPolicy(mfg_game)
    if compute_metrics:
        distribution_mfg = distribution_module.DistributionPolicy(
            mfg_game, uniform_policy)
        policy_value_ = policy_value.PolicyValue(
            mfg_game, distribution_mfg,
            uniform_policy).value(mfg_game.new_initial_state())
        return uniform_policy, policy_value_
    return uniform_policy
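
A sketch of how this baseline might be called with metrics enabled (number_of_iterations is ignored, as the del shows; the game name follows Example #30):

game = pyspiel.load_game("python_mfg_crowd_modelling")
pi, pi_return = mean_field_uniform_policy(
    game, number_of_iterations=0, compute_metrics=True)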
Example #20
    def test_policy_value(self, name):
        """Checks if the value of a policy computation works.

        Args:
          name: Name of the game.
        """
        game = pyspiel.load_game(name)
        uniform_policy = policy.UniformRandomPolicy(game)
        dist = distribution.DistributionPolicy(game, uniform_policy)
        py_value = policy_value.PolicyValue(game, dist, uniform_policy,
                                            value.TabularValueFunction(game))
        py_val = py_value(game.new_initial_state())
        self.assertAlmostEqual(py_val, 27.215850929940448)
Example #21
    def nash_conv(self):
        """Returns the nash conv.

        Returns:
          A list of size `game.num_players()` representing the nash conv for
          each population.
        """
        distrib = distribution.DistributionPolicy(self._game, self._policy)
        pi_value = policy_value.PolicyValue(self._game, distrib, self._policy)
        br_value = best_response_value.BestResponse(self._game, distrib)
        return [
            br_value.eval_state(state) - pi_value.eval_state(state)
            for state in self._game.new_initial_states()
        ]
Example #22
    def test_greedy_cpp(self):
        """Check if the greedy policy works as expected.

        The test checks that a greedy policy with respect to an optimal value
        is an optimal policy.
        """
        game = pyspiel.load_game("mfg_crowd_modelling")
        uniform_policy = policy.UniformRandomPolicy(game)
        dist = distribution.DistributionPolicy(game, uniform_policy)
        br_value = best_response_value.BestResponse(game, dist)
        br_val = br_value(game.new_initial_state())

        greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
        greedy_pi = greedy_pi.to_tabular()
        pybr_value = policy_value.PolicyValue(game, dist, greedy_pi)
        pybr_val = pybr_value(game.new_initial_state())
        self.assertAlmostEqual(br_val, pybr_val)
Example #23
def mean_field_fictitious_play(mfg_game,
                               number_of_iterations,
                               compute_metrics=False):
    fp = mean_field_fictitious_play_module.FictitiousPlay(mfg_game)
    tick_time = time.time()
    for _ in range(number_of_iterations):
        fp.iteration()
    timing = time.time() - tick_time
    fp_policy = fp.get_policy()
    # print('learning done')
    if compute_metrics:
        distribution_mfg = distribution_module.DistributionPolicy(
            mfg_game, fp_policy)
        # print('distribution done')
        policy_value_ = policy_value.PolicyValue(
            mfg_game, distribution_mfg,
            fp_policy).value(mfg_game.new_initial_state())
        nash_conv_fp = nash_conv_module.NashConv(mfg_game, fp_policy)
        return timing, fp_policy, nash_conv_fp, policy_value_
    return timing, fp_policy
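
A sketch of a typical call, unpacking the metrics as returned above:

game = pyspiel.load_game("mfg_crowd_modelling")
timing, fp_policy, nash_conv_fp, fp_return = mean_field_fictitious_play(
    game, number_of_iterations=100, compute_metrics=True)
exploitability = nash_conv_fp.nash_conv()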
Example #24
  def __init__(self, game, lr=0.01, root_state=None):
    """Initializes mirror descent.

    Args:
      game: The game.
      lr: The learning rate of mirror descent.
      root_state: The state of the game at which to start. If `None`, the game
        root state is used.
    """
    self._game = game
    if root_state is None:
      self._root_states = game.new_initial_states()
    else:
      self._root_states = [root_state]
    self._policy = policy_std.UniformRandomPolicy(game)
    self._distribution = distribution.DistributionPolicy(game, self._policy)
    self._md_step = 0
    self._lr = lr

    self._state_value = collections.defaultdict(float)
    self._cumulative_state_value = collections.defaultdict(float)
Example #25
def online_mirror_descent_sioux_falls(mfg_game,
                                      number_of_iterations,
                                      md_p=None):
    nash_conv_dict = {}
    md = md_p if md_p else mirror_descent.MirrorDescent(mfg_game)
    tick_time = time.time()
    for i in range(number_of_iterations):
        md.iteration()
        md_policy = md.get_policy()
        nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
        nash_conv_dict[i] = nash_conv_md.nash_conv()
        print((f"Iteration {i}, Nash conv: {nash_conv_md.nash_conv()}, "
               "time: {time.time() - tick_time}"))
    timing = time.time() - tick_time
    md_policy = md.get_policy()
    distribution_mfg = distribution_module.DistributionPolicy(
        mfg_game, md_policy)
    policy_value_ = policy_value.PolicyValue(mfg_game, distribution_mfg,
                                             md_policy).value(
                                                 mfg_game.new_initial_state())
    nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
    return timing, md_policy, nash_conv_md, policy_value_, md, nash_conv_dict
Example #26
    def test_greedy(self, name):
        """Check if the greedy policy works as expected.

        The test checks that a greedy policy with respect to an optimal value
        is an optimal policy.

        Args:
          name: Name of the game.
        """
        game = pyspiel.load_game(name)
        uniform_policy = policy.UniformRandomPolicy(game)
        dist = distribution.DistributionPolicy(game, uniform_policy)
        br_value = best_response_value.BestResponse(
            game, dist, value.TabularValueFunction(game))
        br_val = br_value(game.new_initial_state())

        greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
        greedy_pi = greedy_pi.to_tabular()
        pybr_value = policy_value.PolicyValue(game, dist, greedy_pi,
                                              value.TabularValueFunction(game))
        pybr_val = pybr_value(game.new_initial_state())
        self.assertAlmostEqual(br_val, pybr_val)
Example #27
def online_mirror_descent(mfg_game,
                          number_of_iterations,
                          compute_metrics=False,
                          return_policy=False,
                          md_p=None):
    md = md_p if md_p else mirror_descent.MirrorDescent(mfg_game)
    tick_time = time.time()
    for _ in range(number_of_iterations):
        md.iteration()
    timing = time.time() - tick_time
    md_policy = md.get_policy()
    if compute_metrics:
        distribution_mfg = distribution_module.DistributionPolicy(
            mfg_game, md_policy)
        # print('distribution done')
        policy_value_ = policy_value.PolicyValue(
            mfg_game, distribution_mfg,
            md_policy).value(mfg_game.new_initial_state())
        nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
        if return_policy:
            return timing, md_policy, nash_conv_md, policy_value_, md
        return timing, md_policy, nash_conv_md, policy_value_
    return timing, md_policy
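
Because the solver itself is returned when return_policy=True, a long run can be split and resumed through md_p; a hedged sketch:

game = pyspiel.load_game("mfg_crowd_modelling")
# First 50 iterations; keep the solver object.
_, md_policy, nc_md, val_md, md = online_mirror_descent(
    game, 50, compute_metrics=True, return_policy=True)
# Resume the same solver for 50 more iterations.
_, md_policy = online_mirror_descent(game, 50, md_p=md)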
Example #28
 def test_evolving_trajectory_with_uniform_policy(self):
     """Test evolving distribution."""
     game = pyspiel.load_game("python_mfg_dynamic_routing")
     distribution.DistributionPolicy(game, policy.UniformRandomPolicy(game))
Example #29
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name,
                             GAME_SETTINGS.get(FLAGS.game_name, {}))
    uniform_policy = policy.UniformRandomPolicy(game)
    mfg_dist = distribution.DistributionPolicy(game, uniform_policy)

    envs = [
        rl_environment.Environment(game,
                                   mfg_distribution=mfg_dist,
                                   mfg_population=p)
        for p in range(game.num_players())
    ]
    info_state_size = envs[0].observation_spec()["info_state"][0]
    num_actions = envs[0].action_spec()["num_actions"]

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
        "batch_size": FLAGS.batch_size,
        "learn_every": FLAGS.learn_every,
        "learning_rate": FLAGS.rl_learning_rate,
        "optimizer_str": FLAGS.optimizer_str,
        "loss_str": FLAGS.loss_str,
        "update_target_network_every": FLAGS.update_target_network_every,
        "discount_factor": FLAGS.discount_factor,
        "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
        "epsilon_start": FLAGS.epsilon_start,
        "epsilon_end": FLAGS.epsilon_end,
    }

    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(idx, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs) for idx in range(game.num_players())
    ]
    joint_avg_policy = DQNPolicies(envs, agents)
    if FLAGS.use_checkpoints:
        for agent in agents:
            if agent.has_checkpoint(FLAGS.checkpoint_dir):
                agent.restore(FLAGS.checkpoint_dir)

    for ep in range(FLAGS.num_train_episodes):
        if (ep + 1) % FLAGS.eval_every == 0:
            losses = [agent.loss for agent in agents]
            logging.info("Losses: %s", losses)
            nash_conv_obj = nash_conv.NashConv(game, uniform_policy)
            print(
                str(ep + 1) + " Exact Best Response to Uniform " +
                str(nash_conv_obj.br_values()))
            pi_value = policy_value.PolicyValue(game, mfg_dist,
                                                joint_avg_policy)
            print(
                str(ep + 1) + " DQN Best Response to Uniform " + str([
                    pi_value.eval_state(state)
                    for state in game.new_initial_states()
                ]))
            if FLAGS.use_checkpoints:
                for agent in agents:
                    agent.save(FLAGS.checkpoint_dir)
            logging.info("_____________________________________________")

        for p in range(game.num_players()):
            time_step = envs[p].reset()
            while not time_step.last():
                agent_output = agents[p].step(time_step)
                action_list = [agent_output.action]
                time_step = envs[p].step(action_list)

            # Episode is over, step all agents with final info state.
            agents[p].step(time_step)
Example #30
 def test_basic(self):
     game = pyspiel.load_game("python_mfg_crowd_modelling")
     uniform_policy = policy.UniformRandomPolicy(game)
     dist = distribution.DistributionPolicy(game, uniform_policy)
     state = game.new_initial_state().child(0)
     self.assertAlmostEqual(dist.value(state), 1 / game.size)
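
Since dist.value(state) is the mass the induced mean field puts on a state, a natural follow-up check (a sketch reusing game and dist from the test above, and assuming the chance outcomes at the root are exactly the actions enumerated by legal_actions, as .child(0) suggests) is that the mass over the root's children sums to one:

root = game.new_initial_state()
total = sum(dist.value(root.child(a)) for a in root.legal_actions())
# Expected to be close to 1.0.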