def iteration(self, rl_br_agent=None, learning_rate=None):
  """Performs a fictitious play iteration, updating the average policy in place.

  Args:
    rl_br_agent: An instance of the RL approximation method to use to compute
      the best response value for each iteration. If none is provided, the
      exact value is computed.
    learning_rate: The learning rate.
  """
  self._fp_step += 1

  distrib = distribution.DistributionPolicy(self._game, self._policy)

  if rl_br_agent:
    joint_avg_policy = rl_agent_policy.RLAgentPolicy(
        self._game, rl_br_agent, rl_br_agent.player_id, use_observation=True)
    br_value = policy_value.PolicyValue(self._game, distrib, joint_avg_policy)
  else:
    br_value = best_response_value.BestResponse(
        self._game, distrib, value.TabularValueFunction(self._game))

  greedy_pi = greedy_policy.GreedyPolicy(self._game, None, br_value)
  greedy_pi = greedy_pi.to_tabular(states=self._states)
  distrib_greedy = distribution.DistributionPolicy(self._game, greedy_pi)

  weight = learning_rate if learning_rate else 1.0 / (self._fp_step + 1)

  self._policy = MergedPolicy(
      self._game, list(range(self._game.num_players())),
      [self._policy, greedy_pi], [distrib, distrib_greedy],
      [1.0 - weight, weight]).to_tabular(states=self._states)
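# A minimal usage sketch for the iteration above, assuming the standard
# open_spiel MFG modules (exact import paths may differ in your tree).
# Without `rl_br_agent`, each call computes an exact best response and mixes
# it into the running average policy with weight 1 / (fp_step + 1).
def example_fictitious_play_run(num_iterations=5):
  import pyspiel
  from open_spiel.python.mfg.algorithms import fictitious_play
  from open_spiel.python.mfg.algorithms import nash_conv

  game = pyspiel.load_game("mfg_crowd_modelling")
  fp = fictitious_play.FictitiousPlay(game)
  for _ in range(num_iterations):
    fp.iteration()  # Pass rl_br_agent=... to use an RL best response instead.
  # Exploitability of the averaged policy; it should shrink with iterations.
  return nash_conv.NashConv(game, fp.get_policy()).nash_conv()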
def test_rl_environment(self, game_name):
  """Check that the RL environment runs for a few trajectories."""
  game = pyspiel.load_game(game_name)
  uniform_policy = policy.UniformRandomPolicy(game)
  mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
  envs = [
      rl_environment.Environment(
          game, mfg_distribution=mfg_dist, mfg_population=p)
      for p in range(game.num_players())
  ]
  for p, env in enumerate(envs):
    for _ in range(FLAGS.rl_env_simulations):
      time_step = env.reset()
      while not time_step.last():
        a = random.choice(time_step.observations['legal_actions'][p])
        time_step = env.step([a])

  env = envs[0]
  self.assertEqual(env.mfg_distribution, mfg_dist)
  # Update the distribution.
  new_mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
  env.update_mfg_distribution(new_mfg_dist)
  self.assertEqual(env.mfg_distribution, new_mfg_dist)
def test_braess_paradox(self):
  """Test that the Braess paradox can be reproduced with the mean field game."""
  mfg_game = pyspiel.load_game("python_mfg_dynamic_routing", {
      "time_step_length": 0.05,
      "max_num_time_step": 100
  })

  class NashEquilibriumBraess(policy.Policy):

    def action_probabilities(self, state, player_id=None):
      legal_actions = state.legal_actions()
      if not legal_actions:
        return {dynamic_routing_utils.NO_POSSIBLE_ACTION: 1.0}
      elif len(legal_actions) == 1:
        return {legal_actions[0]: 1.0}
      else:
        if legal_actions[0] == 2:
          return {2: 0.75, 3: 0.25}
        elif legal_actions[0] == 4:
          return {4: 2 / 3, 5: 1 / 3}
      raise ValueError(f"{legal_actions} is not correct.")

  ne_policy = NashEquilibriumBraess(mfg_game, 1)
  self.assertEqual(
      -policy_value.PolicyValue(
          mfg_game, distribution.DistributionPolicy(mfg_game, ne_policy),
          ne_policy).value(mfg_game.new_initial_state()), 3.75)
  self.assertEqual(nash_conv.NashConv(mfg_game, ne_policy).nash_conv(), 0.0)

  class SocialOptimumBraess(policy.Policy):

    def action_probabilities(self, state, player_id=None):
      legal_actions = state.legal_actions()
      if not legal_actions:
        return {dynamic_routing_utils.NO_POSSIBLE_ACTION: 1.0}
      elif len(legal_actions) == 1:
        return {legal_actions[0]: 1.0}
      else:
        if legal_actions[0] == 2:
          return {2: 0.5, 3: 0.5}
        elif legal_actions[0] == 4:
          return {5: 1.0}
      raise ValueError(f"{legal_actions} is not correct.")

  so_policy = SocialOptimumBraess(mfg_game, 1)
  self.assertEqual(
      -policy_value.PolicyValue(
          mfg_game, distribution.DistributionPolicy(mfg_game, so_policy),
          so_policy).value(mfg_game.new_initial_state()), 3.5)
  self.assertEqual(nash_conv.NashConv(mfg_game, so_policy).nash_conv(), 0.75)
def test_dqn_fp_python_game(self):
  """Checks that fictitious play with a DQN-based value function works."""
  game = crowd_modelling.MFGCrowdModellingGame()
  dfp = fictitious_play.FictitiousPlay(game)

  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  envs = [
      rl_environment.Environment(game, mfg_distribution=dist, mfg_population=p)
      for p in range(game.num_players())
  ]
  dqn_agent = dqn.DQN(
      0,
      state_representation_size=envs[0].observation_spec()["info_state"][0],
      num_actions=envs[0].action_spec()["num_actions"],
      hidden_layers_sizes=[256, 128, 64],
      replay_buffer_capacity=100,
      batch_size=5,
      epsilon_start=0.02,
      epsilon_end=0.01)

  for _ in range(10):
    dfp.iteration(rl_br_agent=dqn_agent)

  dfp_policy = dfp.get_policy()
  nash_conv_dfp = nash_conv.NashConv(game, dfp_policy)

  self.assertAlmostEqual(nash_conv_dfp.nash_conv(), 1.0558451955622807)
def __init__(self, game, policy: policy_std.Policy, root_state=None):
  """Initializes the Nash conv computation.

  Args:
    game: The game to analyze.
    policy: A `policy.Policy` object.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  self._game = game
  self._policy = policy
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._distrib = distribution.DistributionPolicy(
      self._game, self._policy, root_state=root_state)
  self._pi_value = policy_value.PolicyValue(
      self._game,
      self._distrib,
      self._policy,
      value.TabularValueFunction(self._game),
      root_state=root_state)
  self._br_value = best_response_value.BestResponse(
      self._game,
      self._distrib,
      value.TabularValueFunction(self._game),
      root_state=root_state)
def nash_conv(self):
  """Returns the Nash conv."""
  distrib = distribution.DistributionPolicy(self._game, self._policy)
  pi_value = policy_value.PolicyValue(self._game, distrib, self._policy)
  br_value = best_response_value.BestResponse(self._game, distrib)
  return (br_value.eval_state(self._game.new_initial_state()) -
          pi_value.eval_state(self._game.new_initial_state()))
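# Note on the quantity computed above: with mu_pi the distribution induced
# by pi, nash_conv(pi) = max over pi' of J(pi', mu_pi) - J(pi, mu_pi). It is
# non-negative, and it is zero exactly when pi is a best response to its own
# induced distribution, i.e. a mean field Nash equilibrium.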
def __init__(self,
             game,
             state_value: Optional[value.ValueFunction] = None,
             lr=0.01,
             root_state=None):
  """Initializes mirror descent.

  Args:
    game: The game.
    state_value: A state value function. Defaults to `TabularValueFunction`.
    lr: The learning rate of mirror descent.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  self._game = game
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._policy = policy_std.UniformRandomPolicy(game)
  self._distribution = distribution.DistributionPolicy(game, self._policy)
  self._md_step = 0
  self._lr = lr
  self._state_value = (
      state_value if state_value else value.TabularValueFunction(game))
  self._cumulative_state_value = value.TabularValueFunction(game)
def iteration(self):
  """Performs a fictitious play iteration, updating the average policy in place."""
  self._fp_step += 1

  distrib = distribution.DistributionPolicy(self._game, self._policy)
  br_value = best_response_value.BestResponse(self._game, distrib)

  greedy_pi = greedy_policy.GreedyPolicy(self._game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  distrib_greedy = distribution.DistributionPolicy(self._game, greedy_pi)

  # Mix the current average policy and the new greedy best response with
  # weights k/(k+1) and 1/(k+1), keeping the uniform average of all best
  # responses computed so far.
  self._policy = MergedPolicy(
      self._game, list(range(self._game.num_players())),
      [self._policy, greedy_pi], [distrib, distrib_greedy],
      [1.0 * self._fp_step / (self._fp_step + 1),
       1.0 / (self._fp_step + 1)]).to_tabular()
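# A small sanity check of the averaging weights used above (pure arithmetic,
# independent of open_spiel): mixing the previous average with weight
# k / (k + 1) and the new best response with weight 1 / (k + 1) yields the
# uniform average of all k + 1 best responses seen so far.
def example_average_weights(num_steps=5):
  avg = 1.0  # Stands in for the value of the first best response.
  values = [1.0]
  for k in range(1, num_steps):
    new_br = float(k + 1)  # Stands in for the value of the k-th best response.
    values.append(new_br)
    avg = avg * k / (k + 1) + new_br / (k + 1)
  assert abs(avg - sum(values) / len(values)) < 1e-9
  return avg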
def test_state_support_outside_distrib(self):
  """Checks that a distribution can be built when states fall outside the initial support."""
  game = pyspiel.load_game(
      "mfg_crowd_modelling_2d", {
          "initial_distribution": "[0|0]",
          "initial_distribution_value": "[1.]",
      })
  uniform_policy = policy.UniformRandomPolicy(game)
  _ = distribution.DistributionPolicy(game, uniform_policy)
def test_cpp_game(self):
  """Checks that computing the value of a policy works."""
  game = pyspiel.load_game("mfg_crowd_modelling")
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  py_value = policy_value.PolicyValue(game, dist, uniform_policy)
  py_val = py_value(game.new_initial_state())
  self.assertAlmostEqual(py_val, 29.92843602293449)
def test_python_game(self):
  """Checks that computing a best response value works."""
  game = crowd_modelling.MFGCrowdModellingGame()
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(game, dist)
  br_val = br_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, 30.029387484327486)
def test_multi_pop(self):
  game = pyspiel.load_game("python_mfg_predator_prey")
  self.assertEqual(game.num_players(), 3)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  for pop in range(3):
    self.assertAlmostEqual(
        dist.value(game.new_initial_state_for_population(pop)), 1.)
def test_cpp_game(self):
  """Checks that computing a best response value works."""
  game = pyspiel.load_game("mfg_crowd_modelling")
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(game, dist)
  br_val = br_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, 33.09846599803991)
def test_python_game(self):
  """Checks that computing the value of a policy works."""
  game = crowd_modelling.MFGCrowdModellingGame()
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  py_value = policy_value.PolicyValue(game, dist, uniform_policy)
  py_val = py_value(game.new_initial_state())
  self.assertAlmostEqual(py_val, 27.215850929940448)
def test_best_response(self, name):
  """Checks that computing a best response value works."""
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_val = br_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, 30.029387484327486)
def iteration(self):
  """Performs an iteration of Mirror Descent."""
  self._md_step += 1
  self._state_value = collections.defaultdict(float)
  for state in self._root_states:
    self.eval_state(state)
  self._policy = ProjectedPolicy(self._game,
                                 list(range(self._game.num_players())),
                                 self._cumulative_state_value)
  self._distribution = distribution.DistributionPolicy(
      self._game, self._policy)
def test_average(self):
  """Test the average of policies.

  Here we test that the average of values is the value of the average policy.
  """
  game = crowd_modelling.MFGCrowdModellingGame()
  uniform_policy = policy.UniformRandomPolicy(game)
  mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(game, mfg_dist)
  py_value = policy_value.PolicyValue(game, mfg_dist, uniform_policy)
  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  merged_pi = fictitious_play.MergedPolicy(
      game, list(range(game.num_players())),
      [uniform_policy, greedy_pi],
      [mfg_dist, distribution.DistributionPolicy(game, greedy_pi)],
      [0.5, 0.5])
  merged_pi_value = policy_value.PolicyValue(game, mfg_dist, merged_pi)
  self.assertAlmostEqual(
      merged_pi_value(game.new_initial_state()),
      (br_value(game.new_initial_state()) +
       py_value(game.new_initial_state())) / 2)
def iteration(self, learning_rate=None):
  """Performs an iteration of Mirror Descent."""
  self._md_step += 1
  # TODO(sertan): Fix me.
  self._state_value = value.TabularValueFunction(self._game)
  for state in self._root_states:
    self.eval_state(state, learning_rate if learning_rate else self._lr)
  self._policy = ProjectedPolicy(self._game,
                                 list(range(self._game.num_players())),
                                 self._cumulative_state_value)
  self._distribution = distribution.DistributionPolicy(
      self._game, self._policy)
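# A minimal usage sketch for Online Mirror Descent, assuming the open_spiel
# module layout used elsewhere in this file; the constructor matches the
# __init__ shown earlier (game, optional state_value, lr, root_state).
def example_mirror_descent_run(num_iterations=5):
  import pyspiel
  from open_spiel.python.mfg.algorithms import mirror_descent
  from open_spiel.python.mfg.algorithms import nash_conv

  game = pyspiel.load_game("mfg_crowd_modelling")
  md = mirror_descent.MirrorDescent(game, lr=0.01)
  for _ in range(num_iterations):
    md.iteration()
  return nash_conv.NashConv(game, md.get_policy()).nash_conv()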
def mean_field_uniform_policy(mfg_game,
                              number_of_iterations,
                              compute_metrics=False):
  """Returns the uniform policy, and its value if `compute_metrics` is set."""
  del number_of_iterations
  uniform_policy = policy_module.UniformRandomPolicy(mfg_game)
  if compute_metrics:
    distribution_mfg = distribution_module.DistributionPolicy(
        mfg_game, uniform_policy)
    policy_value_ = policy_value.PolicyValue(
        mfg_game, distribution_mfg,
        uniform_policy).value(mfg_game.new_initial_state())
    return uniform_policy, policy_value_
  return uniform_policy
def test_policy_value(self, name):
  """Checks that computing the value of a policy works.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  py_value = policy_value.PolicyValue(game, dist, uniform_policy,
                                      value.TabularValueFunction(game))
  py_val = py_value(game.new_initial_state())
  self.assertAlmostEqual(py_val, 27.215850929940448)
def nash_conv(self):
  """Returns the Nash conv.

  Returns:
    A list of size `game.num_players()` representing the Nash conv for each
    population.
  """
  distrib = distribution.DistributionPolicy(self._game, self._policy)
  pi_value = policy_value.PolicyValue(self._game, distrib, self._policy)
  br_value = best_response_value.BestResponse(self._game, distrib)
  return [
      br_value.eval_state(state) - pi_value.eval_state(state)
      for state in self._game.new_initial_states()
  ]
def test_greedy_cpp(self):
  """Check if the greedy policy works as expected.

  The test checks that a greedy policy with respect to an optimal value is
  an optimal policy.
  """
  game = pyspiel.load_game("mfg_crowd_modelling")
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(game, dist)
  br_val = br_value(game.new_initial_state())

  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(game, dist, greedy_pi)
  pybr_val = pybr_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, pybr_val)
def mean_field_fictitious_play(mfg_game,
                               number_of_iterations,
                               compute_metrics=False):
  """Runs mean field fictitious play and reports timing and metrics."""
  fp = mean_field_fictitious_play_module.FictitiousPlay(mfg_game)
  tick_time = time.time()
  for _ in range(number_of_iterations):
    fp.iteration()
  timing = time.time() - tick_time
  fp_policy = fp.get_policy()
  if compute_metrics:
    distribution_mfg = distribution_module.DistributionPolicy(
        mfg_game, fp_policy)
    policy_value_ = policy_value.PolicyValue(
        mfg_game, distribution_mfg,
        fp_policy).value(mfg_game.new_initial_state())
    nash_conv_fp = nash_conv_module.NashConv(mfg_game, fp_policy)
    return timing, fp_policy, nash_conv_fp, policy_value_
  return timing, fp_policy
def __init__(self, game, lr=0.01, root_state=None):
  """Initializes mirror descent.

  Args:
    game: The game.
    lr: The learning rate of mirror descent.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  self._game = game
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._policy = policy_std.UniformRandomPolicy(game)
  self._distribution = distribution.DistributionPolicy(game, self._policy)
  self._md_step = 0
  self._lr = lr
  self._state_value = collections.defaultdict(float)
  self._cumulative_state_value = collections.defaultdict(float)
def online_mirror_descent_sioux_falls(mfg_game,
                                      number_of_iterations,
                                      md_p=None):
  """Runs online mirror descent, tracking the Nash conv at each iteration."""
  nash_conv_dict = {}
  md = md_p if md_p else mirror_descent.MirrorDescent(mfg_game)
  tick_time = time.time()
  for i in range(number_of_iterations):
    md.iteration()
    md_policy = md.get_policy()
    nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
    nash_conv_dict[i] = nash_conv_md.nash_conv()
    print(f"Iteration {i}, Nash conv: {nash_conv_md.nash_conv()}, "
          f"time: {time.time() - tick_time}")
  timing = time.time() - tick_time
  md_policy = md.get_policy()
  distribution_mfg = distribution_module.DistributionPolicy(
      mfg_game, md_policy)
  policy_value_ = policy_value.PolicyValue(
      mfg_game, distribution_mfg,
      md_policy).value(mfg_game.new_initial_state())
  nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
  return timing, md_policy, nash_conv_md, policy_value_, md, nash_conv_dict
def test_greedy(self, name):
  """Check if the greedy policy works as expected.

  The test checks that a greedy policy with respect to an optimal value is
  an optimal policy.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_val = br_value(game.new_initial_state())

  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(game, dist, greedy_pi,
                                        value.TabularValueFunction(game))
  pybr_val = pybr_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, pybr_val)
def online_mirror_descent(mfg_game,
                          number_of_iterations,
                          compute_metrics=False,
                          return_policy=False,
                          md_p=None):
  """Runs online mirror descent and reports timing and metrics."""
  md = md_p if md_p else mirror_descent.MirrorDescent(mfg_game)
  tick_time = time.time()
  for _ in range(number_of_iterations):
    md.iteration()
  timing = time.time() - tick_time
  md_policy = md.get_policy()
  if compute_metrics:
    distribution_mfg = distribution_module.DistributionPolicy(
        mfg_game, md_policy)
    policy_value_ = policy_value.PolicyValue(
        mfg_game, distribution_mfg,
        md_policy).value(mfg_game.new_initial_state())
    nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
    if return_policy:
      return timing, md_policy, nash_conv_md, policy_value_, md
    return timing, md_policy, nash_conv_md, policy_value_
  return timing, md_policy
def test_evolving_trajectory_with_uniform_policy(self):
  """Tests evolving the distribution under the uniform policy."""
  game = pyspiel.load_game("python_mfg_dynamic_routing")
  distribution.DistributionPolicy(game, policy.UniformRandomPolicy(game))
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name,
                           GAME_SETTINGS.get(FLAGS.game_name, {}))
  uniform_policy = policy.UniformRandomPolicy(game)
  mfg_dist = distribution.DistributionPolicy(game, uniform_policy)

  envs = [
      rl_environment.Environment(
          game, mfg_distribution=mfg_dist, mfg_population=p)
      for p in range(game.num_players())
  ]
  info_state_size = envs[0].observation_spec()["info_state"][0]
  num_actions = envs[0].action_spec()["num_actions"]

  hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
  kwargs = {
      "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
      "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
      "batch_size": FLAGS.batch_size,
      "learn_every": FLAGS.learn_every,
      "learning_rate": FLAGS.rl_learning_rate,
      "optimizer_str": FLAGS.optimizer_str,
      "loss_str": FLAGS.loss_str,
      "update_target_network_every": FLAGS.update_target_network_every,
      "discount_factor": FLAGS.discount_factor,
      "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
      "epsilon_start": FLAGS.epsilon_start,
      "epsilon_end": FLAGS.epsilon_end,
  }

  # pylint: disable=g-complex-comprehension
  agents = [
      dqn.DQN(idx, info_state_size, num_actions, hidden_layers_sizes,
              **kwargs) for idx in range(game.num_players())
  ]
  joint_avg_policy = DQNPolicies(envs, agents)
  if FLAGS.use_checkpoints:
    for agent in agents:
      if agent.has_checkpoint(FLAGS.checkpoint_dir):
        agent.restore(FLAGS.checkpoint_dir)

  for ep in range(FLAGS.num_train_episodes):
    if (ep + 1) % FLAGS.eval_every == 0:
      losses = [agent.loss for agent in agents]
      logging.info("Losses: %s", losses)
      nash_conv_obj = nash_conv.NashConv(game, uniform_policy)
      print(
          str(ep + 1) + " Exact Best Response to Uniform " +
          str(nash_conv_obj.br_values()))
      pi_value = policy_value.PolicyValue(game, mfg_dist, joint_avg_policy)
      print(
          str(ep + 1) + " DQN Best Response to Uniform " + str([
              pi_value.eval_state(state)
              for state in game.new_initial_states()
          ]))
      if FLAGS.use_checkpoints:
        for agent in agents:
          agent.save(FLAGS.checkpoint_dir)
      logging.info("_____________________________________________")

    for p in range(game.num_players()):
      time_step = envs[p].reset()
      while not time_step.last():
        agent_output = agents[p].step(time_step)
        action_list = [agent_output.action]
        time_step = envs[p].step(action_list)

      # Episode is over, step all agents with final info state.
      agents[p].step(time_step)
def test_basic(self):
  game = pyspiel.load_game("python_mfg_crowd_modelling")
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  state = game.new_initial_state().child(0)
  self.assertAlmostEqual(dist.value(state), 1 / game.size)
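# Why 1 / game.size holds above (an assumption about this game's initial
# distribution): in the crowd modelling game the root is a chance node that
# spreads the population uniformly over the game's `size` positions, so each
# child state carries mass 1 / size under the induced distribution.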