def test_online_mirror_descent(self):
  """Test that online mirror descent can be used on this game."""
  mfg_game = pyspiel.load_game("python_mfg_dynamic_routing")
  omd = mirror_descent.MirrorDescent(mfg_game)
  for _ in range(_NUMBER_OF_ITERATIONS_TESTS):
    omd.iteration()
  nash_conv.NashConv(mfg_game, omd.get_policy())
def test_md_cpp_game(self):
  """Checks that mirror descent works on the C++ crowd modelling game."""
  game = pyspiel.load_game("mfg_crowd_modelling")
  md = mirror_descent.MirrorDescent(game)
  for _ in range(10):
    md.iteration()
  md_policy = md.get_policy()
  nash_conv_md = nash_conv.NashConv(game, md_policy)
  self.assertAlmostEqual(nash_conv_md.nash_conv(), 2.2730324915546056)
def test_md_python_game(self):
  """Checks that mirror descent works on the Python crowd modelling game."""
  game = crowd_modelling.MFGCrowdModellingGame()
  md = mirror_descent.MirrorDescent(game)
  for _ in range(10):
    md.iteration()
  md_policy = md.get_policy()
  nash_conv_md = nash_conv.NashConv(game, md_policy)
  self.assertAlmostEqual(nash_conv_md.nash_conv(), 2.2730324915546056)
def test_md(self, name):
  """Checks that mirror descent works on the game given by `name`."""
  game = pyspiel.load_game(name)
  md = mirror_descent.MirrorDescent(game, value.TabularValueFunction(game))
  for _ in range(10):
    md.iteration()
  md_policy = md.get_policy()
  nash_conv_md = nash_conv.NashConv(game, md_policy)
  self.assertAlmostEqual(nash_conv_md.nash_conv(), 2.2730324915546056)
def test_online_mirror_descent_convergence(self):
  """Tests that online mirror descent converges to a Nash equilibrium."""
  mfg_game = pyspiel.load_game("python_mfg_dynamic_routing", {
      "time_step_length": 0.05,
      "max_num_time_step": 100
  })
  omd = mirror_descent.MirrorDescent(mfg_game, lr=1)
  for _ in range(50):
    omd.iteration()
  self.assertAlmostEqual(
      nash_conv.NashConv(mfg_game, omd.get_policy()).nash_conv(), 0)
def test_learning_and_applying_mfg_policy_in_n_player_game(self):
  """Tests converting a learnt MFG policy to an N-player game policy."""
  # Learn the Braess MFG Nash equilibrium.
  mfg_game = pyspiel.load_game("python_mfg_dynamic_routing")
  omd = mirror_descent.MirrorDescent(mfg_game, lr=1)
  for _ in range(10):
    omd.iteration()
  mfg_policy = omd.get_policy()
  # Convert the mean field policy to an N-player policy and evaluate it.
  n_player_game = pyspiel.load_game("python_dynamic_routing")
  mfg_derived_policy = (
      dynamic_routing_to_mean_field_game.
      DerivedNPlayerPolicyFromMeanFieldPolicy(n_player_game, mfg_policy))
  expected_game_score.policy_value(n_player_game.new_initial_state(),
                                   mfg_derived_policy)
def online_mirror_descent_sioux_falls(mfg_game, number_of_iterations,
                                      md_p=None):
  """Runs online mirror descent, recording the NashConv at each iteration."""
  nash_conv_dict = {}
  md = md_p if md_p else mirror_descent.MirrorDescent(mfg_game)
  tick_time = time.time()
  for i in range(number_of_iterations):
    md.iteration()
    md_policy = md.get_policy()
    nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
    nash_conv_dict[i] = nash_conv_md.nash_conv()
    print(f"Iteration {i}, Nash conv: {nash_conv_md.nash_conv()}, "
          f"time: {time.time() - tick_time}")
  timing = time.time() - tick_time
  md_policy = md.get_policy()
  distribution_mfg = distribution_module.DistributionPolicy(
      mfg_game, md_policy)
  policy_value_ = policy_value.PolicyValue(
      mfg_game, distribution_mfg, md_policy).value(
          mfg_game.new_initial_state())
  nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
  return timing, md_policy, nash_conv_md, policy_value_, md, nash_conv_dict
def online_mirror_descent(mfg_game,
                          number_of_iterations,
                          compute_metrics=False,
                          return_policy=False,
                          md_p=None):
  """Runs online mirror descent for the given number of iterations."""
  md = md_p if md_p else mirror_descent.MirrorDescent(mfg_game)
  tick_time = time.time()
  for _ in range(number_of_iterations):
    md.iteration()
  timing = time.time() - tick_time
  md_policy = md.get_policy()
  if compute_metrics:
    distribution_mfg = distribution_module.DistributionPolicy(
        mfg_game, md_policy)
    policy_value_ = policy_value.PolicyValue(
        mfg_game, distribution_mfg, md_policy).value(
            mfg_game.new_initial_state())
    nash_conv_md = nash_conv_module.NashConv(mfg_game, md_policy)
    if return_policy:
      return timing, md_policy, nash_conv_md, policy_value_, md
    return timing, md_policy, nash_conv_md, policy_value_
  return timing, md_policy
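# Example usage (sketch): run the online_mirror_descent helper above on the
# crowd modelling MFG used in the tests and print the resulting metrics. The
# helper name and iteration count below are illustrative, not part of any API.
def example_online_mirror_descent_run():
  game = pyspiel.load_game("mfg_crowd_modelling")
  num_iterations = 10
  timing, _, nash_conv_md, policy_value_ = online_mirror_descent(
      game, num_iterations, compute_metrics=True)
  print(f"Ran {num_iterations} OMD iterations in {timing:.2f}s, "
        f"NashConv: {nash_conv_md.nash_conv()}, "
        f"value at the initial state: {policy_value_}")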
def main(argv: Sequence[str]) -> None:
  """Runs MFG algorithms (NashConv, best response, FP, MD) on FLAGS.game."""
  # TODO(perolat): move to an example directory.
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  mfg_game = pyspiel.load_game(FLAGS.game, GAME_SETTINGS.get(FLAGS.game, {}))
  mfg_state = mfg_game.new_initial_state()
  print('Playing a single arbitrary trajectory')
  while not mfg_state.is_terminal():
    print('State obs string:', mfg_state.observation_string(0))
    if mfg_state.current_player() == pyspiel.PlayerId.CHANCE:
      action_list, prob_list = zip(*mfg_state.chance_outcomes())
      action = np.random.choice(action_list, p=prob_list)
      mfg_state.apply_action(action)
    elif mfg_state.current_player() == pyspiel.PlayerId.MEAN_FIELD:
      dist_to_register = mfg_state.distribution_support()
      n_states = len(dist_to_register)
      dist = [1.0 / n_states for _ in range(n_states)]
      mfg_state.update_distribution(dist)
    else:
      legal_list = mfg_state.legal_actions()
      action = np.random.choice(legal_list)
      mfg_state.apply_action(action)

  print('compute nashconv')
  uniform_policy = policy.UniformRandomPolicy(mfg_game)
  nash_conv_fp = nash_conv.NashConv(mfg_game, uniform_policy)
  print('Nashconv:', nash_conv_fp.nash_conv())

  print('compute distribution')
  mfg_dist = distribution.DistributionPolicy(mfg_game, uniform_policy)
  br_value = best_response_value.BestResponse(
      mfg_game, mfg_dist, value.TabularValueFunction(mfg_game))
  py_value = policy_value.PolicyValue(mfg_game, mfg_dist, uniform_policy,
                                      value.TabularValueFunction(mfg_game))
  print(
      'Value of a best response policy to a uniform policy '
      '(computed with best_response_value)',
      br_value(mfg_game.new_initial_state()))
  print('Value of the uniform policy:',
        py_value(mfg_game.new_initial_state()))
  greedy_pi = greedy_policy.GreedyPolicy(mfg_game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(mfg_game, mfg_dist, greedy_pi,
                                        value.TabularValueFunction(mfg_game))
  print(
      'Value of a best response policy to a uniform policy (computed at the '
      'value of the greedy policy of the best response value)',
      pybr_value(mfg_game.new_initial_state()))

  print('merge')
  merged_pi = fictitious_play.MergedPolicy(
      mfg_game, list(range(mfg_game.num_players())),
      [uniform_policy, greedy_pi],
      [mfg_dist, distribution.DistributionPolicy(mfg_game, greedy_pi)],
      [0.5, 0.5])
  merged_pi_value = policy_value.PolicyValue(
      mfg_game, mfg_dist, merged_pi, value.TabularValueFunction(mfg_game))
  print(br_value(mfg_game.new_initial_state()))
  print(py_value(mfg_game.new_initial_state()))
  print(merged_pi_value(mfg_game.new_initial_state()))
  print((br_value(mfg_game.new_initial_state()) +
         py_value(mfg_game.new_initial_state())) / 2)

  print('fp')
  fp = fictitious_play.FictitiousPlay(mfg_game)
  for j in range(100):
    print('Iteration', j, 'of fictitious play')
    fp.iteration()
    fp_policy = fp.get_policy()
    nash_conv_fp = nash_conv.NashConv(mfg_game, fp_policy)
    print('Nashconv of the current FP policy', nash_conv_fp.nash_conv())

  print('md')
  md = mirror_descent.MirrorDescent(mfg_game,
                                    value.TabularValueFunction(mfg_game))
  for j in range(10):
    print('Iteration', j, 'of mirror descent')
    md.iteration()
    md_policy = md.get_policy()
    nash_conv_md = nash_conv.NashConv(mfg_game, md_policy)
    print('Nashconv of the current MD policy', nash_conv_md.nash_conv())
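# Standard absl entry point, assuming this file is run as a script (the FLAGS
# and app.UsageError usage above imply the absl.app / absl.flags setup).
if __name__ == '__main__':
  app.run(main)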