def aggregate_policies(game, total_policies, probabilities_of_playing_policies):
  """Aggregates the players' policies.

  Specifically, returns a single callable policy object that is
  realization-equivalent to playing total_policies with
  probabilities_of_playing_policies. I.e., aggr_policy is a joint policy that
  can be called at any information state [via
  action_probabilities(state, player_id)].

  Args:
    game: The open_spiel game.
    total_policies: A list of lists of all policy.Policy strategies used for
      training, where the n-th entry of the main list is a list of policies
      available to the n-th player.
    probabilities_of_playing_policies: A list of arrays representing, per
      player, the probabilities of playing each policy in total_policies for
      the same player.

  Returns:
    A callable object representing the policy.
  """
  aggregator = policy_aggregator.PolicyAggregator(game)
  return aggregator.aggregate(
      range(len(probabilities_of_playing_policies)), total_policies,
      probabilities_of_playing_policies)
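
# A minimal usage sketch (added for illustration, not part of the original
# file). It assumes a two-player game such as "kuhn_poker" and mixes two
# tabular policies per player with equal probability; the helper name
# `_example_aggregate_usage` is hypothetical.
def _example_aggregate_usage():
  env = rl_environment.Environment("kuhn_poker")
  per_player_policies = [
      [policy.TabularPolicy(env.game) for _ in range(2)] for _ in range(2)
  ]
  mixtures = [np.ones(2) / 2 for _ in range(2)]
  joint_policy = aggregate_policies(env.game, per_player_policies, mixtures)
  # The aggregated joint policy can be scored directly, e.g. with NashConv.
  return exploitability.nash_conv(env.game, joint_policy)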
def main(unused_argv):
  env = rl_environment.Environment(FLAGS.game_name)

  policies = [[  # pylint: disable=g-complex-comprehension
      policy.TabularPolicy(env.game).copy_with_noise(alpha=float(i), beta=1.0)
      for i in range(2)
  ] for _ in range(2)]

  probabilities = [
      list(np.ones(len(policies[i])) / len(policies[i])) for i in range(2)
  ]

  pol_ag = policy_aggregator.PolicyAggregator(env.game)
  aggr_policies = pol_ag.aggregate([0, 1], policies, probabilities)

  exploitabilities = exploitability.nash_conv(env.game, aggr_policies)
  print("Exploitability : {}".format(exploitabilities))

  print(policies[0][0].action_probability_array)
  print(policies[0][1].action_probability_array)
  print(aggr_policies.policy)

  print("\nCopy Example")

  mother_policy = policy.TabularPolicy(env.game).copy_with_noise(1, 10)
  policies = [[mother_policy.__copy__() for _ in range(2)] for _ in range(2)]
  probabilities = [
      list(np.ones(len(policies)) / len(policies)) for _ in range(2)
  ]

  pol_ag = policy_aggregator.PolicyAggregator(env.game)
  aggr_policy = pol_ag.aggregate([0], policies, probabilities)

  for state, value in aggr_policy.policy[0].items():
    polici = mother_policy.policy_for_key(state)
    value_normal = {
        action: probability
        for action, probability in enumerate(polici)
        if probability > 0
    }
    for key in value.keys():
      print(
          "State : {}. Key : {}. Aggregated : {}. Real : {}. Passed : {}"
          .format(state, key, value[key], value_normal[key],
                  np.abs(value[key] - value_normal[key]) < 1e-8))
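
# Presumed script entry point. This is an assumption: the original excerpt
# stops at main(), and OpenSpiel example scripts normally finish this way,
# with `app` being absl.app imported at the top of the file.
if __name__ == "__main__":
  app.run(main)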
def gpsro_looper(env, oracle, agents):
  """Initializes and executes the GPSRO training loop."""
  sample_from_marginals = True  # TODO(somidshafiei) set False for alpharank
  training_strategy_selector = (
      FLAGS.training_strategy_selector or
      strategy_selectors.probabilistic_strategy_selector)

  if FLAGS.meta_strategy_method == "alpharank":
    # TODO(somidshafiei): Implement epsilon-sweep for OpenSpiel alpharank.
    print("\n")
    print(
        "==================================================================\n"
        "============================ Warning =============================\n"
        "==================================================================\n"
    )
    print(
        "Selected alpharank. Warning: the current alpharank version is"
        " unstable. It can raise errors because of infinite / NaN elements"
        " in arrays. A fix should be uploaded in upcoming OpenSpiel"
        " iterations.")
    print("\n")

  g_psro_solver = psro_v2.PSROSolver(
      env.game,
      oracle,
      initial_policies=agents,
      training_strategy_selector=training_strategy_selector,
      rectifier=FLAGS.rectifier,
      sims_per_entry=FLAGS.sims_per_entry,
      number_policies_selected=FLAGS.number_policies_selected,
      meta_strategy_method=FLAGS.meta_strategy_method,
      prd_iterations=50000,
      prd_gamma=1e-10,
      sample_from_marginals=sample_from_marginals,
      symmetric_game=FLAGS.symmetric_game)

  start_time = time.time()
  for gpsro_iteration in range(FLAGS.gpsro_iterations):
    if FLAGS.verbose:
      print("Iteration : {}".format(gpsro_iteration))
      print("Time so far: {}".format(time.time() - start_time))
    g_psro_solver.iteration()
    meta_game = g_psro_solver.get_meta_game()
    meta_probabilities = g_psro_solver.get_meta_strategies()
    policies = g_psro_solver.get_policies()

    if FLAGS.verbose:
      print("Meta game : {}".format(meta_game))
      print("Probabilities : {}".format(meta_probabilities))

    aggregator = policy_aggregator.PolicyAggregator(env.game)
    aggr_policies = aggregator.aggregate(
        range(FLAGS.n_players), policies, meta_probabilities)

    exploitabilities, expl_per_player = exploitability.nash_conv(
        env.game, aggr_policies, return_only_nash_conv=False)

    _ = print_policy_analysis(policies, env.game, FLAGS.verbose)
    if FLAGS.verbose:
      print("Exploitabilities : {}".format(exploitabilities))
      print("Exploitabilities per player : {}".format(expl_per_player))
def gpsro_looper(env, oracle, agents):
  """Initializes and executes the GPSRO training loop."""
  sample_from_marginals = True  # TODO(somidshafiei) set False for alpharank
  training_strategy_selector = (
      FLAGS.training_strategy_selector or
      strategy_selectors.probabilistic_strategy_selector)

  g_psro_solver = psro_v2.PSROSolver(
      env.game,
      oracle,
      initial_policies=agents,
      training_strategy_selector=training_strategy_selector,
      rectifier=FLAGS.rectifier,
      sims_per_entry=FLAGS.sims_per_entry,
      number_policies_selected=FLAGS.number_policies_selected,
      meta_strategy_method=FLAGS.meta_strategy_method,
      prd_iterations=50000,
      prd_gamma=1e-10,
      sample_from_marginals=sample_from_marginals,
      symmetric_game=FLAGS.symmetric_game)

  start_time = time.time()
  for gpsro_iteration in range(FLAGS.gpsro_iterations):
    if FLAGS.verbose:
      print("Iteration : {}".format(gpsro_iteration))
      print("Time so far: {}".format(time.time() - start_time))
    g_psro_solver.iteration()
    meta_game = g_psro_solver.get_meta_game()
    meta_probabilities = g_psro_solver.get_meta_strategies()
    policies = g_psro_solver.get_policies()

    if FLAGS.verbose:
      print("Meta game : {}".format(meta_game))
      print("Probabilities : {}".format(meta_probabilities))

    # The following lines only work for sequential games for the moment.
    if env.game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL:
      aggregator = policy_aggregator.PolicyAggregator(env.game)
      aggr_policies = aggregator.aggregate(
          range(FLAGS.n_players), policies, meta_probabilities)

      exploitabilities, expl_per_player = exploitability.nash_conv(
          env.game, aggr_policies, return_only_nash_conv=False)

      _ = print_policy_analysis(policies, env.game, FLAGS.verbose)
      if FLAGS.verbose:
        print("Exploitabilities : {}".format(exploitabilities))
        print("Exploitabilities per player : {}".format(expl_per_player))
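
# `print_policy_analysis` is called in the loops above but not defined in this
# excerpt. A minimal sketch of what it is expected to do: return one set of
# policy "fingerprints" per player so callers can count unique policies. The
# fingerprinting scheme used here (argmax action at every decision state) is
# an assumption, not the original implementation.
from open_spiel.python.algorithms import get_all_states


def print_policy_analysis(policies, game, verbose=False):
  """Returns, per player, a set of fingerprints of distinct policies."""
  states = get_all_states.get_all_states(
      game, depth_limit=-1, include_terminals=False,
      include_chance_states=False)
  unique_policies = []
  for player, player_policies in enumerate(policies):
    fingerprints = set()
    for pol in player_policies:
      parts = []
      for state_str, state in states.items():
        if state.current_player() == player:
          action_probs = pol.action_probabilities(state, player)
          # Fingerprint each state by the highest-probability action.
          parts.append(state_str + ":" + str(max(action_probs,
                                                 key=action_probs.get)))
      fingerprints.add("|".join(parts))
    unique_policies.append(fingerprints)
    if verbose:
      print("Player {} : {} unique policies.".format(player,
                                                     len(fingerprints)))
  return unique_policies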
def test_policy_aggregation_random(self, game_name):
  env = rl_environment.Environment(game_name)

  policies = [[policy.UniformRandomPolicy(env.game) for _ in range(2)]
              for _ in range(2)]
  probabilities = [
      list(np.ones(len(policies)) / len(policies)) for _ in range(2)
  ]

  pol_ag = policy_aggregator.PolicyAggregator(env.game)
  aggr_policy = pol_ag.aggregate([0], policies, probabilities)

  for item in aggr_policy.policy[0].items():
    _, probs = zip(*item[1].items())
    const_probs = tuple([probs[0]] * len(probs))
    self.assertEqual(probs, const_probs)
def test_policy_aggregation_tabular_randinit(self, game_name):
  env = rl_environment.Environment(game_name)

  mother_policy = policy.TabularPolicy(env.game).copy_with_noise(
      1, 10, np.random.RandomState(0))
  policies = [[mother_policy.__copy__() for _ in range(2)] for _ in range(2)]
  probabilities = [
      list(np.ones(len(policies)) / len(policies)) for _ in range(2)
  ]

  pol_ag = policy_aggregator.PolicyAggregator(env.game)
  aggr_policy = pol_ag.aggregate([0], policies, probabilities)

  for state, value in aggr_policy.policy[0].items():
    polici = mother_policy.policy_for_key(state)
    value_normal = {
        action: probability
        for action, probability in enumerate(polici)
        if probability > 0
    }
    for key in value_normal.keys():
      self.assertAlmostEqual(value[key], value_normal[key], 8)
def gpsro_looper(env,
                 oracle,
                 agents,
                 writer,
                 quiesce=False,
                 checkpoint_dir=None,
                 seed=None):
  """Initializes and executes the GPSRO training loop."""
  sample_from_marginals = True  # TODO(somidshafiei) set False for alpharank
  training_strategy_selector = (
      FLAGS.training_strategy_selector or
      strategy_selectors.probabilistic_strategy_selector)

  if not quiesce:
    solver = psro_v2.PSROSolver
  elif FLAGS.sparse_quiesce:
    solver = quiesce_sparse.PSROQuiesceSolver
  else:
    solver = PSROQuiesceSolver

  g_psro_solver = solver(
      env.game,
      oracle,
      initial_policies=agents,
      training_strategy_selector=training_strategy_selector,
      rectifier=FLAGS.rectifier,
      sims_per_entry=FLAGS.sims_per_entry,
      number_policies_selected=FLAGS.number_policies_selected,
      meta_strategy_method=FLAGS.meta_strategy_method,
      prd_iterations=50000,
      prd_gamma=1e-10,
      sample_from_marginals=sample_from_marginals,
      symmetric_game=FLAGS.symmetric_game,
      checkpoint_dir=checkpoint_dir,
      filtering_method=FLAGS.filtering_method,
      strategy_set_size=FLAGS.strategy_set_size)

  last_meta_prob = [np.array([1]) for _ in range(FLAGS.n_players)]
  last_meta_game = g_psro_solver.get_meta_game()
  # atexit.register(save_at_termination, solver=g_psro_solver,
  #                 file_for_meta_game=checkpoint_dir + '/meta_game.pkl')

  start_time = time.time()
  for gpsro_iteration in range(1, FLAGS.gpsro_iterations + 1):
    if FLAGS.verbose:
      print("\n===========================\n")
      print("Iteration : {}".format(gpsro_iteration))
      print("Time so far: {}".format(time.time() - start_time))

    train_reward_curve = g_psro_solver.iteration(seed=seed)
    meta_game = g_psro_solver.get_meta_game()
    meta_probabilities = g_psro_solver.get_meta_strategies()
    nash_meta_probabilities = g_psro_solver.get_nash_strategies()
    policies = g_psro_solver.get_policies()

    if FLAGS.verbose:
      # print("Meta game : {}".format(meta_game))
      print("Probabilities : {}".format(meta_probabilities))
      print("Nash Probabilities : {}".format(nash_meta_probabilities))

    aggregator = policy_aggregator.PolicyAggregator(env.game)
    ## Using NE-based NashConv
    aggr_policies_Mike = aggregator.aggregate(
        range(FLAGS.n_players), policies, nash_meta_probabilities)
    ## Using heuristic-based NashConv
    aggr_policies = aggregator.aggregate(
        range(FLAGS.n_players), policies, meta_probabilities)

    exploitabilities, expl_per_player = exploitability.nash_conv(
        env.game, aggr_policies, return_only_nash_conv=False)
    nash_Mike, expl_per_player = exploitability.nash_conv(
        env.game, aggr_policies_Mike, return_only_nash_conv=False)

    unique_policies = print_policy_analysis(policies, env.game, FLAGS.verbose)
    for p, cur_set in enumerate(unique_policies):
      writer.add_scalar('p' + str(p) + '_unique_p', len(cur_set),
                        gpsro_iteration)

    if gpsro_iteration % 10 == 0:
      save_at_termination(
          solver=g_psro_solver,
          file_for_meta_game=checkpoint_dir + '/meta_game.pkl')
      # save_strategies(solver=g_psro_solver, checkpoint_dir=checkpoint_dir)

    beneficial_deviation = print_beneficial_deviation_analysis(
        last_meta_game, meta_game, last_meta_prob, FLAGS.verbose)
    last_meta_prob, last_meta_game = meta_probabilities, meta_game
    for p in range(len(beneficial_deviation)):
      writer.add_scalar('p' + str(p) + '_beneficial_dev',
                        int(beneficial_deviation[p]), gpsro_iteration)
    writer.add_scalar('beneficial_devs', sum(beneficial_deviation),
                      gpsro_iteration)

    # if FLAGS.log_train and (gpsro_iteration <= 10 or gpsro_iteration % 5 == 0):
    #   for p in range(len(train_reward_curve)):
    #     for p_i in range(len(train_reward_curve[p])):
    #       writer.add_scalar('player' + str(p) + '_' + str(gpsro_iteration),
    #                         train_reward_curve[p][p_i], p_i)

    for p in range(len(expl_per_player)):
      writer.add_scalar('player' + str(p) + '_exp', expl_per_player[p],
                        gpsro_iteration)
    writer.add_scalar('exp', exploitabilities, gpsro_iteration)
    writer.add_scalar('exp_Mike', nash_Mike, gpsro_iteration)

    if FLAGS.verbose:
      print("Exploitabilities : {}".format(exploitabilities))
      print("Exploitabilities per player : {}".format(expl_per_player))
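
# `save_at_termination` is referenced above (and in the commented-out atexit
# registration) but not defined in this excerpt. A plausible minimal version,
# under the assumption that it only pickles the solver's current empirical
# meta-game to the given path.
import pickle


def save_at_termination(solver, file_for_meta_game):
  with open(file_for_meta_game, 'wb') as f:
    pickle.dump(solver.get_meta_game(), f)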
def gpsro_looper(env,
                 oracle,
                 oracle_list,
                 agents,
                 writer,
                 quiesce=False,
                 checkpoint_dir=None,
                 seed=None,
                 heuristic_list=None):
  """Initializes and executes the GPSRO training loop."""
  sample_from_marginals = True  # TODO(somidshafiei) set False for alpharank
  training_strategy_selector = (
      FLAGS.training_strategy_selector or
      strategy_selectors.probabilistic_strategy_selector)

  if not quiesce:
    solver = psro_v2.PSROSolver
  elif FLAGS.sparse_quiesce:
    solver = quiesce_sparse.PSROQuiesceSolver
  else:
    solver = PSROQuiesceSolver

  g_psro_solver = solver(
      env.game,
      oracle,
      initial_policies=agents,
      training_strategy_selector=training_strategy_selector,
      rectifier=FLAGS.rectifier,
      sims_per_entry=FLAGS.sims_per_entry,
      number_policies_selected=FLAGS.number_policies_selected,
      meta_strategy_method=FLAGS.meta_strategy_method,
      fast_oracle_period=FLAGS.fast_oracle_period,
      slow_oracle_period=FLAGS.slow_oracle_period,
      prd_iterations=50000,
      prd_gamma=1e-10,
      sample_from_marginals=sample_from_marginals,
      symmetric_game=FLAGS.symmetric_game,
      oracle_list=oracle_list,
      checkpoint_dir=checkpoint_dir,
      exp3=FLAGS.exp3,
      standard_regret=FLAGS.standard_regret,
      heuristic_list=heuristic_list,
      gamma=FLAGS.exploration_gamma,
      switch_heuristic_regardless_of_oracle=(
          FLAGS.switch_heuristic_regardless_of_oracle),
      abs_value=FLAGS.abs_reward,
      kl_reg=FLAGS.kl_regularization)

  last_meta_prob = [np.array([1]) for _ in range(FLAGS.n_players)]
  last_meta_game = g_psro_solver.get_meta_game()
  start_time = time.time()
  heuristic_print = []

  for gpsro_iteration in range(1, FLAGS.gpsro_iterations + 1):
    if FLAGS.verbose:
      print("\n===========================\n")
      print("Iteration : {}".format(gpsro_iteration))
      print("Time so far: {}".format(time.time() - start_time))

    # train_reward_curve = g_psro_solver.iteration(seed=seed)
    # Iteration function for strategy exploration.
    if FLAGS.switch_blocks:
      train_reward_curve = g_psro_solver.se_iteration_for_blocks(seed=seed)
      for i, heuristic in enumerate(heuristic_list):
        writer.add_scalar(heuristic,
                          g_psro_solver._heuristic_selector.weights[i],
                          gpsro_iteration)
      print("Current selector weights:",
            g_psro_solver._heuristic_selector.weights)
    else:
      train_reward_curve = g_psro_solver.se_iteration(seed=seed)

    meta_game = g_psro_solver.get_meta_game()
    meta_probabilities = g_psro_solver.get_meta_strategies()
    nash_meta_probabilities = g_psro_solver.get_nash_strategies()
    policies = g_psro_solver.get_policies()

    if FLAGS.verbose:
      # print("Meta game : {}".format(meta_game))
      print("{} Probabilities : {}".format(
          g_psro_solver._meta_strategy_method_name, meta_probabilities))
      print("Nash Probabilities : {}".format(nash_meta_probabilities))
      heuristic_print.append(
          (gpsro_iteration + 1, g_psro_solver._meta_strategy_method_name))
      print("Heuristics run:", heuristic_print)

    if gpsro_iteration >= 2:
      for player in range(len(nash_meta_probabilities)):
        kl_conv = 0
        p = np.append(g_psro_solver._NE_list[-2][player], 0)
        q = g_psro_solver._NE_list[-1][player]
        kl = smoothing_kl(p, q)
        kl_conv += kl
        writer.add_scalar("player_" + str(player), kl, gpsro_iteration)
        writer.add_scalar("kl_conv", kl_conv, gpsro_iteration)

    # The following lines only work for sequential games for the moment.
    ######### calculate exploitability then log it
    if env.game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL:
      aggregator = policy_aggregator.PolicyAggregator(env.game)
      aggr_policies = aggregator.aggregate(
          range(FLAGS.n_players), policies, nash_meta_probabilities)
      exploitabilities, expl_per_player = exploitability.nash_conv(
          env.game, aggr_policies, return_only_nash_conv=False)
      for p in range(len(expl_per_player)):
        writer.add_scalar('player' + str(p) + '_exp', expl_per_player[p],
                          gpsro_iteration)
      writer.add_scalar('exp', exploitabilities, gpsro_iteration)
      if FLAGS.verbose:
        print("Exploitabilities : {}".format(exploitabilities))
        print("Exploitabilities per player : {}".format(expl_per_player))

    ######### analyze unique policy
    unique_policies = print_policy_analysis(policies, env.game, FLAGS.verbose)
    for p, cur_set in enumerate(unique_policies):
      writer.add_scalar('p' + str(p) + '_unique_p', len(cur_set),
                        gpsro_iteration)

    ######### record meta_game into pkl
    if gpsro_iteration % 5 == 0:
      save_at_termination(
          solver=g_psro_solver,
          file_for_meta_game=checkpoint_dir + '/meta_game.pkl')
      # save_strategies(solver=g_psro_solver, checkpoint_dir=checkpoint_dir)

    ######### analyze if this iteration found beneficial deviation
    beneficial_deviation = print_beneficial_deviation_analysis(
        last_meta_game, meta_game, last_meta_prob, FLAGS.verbose)
    last_meta_prob, last_meta_game = nash_meta_probabilities, meta_game
    # for p in range(len(beneficial_deviation)):
    #   writer.add_scalar('p' + str(p) + '_beneficial_dev',
    #                     int(beneficial_deviation[p]), gpsro_iteration)
    writer.add_scalar('beneficial_devs', sum(beneficial_deviation),
                      gpsro_iteration)

    ######### analyze if the fast oracle has found beneficial deviation
    ######### from slow oracle
    if FLAGS.switch_fast_slow:
      period = FLAGS.fast_oracle_period + FLAGS.slow_oracle_period
      if gpsro_iteration % period == 0:
        beneficial_deviation = print_beneficial_deviation_analysis(
            last_slow_meta_game, meta_game, last_slow_meta_prob, verbose=False)
        writer.add_scalar('fast_bef_dev_from_slow', sum(beneficial_deviation),
                          gpsro_iteration)
        print('fast oracle dev from slow', beneficial_deviation)
      elif gpsro_iteration % period <= FLAGS.slow_oracle_period:
        last_slow_meta_prob, last_slow_meta_game = (nash_meta_probabilities,
                                                    meta_game)
        print('slow oracle DQN running')
      else:
        print('fast oracle ARS running')
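
# `smoothing_kl` is used in the KL-convergence logging above but not defined
# in this excerpt. A minimal sketch, under the assumption that it computes
# KL(p || q) after mixing both distributions with a small uniform smoothing
# term so zero entries do not produce infinities; `eps` is an assumed default.
def smoothing_kl(p, q, eps=0.001):
  p = (1 - eps) * np.asarray(p, dtype=float) + eps / len(p)
  q = (1 - eps) * np.asarray(q, dtype=float) + eps / len(q)
  return float(np.sum(p * np.log(p / q)))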
def gpsro_looper(env,
                 oracle,
                 agents,
                 writer,
                 quiesce=False,
                 checkpoint_dir=None,
                 seed=None,
                 dqn_iters=None):
  """Initializes and executes the GPSRO training loop."""
  sample_from_marginals = True  # TODO(somidshafiei) set False for alpharank
  training_strategy_selector = (
      FLAGS.training_strategy_selector or
      strategy_selectors.probabilistic_strategy_selector)

  if not quiesce:
    solver = psro_v2.PSROSolver
  elif FLAGS.sparse_quiesce:
    solver = quiesce_sparse.PSROQuiesceSolver
  else:
    solver = PSROQuiesceSolver

  g_psro_solver = solver(
      env.game,
      oracle,
      initial_policies=agents,
      training_strategy_selector=training_strategy_selector,
      rectifier=FLAGS.rectifier,
      sims_per_entry=FLAGS.sims_per_entry,
      number_policies_selected=FLAGS.number_policies_selected,
      meta_strategy_method=FLAGS.meta_strategy_method,
      prd_iterations=50000,
      prd_gamma=1e-10,
      sample_from_marginals=sample_from_marginals,
      symmetric_game=FLAGS.symmetric_game,
      checkpoint_dir=checkpoint_dir)

  last_meta_prob = [np.array([1]) for _ in range(FLAGS.n_players)]
  last_meta_game = g_psro_solver.get_meta_game()
  # atexit.register(save_at_termination, solver=g_psro_solver,
  #                 file_for_meta_game=checkpoint_dir + '/meta_game.pkl')
  start_time = time.time()

  g_psro_solver.stopping_time = dqn_iters
  # param_dict = {'num_directions': [20, 40, 60, 80],
  #               'num_best_directions': [15, 20, 40, 80],
  #               'ars_learning_rate': [0.01, 0.015, 0.03, 0.07],
  #               'noise': [0.01, 0.03, 0.07, 0.1, 0.3, 0.5]}
  param_dict = {
      'num_directions': [FLAGS.num_directions],
      'num_best_directions': [15, 20, 40, 80],
      'ars_learning_rate': [0.01, 0.015, 0.03, 0.07],
      'noise': [0.01, 0.025, 0.07, 0.1, 0.3, 0.5]
  }
  params_list = iter(grid_search(param_dict, search_ars_bd=True))

  for gpsro_iteration in range(1, FLAGS.gpsro_iterations + 1):
    if FLAGS.verbose:
      print("\n===========================\n")
      print("Iteration : {}".format(gpsro_iteration))
      print("Time so far: {}".format(time.time() - start_time))

    if (gpsro_iteration - dqn_iters) % 2 == 1 and gpsro_iteration > dqn_iters:
      next_ars_param = next(params_list)
      if next_ars_param:
        print('\n*****switching ARS parameter******')
        ars_oracle, _ = init_ars_responder(None, env, next_ars_param)
        g_psro_solver._oracle = ars_oracle
      else:
        break

    train_reward_curve = g_psro_solver.iteration(seed=seed)
    meta_game = g_psro_solver.get_meta_game()
    meta_probabilities = g_psro_solver.get_meta_strategies()
    nash_meta_probabilities = g_psro_solver.get_nash_strategies()

    if gpsro_iteration == dqn_iters:
      still_nash_meta_game = meta_game
      still_nash_meta_prob = meta_probabilities
      still_nash_pol_ind = np.arange(meta_game[0].shape[0])

    policies = g_psro_solver.get_policies()
    if FLAGS.verbose:
      print("Meta game : {}".format(meta_game))
      print("Probabilities : {}".format(meta_probabilities))
      print("Nash Probabilities : {}".format(nash_meta_probabilities))

    aggregator = policy_aggregator.PolicyAggregator(env.game)
    aggr_policies = aggregator.aggregate(
        range(FLAGS.n_players), policies, nash_meta_probabilities)
    exploitabilities, expl_per_player = exploitability.nash_conv(
        env.game, aggr_policies, return_only_nash_conv=False)

    unique_policies = print_policy_analysis(policies, env.game, FLAGS.verbose)
    for p, cur_set in enumerate(unique_policies):
      writer.add_scalar('p' + str(p) + '_unique_p', len(cur_set),
                        gpsro_iteration)

    if gpsro_iteration % 10 == 0:
      save_at_termination(
          solver=g_psro_solver,
          file_for_meta_game=checkpoint_dir + '/meta_game.pkl')

    # Record ARS logging.
    if (gpsro_iteration - dqn_iters) % 2 == 0 and gpsro_iteration > dqn_iters:
      print("\n!!!!!writing txt!!!!\n")
      print(next_ars_param)
      num_pol = len(policies[0])
      selector = [
          np.append(still_nash_pol_ind, [num_pol - 2, num_pol - 1])
          for _ in range(len(policies))
      ]
      meta_game = [ele[np.ix_(*selector)] for ele in meta_game]
      beneficial_deviation, total_dev = print_beneficial_deviation_analysis(
          still_nash_meta_game, meta_game, still_nash_meta_prob)
      writer.add_text(
          "ars_param",
          str(next_ars_param),
          global_step=(gpsro_iteration - dqn_iters) // 2)
      writer.add_scalar("beneficial_devs", sum(beneficial_deviation),
                        (gpsro_iteration - dqn_iters) // 2)
      writer.add_scalar("total_devs", sum(total_dev),
                        (gpsro_iteration - dqn_iters) // 2)

    for p in range(len(expl_per_player)):
      writer.add_scalar('player' + str(p) + '_exp', expl_per_player[p],
                        gpsro_iteration)
    writer.add_scalar('exp', exploitabilities, gpsro_iteration)

    if FLAGS.verbose:
      print("Exploitabilities : {}".format(exploitabilities))
      print("Exploitabilities per player : {}".format(expl_per_player))
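
# `grid_search` (and `init_ars_responder`) are used above but not defined in
# this excerpt. A minimal sketch of `grid_search`, assuming it enumerates the
# Cartesian product of the candidate hyperparameter values as a list of dicts;
# the `search_ars_bd` flag is assumed to drop combinations where
# num_best_directions exceeds num_directions.
import itertools


def grid_search(param_dict, search_ars_bd=False):
  keys = list(param_dict.keys())
  combos = []
  for values in itertools.product(*(param_dict[k] for k in keys)):
    params = dict(zip(keys, values))
    if search_ars_bd and params.get('num_best_directions', 0) > params.get(
        'num_directions', float('inf')):
      # ARS cannot keep more "best" directions than it samples.
      continue
    combos.append(params)
  return combos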