def on_train_result(self, *, trainer, result: dict, **kwargs):
    result["scenario_name"] = trainer.scenario_name
    training_iteration = result["training_iteration"]
    super().on_train_result(trainer=trainer, result=result, **kwargs)

    if training_iteration % checkpoint_every_n_iters == 0 or training_iteration == 1:
        for player in range(2):
            checkpoint_metadata = create_metadata_with_new_checkpoint(
                policy_id_to_save=f"best_response_{player}",
                br_trainer=trainer,
                policy_player=player,
                save_dir=checkpoint_dir(trainer=trainer),
                timesteps_training=result["timesteps_total"],
                episodes_training=result["episodes_total"],
                checkpoint_name=f"best_response_player_{player}_iter_{training_iteration}.h5")
            joint_pol_checkpoint_spec = StrategySpec(
                strategy_id=f"best_response_player_{player}_iter_{training_iteration}",
                metadata=checkpoint_metadata)
            checkpoint_path = os.path.join(
                spec_checkpoint_dir(trainer),
                f"best_response_player_{player}_iter_{training_iteration}.json")
            ensure_dir(checkpoint_path)
            with open(checkpoint_path, "w+") as checkpoint_spec_file:
                checkpoint_spec_file.write(joint_pol_checkpoint_spec.to_json())
def claim_new_active_policy_for_player(
        self, player
) -> Union[Tuple[Dict[int, StrategySpec], Dict[int, List[StrategySpec]], int],
           Tuple[None, None, None]]:
    request = NXDOPlayer(player=player)
    response: NXDONewBestResponseParams = self._stub.ClaimNewActivePolicyForPlayer(request)

    if response.policy_num == -1:
        return None, None, None

    assert len(response.metanash_specs_for_players.policy_spec_list) in [self.n_players(), 0]
    assert len(response.delegate_specs_for_players) in [self.n_players(), 0]

    metanash_json_specs_for_other_players = [
        elem.policy_spec_json
        for elem in response.metanash_specs_for_players.policy_spec_list
    ]
    metanash_specs_for_players = {
        player: StrategySpec.from_json(json_spec)
        for player, json_spec in enumerate(metanash_json_specs_for_other_players)
    }

    delegate_json_spec_lists_for_other_players = [
        [elem.policy_spec_json for elem in player_delegate_list.policy_spec_list]
        for player_delegate_list in response.delegate_specs_for_players
    ]
    delegate_specs_for_players = {
        player: [StrategySpec.from_json(json_spec) for json_spec in player_delegate_json_list]
        for player, player_delegate_json_list in enumerate(delegate_json_spec_lists_for_other_players)
    }

    if len(metanash_specs_for_players) == 0:
        metanash_specs_for_players = None
    if len(delegate_specs_for_players) == 0:
        delegate_specs_for_players = None

    return metanash_specs_for_players, delegate_specs_for_players, response.policy_num
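# A minimal sketch (not from the source) of how a best-response worker might use the
# client method above. The client instance name `nxdo_manager_client` and the helper
# `train_best_response(...)` are hypothetical placeholders; only
# claim_new_active_policy_for_player(...) and its (metanash, delegates, policy_num)
# return contract come from the method shown above.
def example_claim_and_train_br(nxdo_manager_client, player: int):
    metanash_specs, delegate_specs, policy_num = \
        nxdo_manager_client.claim_new_active_policy_for_player(player=player)
    if policy_num is None:
        # The manager returned -1 over the wire, i.e. nothing to train right now.
        return
    # Hypothetical training call for the claimed restricted-game policy slot.
    train_best_response(player=player,
                        policy_num=policy_num,
                        metanash_specs_for_players=metanash_specs,
                        delegate_specs_for_players=delegate_specs)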
def RequestExternalEval(self, request: EvalRequest, context):
    policy_specs_for_each_player = tuple(
        StrategySpec.from_json(json_string=json_string)
        for json_string in request.json_policy_specs_for_each_player)
    self._manager.request_external_eval(
        policy_specs_for_each_player=policy_specs_for_each_player)
    return Confirmation(result=True)
def take_eval_job(self) -> Tuple[Union[None, Tuple[StrategySpec]], int]:
    response: EvalJob = self._stub.TakeEvalJob(Empty())
    policy_specs_for_each_player = tuple(
        StrategySpec.from_json(json_string=json_string)
        for json_string in response.json_policy_specs_for_each_player)
    if len(policy_specs_for_each_player) == 0:
        return None, response.required_games_to_play
    return policy_specs_for_each_player, response.required_games_to_play
def SubmitEvalJobResult(self, request: EvalJobResult, context):
    policy_specs_for_each_player = tuple(
        StrategySpec.from_json(json_string=json_string)
        for json_string in request.json_policy_specs_for_each_player)
    self._eval_dispatcher.submit_eval_job_result(
        policy_specs_for_each_player_tuple=policy_specs_for_each_player,
        payoffs_for_each_player=request.payoffs_for_each_player,
        games_played=request.games_played)
    return EvalConfirmation(result=True)
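# A minimal sketch (not from the source) of an evaluation worker built on the two RPCs
# above: take_eval_job() on the client side and SubmitEvalJobResult on the servicer side.
# The client instance `eval_client`, its assumed submit_eval_job_result(...) counterpart
# to the servicer method, and play_games(...) are hypothetical placeholders.
def example_eval_worker_pass(eval_client):
    """Process queued eval jobs until none remain, then return."""
    while True:
        policy_specs, required_games_to_play = eval_client.take_eval_job()
        if policy_specs is None:
            return
        # Hypothetical rollout helper returning average payoffs for each player.
        payoffs_for_each_player = play_games(policy_specs, n_games=required_games_to_play)
        eval_client.submit_eval_job_result(
            policy_specs_for_each_player_tuple=policy_specs,
            payoffs_for_each_player=payoffs_for_each_player,
            games_played=required_games_to_play)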
def on_train_result(self, *, trainer, result: dict, **kwargs):
    super().on_train_result(trainer=trainer, result=result, **kwargs)
    result["scenario_name"] = trainer.scenario_name
    result["avg_br_reward_both_players"] = ray.get(trainer.avg_br_reward_deque.get_mean.remote())
    training_iteration = result["training_iteration"]

    if (calculate_openspiel_metanash and
            (training_iteration == 1 or training_iteration % calc_metanash_every_n_iters == 0)):
        base_env = _create_env()
        open_spiel_env_config = base_env.open_spiel_env_config
        openspiel_game_version = base_env.game_version
        local_avg_policy_0 = trainer.workers.local_worker().policy_map["average_policy_0"]
        local_avg_policy_1 = trainer.workers.local_worker().policy_map["average_policy_1"]
        exploitability = nfsp_measure_exploitability_nonlstm(
            rllib_policies=[local_avg_policy_0, local_avg_policy_1],
            poker_game_version=openspiel_game_version,
            open_spiel_env_config=open_spiel_env_config)
        result["avg_policy_exploitability"] = exploitability
        logger.info(colored(
            f"(Graph this in a notebook) Exploitability: {exploitability} - Saving exploitability stats "
            f"to {os.path.join(trainer.logdir, 'result.json')}", "green"))

    if checkpoint_every_n_iters and (training_iteration % checkpoint_every_n_iters == 0 or
                                     training_iteration == 1):
        for player in range(2):
            checkpoint_metadata = create_metadata_with_new_checkpoint(
                policy_id_to_save=f"average_policy_{player}",
                br_trainer=br_trainer,
                save_dir=checkpoint_dir(trainer=br_trainer),
                timesteps_training=result["timesteps_total"],
                episodes_training=result["episodes_total"],
                checkpoint_name=f"average_policy_player_{player}_iter_{training_iteration}.h5")
            avg_pol_checkpoint_spec = StrategySpec(
                strategy_id=f"avg_pol_player_{player}_iter_{training_iteration}",
                metadata=checkpoint_metadata)
            checkpoint_path = os.path.join(
                spec_checkpoint_dir(br_trainer),
                f"average_policy_player_{player}_iter_{training_iteration}.json")
            ensure_dir(checkpoint_path)
            with open(checkpoint_path, "w+") as checkpoint_spec_file:
                checkpoint_spec_file.write(avg_pol_checkpoint_spec.to_json())
def SubmitEmpiricalPayoffResult(self, request: PayoffResult, context):
    policy_specs_for_each_player = tuple(
        StrategySpec.from_json(json_string=json_string)
        for json_string in request.json_policy_specs_for_each_player)
    # noinspection PyTypeChecker
    self._manager.submit_empirical_payoff_result(
        policy_specs_for_each_player=policy_specs_for_each_player,
        payoffs_for_each_player=request.payoffs_for_each_player,
        games_played=request.games_played,
        override_all_previous_results=request.override_all_previous_results)
    return Confirmation(result=True)
def claim_new_active_policy_for_player(self, player, new_policy_metadata_dict) -> StrategySpec:
    try:
        metadata_json = json.dumps(obj=new_policy_metadata_dict)
    except (TypeError, OverflowError) as json_err:
        raise ValueError(
            f"new_policy_metadata_dict must be JSON serializable. "
            f"When attempting to serialize, got this error:\n{json_err}")
    request = NewActivePolicyRequest(player=player, metadata_json=metadata_json)
    response: PolicySpecJson = self._stub.ClaimNewActivePolicyForPlayer(request)
    return StrategySpec.from_json(response.policy_spec_json)
def add_new_pure_strategy(self, player, strategy_id, metadata=None) -> StrategySpec:
    with self._modification_lock:
        existing_spec: StrategySpec = self._strat_ids_to_specs.get(strategy_id)
        if existing_spec is not None:
            existing_spec.update_metadata(new_metadata=metadata)
            spec = existing_spec
        else:
            spec = StrategySpec(strategy_id=strategy_id, metadata=metadata)

        # The new strategy's index for this player is the current size of the payoff
        # matrix along that player's axis (indexes are zero-based).
        new_strategy_index = self._payoff_matrices_per_player[0].shape[player]
        spec.assign_pure_strat_index(player=player, pure_strat_index=new_strategy_index)

        # Grow every payoff/games-played matrix by one zero-filled slot along this player's axis.
        pad_size = 1
        pad_axis = player
        npad = [(0, 0)] * self._payoff_matrices_per_player[0].ndim
        npad[pad_axis] = (0, pad_size)
        for i in range(self._n_players):
            self._payoff_matrices_per_player[i] = np.pad(
                self._payoff_matrices_per_player[i], pad_width=npad,
                mode='constant', constant_values=0)
            self._games_played_matrices_per_player[i] = np.pad(
                self._games_played_matrices_per_player[i], pad_width=npad,
                mode='constant', constant_values=0)

        self._player_and_strat_index_to_strat_id[(player, new_strategy_index)] = spec.id
        self._strat_ids_to_specs[strategy_id] = spec
        return spec
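# A standalone sketch of the matrix-growth step used in add_new_pure_strategy above:
# adding a strategy for player `p` pads every payoff matrix with one new row/column of
# zeros along axis `p`. The shapes below are invented for illustration; only np.pad with
# a per-axis pad width comes from the code above.
import numpy as np

payoff_matrix = np.arange(6, dtype=float).reshape(2, 3)  # 2 strategies for player 0, 3 for player 1
player = 0                                               # add a new strategy for player 0

npad = [(0, 0)] * payoff_matrix.ndim
npad[player] = (0, 1)                                    # append one slot on player 0's axis
grown = np.pad(payoff_matrix, pad_width=npad, mode='constant', constant_values=0)

assert grown.shape == (3, 3)
assert np.all(grown[-1, :] == 0)                         # new strategy starts with zero payoffs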
def set_active_policy_as_fixed(self, player, policy_num, final_metadata_dict) -> StrategySpec:
    try:
        metadata_json = json.dumps(obj=final_metadata_dict, cls=SafeFallbackJSONEncoder)
    except (TypeError, OverflowError) as json_err:
        raise ValueError(
            f"final_metadata_dict must be JSON serializable. "
            f"When attempting to serialize, got this error:\n{json_err}")
    request = PolicyMetadataRequest(player=player, policy_num=policy_num,
                                    metadata_json=metadata_json)
    response: PolicySpecJson = self._stub.SetActivePolicyAsFixed(request)
    return StrategySpec.from_json(response.policy_spec_json)
def submit_new_active_policy_metadata(self, player, policy_num, metadata_dict) -> StrategySpec:
    try:
        metadata_json = json.dumps(obj=metadata_dict)
    except (TypeError, OverflowError) as json_err:
        raise ValueError(
            f"metadata_dict must be JSON serializable. "
            f"When attempting to serialize, got this error:\n{json_err}")
    request = PolicyMetadataRequest(player=player, policy_num=policy_num,
                                    metadata_json=metadata_json)
    response: PolicySpecJson = self._stub.SubmitNewActivePolicyMetadata(request)
    return StrategySpec.from_json(response.policy_spec_json)
@classmethod
def from_json_string(cls, json_string):
    json_dict = json.loads(s=json_string)

    strat_ids_to_specs = {
        strat_id: StrategySpec.from_dict(serialized_dict=spec_dict)
        for strat_id, spec_dict in json_dict["strat_ids_to_specs"].items()
    }
    player_and_strat_index_to_strat_ids = {
        string_to_int_tuple(s=player_and_strat_index): strat_id
        for player_and_strat_index, strat_id in json_dict["player_and_strat_index_to_strat_id"].items()
    }

    return PayoffTable(
        n_players=json_dict["n_players"],
        exponential_average_coeff=json_dict["exponential_average_coeff"],
        restore_strat_ids_to_specs=strat_ids_to_specs,
        restore_player_and_strat_index_to_strat_ids=player_and_strat_index_to_strat_ids,
        restore_payoff_matrices_per_player=json_dict["payoff_matrices_per_player"],
        restore_games_played_matrices_per_player=json_dict["games_played_matrices_per_player"])
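# string_to_int_tuple is referenced above but not defined in this excerpt. Below is a
# minimal sketch of what it presumably does; this implementation is an assumption, not
# the library's code. JSON object keys are always strings, so a (player, strategy_index)
# tuple key such as (0, 3) round-trips through JSON as the string "(0, 3)" and must be
# parsed back into an int tuple when the payoff table is restored.
def string_to_int_tuple(s: str) -> tuple:
    # e.g. "(0, 3)" -> (0, 3)
    return tuple(int(part) for part in s.strip("()").split(",") if part.strip())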
    ignore_reinit_error=True,
    logging_level=logging.INFO,
    log_to_driver=os.getenv("RAY_LOG_TO_DRIVER", False))

def select_policy(agent_id):
    if agent_id == br_player:
        return "best_response"
    else:
        return "average_policy"

avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(tmp_env)["model"]

player_0_avg_pol_spec = StrategySpec.from_json_file(
    "/home/jblanier/git/grl/grl/data/1000_oshi_zumo_nfsp_larger_dqn_larger_sparse_10.56.11PM_Mar-24-20217lav0isx/avg_policy_checkpoint_specs/average_policy_player_0_iter_53000.json")

class HyperParamSearchCallbacks(DefaultCallbacks):

    def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                         policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode,
                         env_index: int, **kwargs):
        super().on_episode_start(worker=worker, base_env=base_env, policies=policies,
                                 episode=episode, env_index=env_index, **kwargs)
        if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
            avg_policy = worker.policy_map["average_policy"]
            load_pure_strat(policy=avg_policy, pure_strat_spec=player_0_avg_pol_spec)
            worker.avg_pol_loaded = True
    ignore_reinit_error=True,
    logging_level=logging.INFO,
    log_to_driver=os.getenv("RAY_LOG_TO_DRIVER", False))

def select_policy(agent_id):
    if agent_id == br_player:
        return "best_response"
    else:
        return "average_policy"

avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(tmp_env)["model"]

player_0_avg_pol_spec = StrategySpec.from_json_file(
    "/home/jblanier/git/grl/grl/data/loss_game_nfsp_10_moves_alpha_2.9_sparse_12.07.15AM_May-18-202120bfveou/avg_policy_checkpoint_specs/average_policy_player_0_iter_30000.json")

class HyperParamSearchCallbacks(DefaultCallbacks):

    def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                         policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode,
                         env_index: int, **kwargs):
        super().on_episode_start(worker=worker, base_env=base_env, policies=policies,
                                 episode=episode, env_index=env_index, **kwargs)
        if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
            avg_policy = worker.policy_map["average_policy"]
            load_pure_strat(policy=avg_policy, pure_strat_spec=player_0_avg_pol_spec)
            worker.avg_pol_loaded = True

    def on_train_result(self, *, trainer, result: dict, **kwargs):
def submit_final_br_policy(self, player, policy_num, metadata_dict):
    with self.modification_lock:
        if player < 0 or player >= self._n_players:
            raise ValueError(f"player {player} is out of range. Must be in [0, n_players).")
        if policy_num != self._current_double_oracle_iteration:
            raise ValueError(
                f"Policy {policy_num} isn't the same as the current double oracle iteration "
                f"{self._current_double_oracle_iteration}.")

        br_policy_spec: StrategySpec = StrategySpec(
            strategy_id=self._strat_id(player=player, policy_num=policy_num),
            metadata=metadata_dict,
            pure_strategy_indexes={player: policy_num})

        self._br_episodes_this_iter += metadata_dict["episodes_training_br"]
        self._br_timesteps_this_iter += metadata_dict["timesteps_training_br"]

        self._next_iter_br_spec_lists_for_each_player[player].append(br_policy_spec)
        self._player_brs_are_finished_this_iter[player] = True

        all_players_finished_brs_this_iter = all(self._player_brs_are_finished_this_iter.values())
        if all_players_finished_brs_this_iter:
            print("Solving restricted game")
            restricted_game_solve_result = self._solve_restricted_game(
                log_dir=self.log_dir,
                br_spec_lists_for_each_player=self._next_iter_br_spec_lists_for_each_player,
                manager_metadata=self.get_manager_metadata())
            self._latest_metanash_spec_for_each_player = \
                restricted_game_solve_result.latest_metanash_spec_for_each_player

            self._restricted_game_episodes_this_iter += restricted_game_solve_result.episodes_spent_in_solve
            self._restricted_game_timesteps_this_iter += restricted_game_solve_result.timesteps_spent_in_solve

            self._episodes_total += (self._br_episodes_this_iter +
                                     self._restricted_game_episodes_this_iter)
            self._timesteps_total += (self._br_timesteps_this_iter +
                                      self._restricted_game_timesteps_this_iter)

            br_specs_added_this_iter = {
                player: player_br_spec_list[-1]
                for player, player_br_spec_list in self._next_iter_br_spec_lists_for_each_player.items()
            }

            data_to_log = {
                "episodes_total": self._episodes_total,
                "timesteps_total": self._timesteps_total,
                "br_episodes_this_iter": self._br_episodes_this_iter,
                "br_timesteps_this_iter": self._br_timesteps_this_iter,
                "restricted_game_episodes_this_iter": self._restricted_game_episodes_this_iter,
                "restricted_game_timesteps_this_iter": self._restricted_game_timesteps_this_iter,
                "br_specs_added_this_iter": {
                    player: spec.to_json()
                    for player, spec in br_specs_added_this_iter.items()
                },
                "metanash_specs": [
                    spec.to_json() for spec in self._latest_metanash_spec_for_each_player
                ],
            }

            if all("average_br_reward" in br_spec.metadata
                   for br_spec in br_specs_added_this_iter.values()):
                data_to_log["player_br_rewards_vs_previous_metanash"] = {
                    player: br_spec.metadata["average_br_reward"]
                    for player, br_spec in br_specs_added_this_iter.items()
                }

            assert "episodes_total" not in restricted_game_solve_result.extra_data_to_log
            assert "timesteps_total" not in restricted_game_solve_result.extra_data_to_log
            data_to_log.update(restricted_game_solve_result.extra_data_to_log)

            with open(self._json_log_path, "a+") as json_file:
                json_file.writelines([json.dumps(data_to_log) + '\n'])
            print(colored(
                f"(Graph this in a notebook) Saved manager stats (including exploitability if applicable) "
                f"to {self._json_log_path}", "green"))

            for checkpoint_player, player_metanash_spec in enumerate(
                    restricted_game_solve_result.latest_metanash_spec_for_each_player):
                checkpoint_path = os.path.join(
                    self.log_dir, "xfdo_metanash_specs",
                    f"{checkpoint_player}_metanash_{self._current_double_oracle_iteration}.json")
                ensure_dir(checkpoint_path)
                with open(checkpoint_path, "w+") as checkpoint_spec_file:
                    checkpoint_spec_file.write(player_metanash_spec.to_json())

            # Start the next double oracle iteration here.
            # A double oracle iteration is considered to be training BRs
            # followed by solving the new restricted game.
            self._current_double_oracle_iteration += 1
            self._br_episodes_this_iter = 0
            self._br_timesteps_this_iter = 0
            self._restricted_game_episodes_this_iter = 0
            self._restricted_game_timesteps_this_iter = 0
            self._player_brs_are_finished_this_iter = {
                p: False for p in range(self._n_players)
            }
            self._br_spec_lists_for_each_player = deepcopy(
                self._next_iter_br_spec_lists_for_each_player)
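# A minimal sketch (not part of the manager) for the "Graph this in a notebook" hint above.
# The manager appends one JSON object per double oracle iteration to self._json_log_path,
# so the log can be read back as newline-delimited JSON. The function name and the path
# argument are placeholders for illustration.
import json

def example_load_manager_stats(json_log_path: str) -> list:
    stats_per_iteration = []
    with open(json_log_path, "r") as json_file:
        for line in json_file:
            line = line.strip()
            if line:
                stats_per_iteration.append(json.loads(line))
    return stats_per_iteration

# e.g. plot [s["timesteps_total"] for s in stats_per_iteration] against an exploitability
# key if the restricted game solver logged one via extra_data_to_log.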
def train_off_policy_rl_nfsp_restricted_game(results_dir: str,
                                             scenario: NXDOScenario,
                                             player_to_base_game_action_specs: Dict[int, List[StrategySpec]],
                                             stopping_condition: StoppingCondition,
                                             manager_metadata: Union[dict, None],
                                             print_train_results: bool = True):
    use_openspiel_restricted_game: bool = scenario.use_openspiel_restricted_game
    get_restricted_game_custom_model = scenario.get_restricted_game_custom_model
    env_class = scenario.env_class
    base_env_config = scenario.env_config
    trainer_class = scenario.trainer_class_nfsp
    avg_trainer_class = scenario.avg_trainer_class_nfsp
    policy_classes: Dict[str, Type[Policy]] = scenario.policy_classes_nfsp
    anticipatory_param: float = scenario.anticipatory_param_nfsp
    get_trainer_config = scenario.get_trainer_config_nfsp
    get_avg_trainer_config = scenario.get_avg_trainer_config_nfsp
    get_trainer_config_br = scenario.get_trainer_config_br
    calculate_openspiel_metanash: bool = scenario.calculate_openspiel_metanash
    calculate_openspiel_metanash_at_end: bool = scenario.calculate_openspiel_metanash_at_end
    calc_metanash_every_n_iters: int = scenario.calc_metanash_every_n_iters
    should_log_result_fn = scenario.ray_should_log_result_filter
    metrics_smoothing_episodes_override: int = scenario.metanash_metrics_smoothing_episodes_override

    assert scenario.xdo_metanash_method == "nfsp"

    ray_head_address = manager_metadata.get("ray_head_address", None) if manager_metadata is not None else None
    init_ray_for_scenario(scenario=scenario, head_address=ray_head_address, logging_level=logging.INFO)

    def select_policy(agent_id):
        random_sample = np.random.random()
        if agent_id == 0:
            if random_sample < anticipatory_param:
                return "best_response_0"
            return "average_policy_0"
        elif agent_id == 1:
            if random_sample < anticipatory_param:
                return "best_response_1"
            return "average_policy_1"
        else:
            raise ValueError(f"unexpected agent_id: {agent_id}")

    def assert_not_called(agent_id):
        assert False, "This function should never be called."
    def _create_base_env():
        return env_class(env_config=base_env_config)

    tmp_base_env = _create_base_env()
    restricted_env_config = {"create_env_fn": _create_base_env}

    if use_openspiel_restricted_game:
        restricted_game_class = OpenSpielRestrictedGame
        tmp_env = restricted_game_class(env_config=restricted_env_config)
        restricted_game_action_spaces = [tmp_env.base_action_space for _ in range(2)]
    else:
        restricted_game_class = RestrictedGame
        restricted_env_config["use_delegate_policy_exploration"] = scenario.allow_stochastic_best_responses
        tmp_env = restricted_game_class(env_config=restricted_env_config)
        restricted_game_action_spaces = [Discrete(n=len(player_to_base_game_action_specs[p]))
                                         for p in range(2)]

    assert all(restricted_game_action_spaces[0] == space for space in restricted_game_action_spaces)

    print(f"\n\n\n\n\nRestricted game action spaces {restricted_game_action_spaces}\n\n\n\n\n\n")

    scenario_avg_trainer_config = get_avg_trainer_config(tmp_base_env)
    scenario_avg_trainer_config_exploration_config = scenario_avg_trainer_config.get("exploration_config", {})
    if scenario_avg_trainer_config_exploration_config:
        del scenario_avg_trainer_config["exploration_config"]

    scenario_trainer_config = get_trainer_config(tmp_base_env)
    scenario_trainer_config_exploration_config = scenario_trainer_config.get("exploration_config", {})
    if scenario_trainer_config_exploration_config:
        del scenario_trainer_config["exploration_config"]

    delegate_policy_config = merge_dicts(get_trainer_config_br(tmp_base_env),
                                         {"explore": scenario.allow_stochastic_best_responses})

    avg_trainer_config = merge_dicts({
        "log_level": "DEBUG",
        "framework": "torch",
        "env": restricted_game_class,
        "env_config": restricted_env_config,
        "num_gpus": 0.0,
        "num_gpus_per_worker": 0.0,
        "num_workers": 0,
        "num_envs_per_worker": 1,
        "multiagent": {
            "policies_to_train": ["average_policy_0", "average_policy_1"],
            "policies": {
                "average_policy_0": (
                    policy_classes["average_policy"], tmp_env.observation_space,
                    restricted_game_action_spaces[0],
                    {"explore": False,
                     "exploration_config": scenario_avg_trainer_config_exploration_config}),
                "average_policy_1": (
                    policy_classes["average_policy"], tmp_env.observation_space,
                    restricted_game_action_spaces[1],
                    {"explore": False,
                     "exploration_config": scenario_avg_trainer_config_exploration_config}),
                "delegate_policy": (
                    policy_classes["delegate_policy"], tmp_base_env.observation_space,
                    tmp_env.base_action_space, delegate_policy_config),
            },
            "policy_mapping_fn": assert_not_called,
        },
    }, scenario_avg_trainer_config)

    for _policy_id in ["average_policy_0", "average_policy_1"]:
        if get_restricted_game_custom_model is not None:
            avg_trainer_config["multiagent"]["policies"][_policy_id][3]["model"] = {
                "custom_model": get_restricted_game_custom_model(tmp_env)}

    avg_trainer = avg_trainer_class(
        config=avg_trainer_config,
        logger_creator=get_trainer_logger_creator(
            base_dir=results_dir,
            scenario_name="nfsp_restricted_game_avg_trainer",
            should_log_result_fn=should_log_result_fn))

    store_to_avg_policy_buffer = get_store_to_avg_policy_buffer_fn(nfsp_trainer=avg_trainer)

    class NFSPBestResponseCallbacks(DefaultCallbacks):

        def on_postprocess_trajectory(self, *, worker: "RolloutWorker", episode: MultiAgentEpisode,
                                      agent_id: AgentID, policy_id: PolicyID,
                                      policies: Dict[PolicyID, Policy],
                                      postprocessed_batch: SampleBatch,
                                      original_batches: Dict[Any, Tuple[Policy, SampleBatch]],
                                      **kwargs):
            super().on_postprocess_trajectory(worker=worker, episode=episode, agent_id=agent_id,
                                              policy_id=policy_id, policies=policies,
                                              postprocessed_batch=postprocessed_batch,
                                              original_batches=original_batches, **kwargs)
            postprocessed_batch.data["source_policy"] = [policy_id] * len(postprocessed_batch.data["rewards"])

            # All data from both policies will go into the best response's replay buffer.
            # Here we ensure policies not from the best response have the exact same preprocessing as the best response.
            for average_policy_id, br_policy_id in [("average_policy_0", "best_response_0"),
                                                    ("average_policy_1", "best_response_1")]:
                if policy_id == average_policy_id:
                    if "action_probs" in postprocessed_batch:
                        del postprocessed_batch.data["action_probs"]
                    if "behaviour_logits" in postprocessed_batch:
                        del postprocessed_batch.data["behaviour_logits"]

                    br_policy: Policy = policies[br_policy_id]
                    new_batch = br_policy.postprocess_trajectory(
                        sample_batch=postprocessed_batch,
                        other_agent_batches=original_batches,
                        episode=episode)
                    copy_attributes(src_obj=new_batch, dst_obj=postprocessed_batch)
                elif policy_id == br_policy_id:
                    if "q_values" in postprocessed_batch:
                        del postprocessed_batch.data["q_values"]
                    if "action_probs" in postprocessed_batch:
                        del postprocessed_batch.data["action_probs"]
                    del postprocessed_batch.data["action_dist_inputs"]

            if policy_id in ("average_policy_0", "best_response_0"):
                assert agent_id == 0
            if policy_id in ("average_policy_1", "best_response_1"):
                assert agent_id == 1

        def on_sample_end(self, *, worker: "RolloutWorker", samples: SampleBatch, **kwargs):
            super().on_sample_end(worker=worker, samples=samples, **kwargs)
            assert isinstance(samples, MultiAgentBatch)

            for policy_samples in samples.policy_batches.values():
                if "action_prob" in policy_samples.data:
                    del policy_samples.data["action_prob"]
                if "action_logp" in policy_samples.data:
                    del policy_samples.data["action_logp"]

            for average_policy_id, br_policy_id in [("average_policy_0", "best_response_0"),
                                                    ("average_policy_1", "best_response_1")]:
                for policy_id, policy_samples in samples.policy_batches.items():
                    if policy_id == br_policy_id:
                        store_to_avg_policy_buffer(MultiAgentBatch(policy_batches={
                            average_policy_id: policy_samples
                        }, env_steps=policy_samples.count))
                if average_policy_id in samples.policy_batches:
                    if br_policy_id in samples.policy_batches:
                        all_policies_samples = samples.policy_batches[br_policy_id].concat(
                            other=samples.policy_batches[average_policy_id])
                    else:
                        all_policies_samples = samples.policy_batches[average_policy_id]
                    del samples.policy_batches[average_policy_id]
                    samples.policy_batches[br_policy_id] = all_policies_samples

        def on_episode_end(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                           policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode,
                           env_index: int, **kwargs):
            super().on_episode_end(worker=worker, base_env=base_env, policies=policies,
                                   episode=episode, env_index=env_index, **kwargs)
            episode_policies = set(episode.agent_rewards.keys())
            if episode_policies == {(0, "average_policy_0"), (1, "best_response_1")}:
                worker.avg_br_reward_deque.add.remote(episode.agent_rewards[(1, "best_response_1")])
            elif episode_policies == {(1, "average_policy_1"), (0, "best_response_0")}:
                worker.avg_br_reward_deque.add.remote(episode.agent_rewards[(0, "best_response_0")])

        def on_train_result(self, *, trainer, result: dict, **kwargs):
            super().on_train_result(trainer=trainer, result=result, **kwargs)
            training_iteration = result["training_iteration"]
            result["avg_br_reward_both_players"] = ray.get(trainer.avg_br_reward_deque.get_mean.remote())

            if (calculate_openspiel_metanash and
                    (training_iteration == 1 or training_iteration % calc_metanash_every_n_iters == 0)):
                base_env = _create_base_env()
                open_spiel_env_config = base_env.open_spiel_env_config
                openspiel_game_version = base_env.game_version
                local_avg_policy_0 = trainer.workers.local_worker().policy_map["average_policy_0"]
                local_avg_policy_1 = trainer.workers.local_worker().policy_map["average_policy_1"]
                exploitability = nxdo_nfsp_measure_exploitability_nonlstm(
                    rllib_policies=[local_avg_policy_0, local_avg_policy_1],
                    poker_game_version=openspiel_game_version,
                    restricted_game_convertors=trainer.get_local_converters(),
                    open_spiel_env_config=open_spiel_env_config,
                    use_delegate_policy_exploration=scenario.allow_stochastic_best_responses)
                result["avg_policy_exploitability"] = exploitability

    br_trainer_config = {
        "log_level": "DEBUG",
        "callbacks": NFSPBestResponseCallbacks,
        "env": restricted_game_class,
        "env_config": restricted_env_config,
        "gamma": 1.0,
        "num_gpus": 0.0,
        "num_workers": 0,
        "num_gpus_per_worker": 0.0,
        "num_envs_per_worker": 1,
        "multiagent": {
            "policies_to_train": ["best_response_0", "best_response_1"],
            "policies": {
                "average_policy_0": (
                    policy_classes["average_policy"], tmp_env.observation_space,
                    restricted_game_action_spaces[0],
                    {"explore": False,
                     "exploration_config": scenario_avg_trainer_config_exploration_config}),
                "best_response_0": (
                    policy_classes["best_response"], tmp_env.observation_space,
                    restricted_game_action_spaces[0],
                    {"exploration_config": scenario_trainer_config_exploration_config}),
                "average_policy_1": (
                    policy_classes["average_policy"], tmp_env.observation_space,
                    restricted_game_action_spaces[1],
                    {"explore": False,
                     "exploration_config": scenario_avg_trainer_config_exploration_config}),
                "best_response_1": (
                    policy_classes["best_response"], tmp_env.observation_space,
                    restricted_game_action_spaces[1],
                    {"exploration_config": scenario_trainer_config_exploration_config}),
                "delegate_policy": (
                    policy_classes["delegate_policy"], tmp_base_env.observation_space,
                    tmp_env.base_action_space, delegate_policy_config),
            },
            "policy_mapping_fn": select_policy,
        },
    }

    assert all(restricted_game_action_spaces[0] == space for space in restricted_game_action_spaces), \
        "If not true, the line below with \"get_trainer_config\" may need to be changed to a better solution."
    br_trainer_config = merge_dicts(br_trainer_config, scenario_trainer_config)

    for _policy_id in ["average_policy_0", "average_policy_1", "best_response_0", "best_response_1"]:
        if get_restricted_game_custom_model is not None:
            br_trainer_config["multiagent"]["policies"][_policy_id][3]["model"] = {
                "custom_model": get_restricted_game_custom_model(tmp_env)}

    br_trainer_config["metrics_smoothing_episodes"] = metrics_smoothing_episodes_override

    br_trainer = trainer_class(config=br_trainer_config,
                               logger_creator=get_trainer_logger_creator(
                                   base_dir=results_dir,
                                   scenario_name="nfsp_restricted_game_trainer",
                                   should_log_result_fn=should_log_result_fn))

    avg_br_reward_deque = StatDeque.remote(max_items=br_trainer_config["metrics_smoothing_episodes"])

    def _set_avg_br_rew_deque(worker: RolloutWorker):
        worker.avg_br_reward_deque = avg_br_reward_deque

    br_trainer.workers.foreach_worker(_set_avg_br_rew_deque)
    br_trainer.avg_br_reward_deque = avg_br_reward_deque

    if use_openspiel_restricted_game:
        local_delegate_policy = br_trainer.workers.local_worker().policy_map["delegate_policy"]
        player_converters = []
        for p in range(2):
            print("Creating restricted game obs conversions...")
            convertor = get_restricted_game_obs_conversions(
                player=p, delegate_policy=local_delegate_policy,
                policy_specs=player_to_base_game_action_specs[p],
                load_policy_spec_fn=create_get_pure_strat_cached(cache={}),
                tmp_base_env=tmp_base_env)
            player_converters.append(convertor)
        for _trainer in [br_trainer, avg_trainer]:
            def _set_worker_converters(worker: RolloutWorker):
                worker_delegate_policy = worker.policy_map["delegate_policy"]
                for p in range(2):
                    worker.foreach_env(lambda env: env.set_obs_conversion_dict(p, player_converters[p]))
                worker_delegate_policy.player_converters = player_converters

            _trainer.workers.foreach_worker(_set_worker_converters)
            _trainer.get_local_converters = lambda: _trainer.workers.local_worker().policy_map[
                "delegate_policy"].player_converters
    else:
        weights_cache = {}
        for _trainer in [br_trainer, avg_trainer]:
            def _set_worker_converters(worker: RolloutWorker):
                worker_delegate_policy = worker.policy_map["delegate_policy"]
                player_converters = []
                for p in range(2):
                    player_converter = RestrictedToBaseGameActionSpaceConverter(
                        delegate_policy=worker_delegate_policy,
                        policy_specs=player_to_base_game_action_specs[p],
                        load_policy_spec_fn=create_get_pure_strat_cached(cache=weights_cache))
                    player_converters.append(player_converter)
                    worker.foreach_env(lambda env: env.set_action_conversion(p, player_converter))
                worker_delegate_policy.player_converters = player_converters

            _trainer.workers.foreach_worker(_set_worker_converters)
            _trainer.get_local_converters = lambda: _trainer.workers.local_worker().policy_map[
                "delegate_policy"].player_converters

    br_trainer.latest_avg_trainer_result = None
    train_iter_count = 0

    for _trainer in [br_trainer, avg_trainer]:
        for policy_id, policy in _trainer.workers.local_worker().policy_map.items():
            policy.policy_id = policy_id

    if len(player_to_base_game_action_specs[0]) == 1:
        final_train_result = {"episodes_total": 0, "timesteps_total": 0, "training_iteration": 0}
        tmp_callback = NFSPBestResponseCallbacks()
        tmp_callback.on_train_result(trainer=br_trainer, result=final_train_result)
    else:
        avg_weights = avg_trainer.get_weights(["average_policy_0", "average_policy_1"])
        br_trainer.workers.foreach_worker(lambda worker: worker.set_weights(avg_weights))
        while True:
            avg_train_results = avg_trainer.train()
            avg_weights = avg_trainer.get_weights(["average_policy_0", "average_policy_1"])
            br_trainer.workers.foreach_worker(lambda worker: worker.set_weights(avg_weights))
            br_trainer.latest_avg_trainer_result = copy.deepcopy(avg_train_results)

            train_iter_results = br_trainer.train()  # do a step (or several) in the main RL loop
            train_iter_count += 1

            if print_train_results:
                # Delete verbose debugging info before printing
                if "hist_stats" in train_iter_results:
                    del train_iter_results["hist_stats"]
                if "td_error" in train_iter_results["info"]["learner"]["best_response_0"]:
                    del train_iter_results["info"]["learner"]["best_response_0"]["td_error"]
                if "td_error" in train_iter_results["info"]["learner"]["best_response_1"]:
                    del train_iter_results["info"]["learner"]["best_response_1"]["td_error"]
                print(pretty_dict_str(train_iter_results))
                print(f"Trainer logdir is {br_trainer.logdir}")

            if stopping_condition.should_stop_this_iter(latest_trainer_result=train_iter_results):
                print("stopping condition met.")
                final_train_result = deepcopy(train_iter_results)
                break

    if calculate_openspiel_metanash_at_end:
        base_env = _create_base_env()
        open_spiel_env_config = base_env.open_spiel_env_config
        openspiel_game_version = base_env.game_version
        local_avg_policy_0 = br_trainer.workers.local_worker().policy_map["average_policy_0"]
        local_avg_policy_1 = br_trainer.workers.local_worker().policy_map["average_policy_1"]
        exploitability = nxdo_nfsp_measure_exploitability_nonlstm(
            rllib_policies=[local_avg_policy_0, local_avg_policy_1],
            poker_game_version=openspiel_game_version,
            restricted_game_convertors=br_trainer.get_local_converters(),
            open_spiel_env_config=open_spiel_env_config,
            use_delegate_policy_exploration=scenario.allow_stochastic_best_responses)
        final_train_result["avg_policy_exploitability"] = exploitability

    if "avg_policy_exploitability" in final_train_result:
        print(f"\n\nexploitability: {final_train_result['avg_policy_exploitability']}\n\n")

    avg_policy_specs = []
    for player in range(2):
        strategy_id = f"avg_policy_player_{player}_{datetime_str()}"
        checkpoint_path = save_nfsp_avg_policy_checkpoint(
            trainer=br_trainer,
            policy_id_to_save=f"average_policy_{player}",
            save_dir=checkpoint_dir(trainer=br_trainer),
            timesteps_training=final_train_result["timesteps_total"],
            episodes_training=final_train_result["episodes_total"],
            checkpoint_name=f"{strategy_id}.h5")
        avg_policy_spec = StrategySpec(
            strategy_id=strategy_id,
            metadata={"checkpoint_path": checkpoint_path,
                      "delegate_policy_specs": [
                          spec.to_json() for spec in player_to_base_game_action_specs[player]
                      ]})
        avg_policy_specs.append(avg_policy_spec)

    ray.kill(avg_trainer.workers.local_worker().replay_buffer_actor)
    avg_trainer.cleanup()
    br_trainer.cleanup()
    del avg_trainer
    del br_trainer
    del avg_br_reward_deque

    time.sleep(10)

    assert final_train_result is not None
    return avg_policy_specs, final_train_result
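# A minimal sketch (assumptions noted) of a stopping condition compatible with the training
# loop above, which only calls should_stop_this_iter(latest_trainer_result=...). Subclassing
# StoppingCondition this way and any base-class interface beyond that one method are
# assumptions; "timesteps_total" is a standard RLlib training-result key used elsewhere here.
class ExampleTimestepsStoppingCondition(StoppingCondition):

    def __init__(self, max_timesteps: int):
        self._max_timesteps = max_timesteps

    def should_stop_this_iter(self, latest_trainer_result: dict) -> bool:
        # Stop once the trainer reports having sampled at least max_timesteps env steps.
        return latest_trainer_result.get("timesteps_total", 0) >= self._max_timesteps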
    dashboard_port=find_free_port(),
    ignore_reinit_error=True,
    logging_level=logging.INFO,
    log_to_driver=os.getenv("RAY_LOG_TO_DRIVER", False))

def select_policy(agent_id):
    if agent_id == br_player:
        return "best_response"
    else:
        return "average_policy"

avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(tmp_env)["model"]

player_0_avg_pol_spec = StrategySpec.from_json_file(
    "/home/jblanier/git/grl/grl/data/oshi_zumo_tiny_nfsp_dqn_sparse_01.54.50PM_Apr-08-20218z_hf4wq/avg_policy_checkpoint_specs/average_policy_player_0_iter_214000.json")

class HyperParamSearchCallbacks(DefaultCallbacks):

    def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                         policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode,
                         env_index: int, **kwargs):
        super().on_episode_start(worker=worker, base_env=base_env, policies=policies,
                                 episode=episode, env_index=env_index, **kwargs)
        if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
            avg_policy = worker.policy_map["average_policy"]
            load_pure_strat(policy=avg_policy, pure_strat_spec=player_0_avg_pol_spec)
            worker.avg_pol_loaded = True

    def on_train_result(self, *, trainer, result: dict, **kwargs):
        super().on_train_result(trainer=trainer, result=result, **kwargs)
    ignore_reinit_error=True,
    logging_level=logging.INFO,
    log_to_driver=os.getenv("RAY_LOG_TO_DRIVER", False))

def select_policy(agent_id):
    if agent_id == br_player:
        return "best_response"
    else:
        return "average_policy"

avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(tmp_env)["model"]

player_0_avg_pol_spec = StrategySpec.from_json_file(
    "/home/jblanier/git/grl/grl/data/oshi_zumo_medium_nfsp_dqn_sparse_01.55.05PM_Apr-08-2021ta6arraq/avg_policy_checkpoint_specs/average_policy_player_0_iter_221000.json")

class HyperParamSearchCallbacks(DefaultCallbacks):

    def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                         policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode,
                         env_index: int, **kwargs):
        super().on_episode_start(worker=worker, base_env=base_env, policies=policies,
                                 episode=episode, env_index=env_index, **kwargs)
        if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
            avg_policy = worker.policy_map["average_policy"]
            load_pure_strat(policy=avg_policy, pure_strat_spec=player_0_avg_pol_spec)
            worker.avg_pol_loaded = True
    ignore_reinit_error=True,
    logging_level=logging.INFO,
    log_to_driver=os.getenv("RAY_LOG_TO_DRIVER", False))

def select_policy(agent_id):
    if agent_id == br_player:
        return "best_response"
    else:
        return "average_policy"

avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(tmp_env)["model"]

player_0_avg_pol_spec = StrategySpec.from_json_file(
    "/home/jblanier/git/grl/grl/data/leduc_nfsp_dqn_sparse_02.34.06PM_Apr-08-2021bt5ym0l8/avg_policy_checkpoint_specs/average_policy_player_0_iter_263000.json")

class HyperParamSearchCallbacks(DefaultCallbacks):

    def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                         policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode,
                         env_index: int, **kwargs):
        super().on_episode_start(worker=worker, base_env=base_env, policies=policies,
                                 episode=episode, env_index=env_index, **kwargs)
        if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
            avg_policy = worker.policy_map["average_policy"]
            load_pure_strat(policy=avg_policy, pure_strat_spec=player_0_avg_pol_spec)
            worker.avg_pol_loaded = True