Code Example #1
File: simple_self_play.py Project: indylab/nxdo
        def on_train_result(self, *, trainer, result: dict, **kwargs):
            result["scenario_name"] = trainer.scenario_name
            training_iteration = result["training_iteration"]
            super().on_train_result(trainer=trainer, result=result, **kwargs)

            if training_iteration % checkpoint_every_n_iters == 0 or training_iteration == 1:
                for player in range(2):
                    checkpoint_metadata = create_metadata_with_new_checkpoint(
                        policy_id_to_save=f"best_response_{player}",
                        br_trainer=trainer,
                        policy_player=player,
                        save_dir=checkpoint_dir(trainer=trainer),
                        timesteps_training=result["timesteps_total"],
                        episodes_training=result["episodes_total"],
                        checkpoint_name=
                        f"best_response_player_{player}_iter_{training_iteration}.h5"
                    )
                    joint_pol_checkpoint_spec = StrategySpec(
                        strategy_id=
                        f"best_response_player_{player}_iter_{training_iteration}",
                        metadata=checkpoint_metadata)
                    checkpoint_path = os.path.join(
                        spec_checkpoint_dir(trainer),
                        f"best_response_player_{player}_iter_{training_iteration}.json"
                    )
                    ensure_dir(checkpoint_path)
                    with open(checkpoint_path, "+w") as checkpoint_spec_file:
                        checkpoint_spec_file.write(
                            joint_pol_checkpoint_spec.to_json())
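
The callback above writes each best-response checkpoint spec to disk as JSON via StrategySpec.to_json(). Below is a minimal companion sketch (not from the nxdo source) of loading the most recent spec back; it assumes StrategySpec is importable from the grl codebase and that spec_dir points at the directory produced by spec_checkpoint_dir(trainer). StrategySpec.from_json_file is used the same way in the hyperparameter-search snippets further down this page.

import glob
import os

def load_latest_br_spec(spec_dir: str, player: int):
    # Spec files are named best_response_player_{player}_iter_{iteration}.json above.
    pattern = os.path.join(spec_dir, f"best_response_player_{player}_iter_*.json")
    spec_paths = sorted(glob.glob(pattern), key=os.path.getmtime)
    if not spec_paths:
        return None
    return StrategySpec.from_json_file(spec_paths[-1])  # assumes StrategySpec is in scope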
Code Example #2
    def claim_new_active_policy_for_player(
        self, player
    ) -> Union[Tuple[Dict[int, StrategySpec], Dict[int, List[StrategySpec]],
                     int], Tuple[None, None, None]]:
        request = NXDOPlayer(player=player)
        response: NXDONewBestResponseParams = self._stub.ClaimNewActivePolicyForPlayer(
            request)

        if response.policy_num == -1:
            return None, None, None

        assert len(response.metanash_specs_for_players.policy_spec_list) in [
            self.n_players(), 0
        ]
        assert len(
            response.delegate_specs_for_players) in [self.n_players(), 0]

        metanash_json_specs_for_other_players = [
            elem.policy_spec_json
            for elem in response.metanash_specs_for_players.policy_spec_list
        ]

        metanash_specs_for_players = {
            player: StrategySpec.from_json(json_spec)
            for player, json_spec in enumerate(
                metanash_json_specs_for_other_players)
        }

        delegate_json_spec_lists_for_other_players = [[
            elem.policy_spec_json
            for elem in player_delegate_list.policy_spec_list
        ] for player_delegate_list in response.delegate_specs_for_players]
        delegate_specs_for_players = {
            player: [
                StrategySpec.from_json(json_spec)
                for json_spec in player_delegate_json_list
            ]
            for player, player_delegate_json_list in enumerate(
                delegate_json_spec_lists_for_other_players)
        }

        if len(metanash_specs_for_players) == 0:
            metanash_specs_for_players = None

        if len(delegate_specs_for_players) == 0:
            delegate_specs_for_players = None

        return (metanash_specs_for_players, delegate_specs_for_players,
                response.policy_num)
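
A brief usage sketch (not from the nxdo source) of the client method above. manager_client is a hypothetical name for an instance of the class this method belongs to; the only behavior taken from the source is that (None, None, None) is returned when the server reports policy_num == -1.

# Hypothetical client instance; only the return convention comes from the method above.
metanash_specs, delegate_specs, policy_num = \
    manager_client.claim_new_active_policy_for_player(player=0)

if policy_num is None:
    print("No active policy was claimed for player 0.")
else:
    print(f"Claimed active policy number {policy_num} for player 0.")
    # When present, metanash_specs maps player index -> StrategySpec and
    # delegate_specs maps player index -> list of StrategySpecs, as built above.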
Code Example #3
File: remote.py Project: indylab/nxdo
    def RequestExternalEval(self, request: EvalRequest, context):
        policy_specs_for_each_player = tuple(
            StrategySpec.from_json(json_string=json_string)
            for json_string in request.json_policy_specs_for_each_player)
        self._manager.request_external_eval(
            policy_specs_for_each_player=policy_specs_for_each_player)
        return Confirmation(result=True)
Code Example #4
File: remote.py Project: indylab/nxdo
    def take_eval_job(self) -> (Union[None, Tuple[StrategySpec]], int):
        response: EvalJob = self._stub.TakeEvalJob(Empty())
        policy_specs_for_each_player = tuple(
            StrategySpec.from_json(json_string=json_string)
            for json_string in response.json_policy_specs_for_each_player)
        if len(policy_specs_for_each_player) == 0:
            return None, response.required_games_to_play
        return policy_specs_for_each_player, response.required_games_to_play
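
A hedged polling sketch built around take_eval_job (not from the nxdo source). eval_client and play_matchup are hypothetical names; the only convention taken from the method above is that (None, required_games_to_play) means no evaluation job is queued.

import time

def run_eval_worker(eval_client, play_matchup, poll_seconds: float = 5.0):
    while True:
        policy_specs, required_games = eval_client.take_eval_job()
        if policy_specs is None:
            time.sleep(poll_seconds)  # nothing queued yet; poll again
            continue
        for _ in range(required_games):
            play_matchup(policy_specs)  # hypothetical helper: play one evaluation game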
Code Example #5
File: remote.py Project: indylab/nxdo
    def SubmitEvalJobResult(self, request: EvalJobResult, context):
        policy_specs_for_each_player = tuple(
            StrategySpec.from_json(json_string=json_string)
            for json_string in request.json_policy_specs_for_each_player)
        self._eval_dispatcher.submit_eval_job_result(
            policy_specs_for_each_player_tuple=policy_specs_for_each_player,
            payoffs_for_each_player=request.payoffs_for_each_player,
            games_played=request.games_played)
        return EvalConfirmation(result=True)
Code Example #6
        def on_train_result(self, *, trainer, result: dict, **kwargs):
            super().on_train_result(trainer=trainer, result=result, **kwargs)
            result["scenario_name"] = trainer.scenario_name
            result["avg_br_reward_both_players"] = ray.get(trainer.avg_br_reward_deque.get_mean.remote())

            training_iteration = result["training_iteration"]
            if (calculate_openspiel_metanash and
                    (training_iteration == 1 or training_iteration % calc_metanash_every_n_iters == 0)):
                base_env = _create_env()
                open_spiel_env_config = base_env.open_spiel_env_config
                openspiel_game_version = base_env.game_version
                local_avg_policy_0 = trainer.workers.local_worker().policy_map["average_policy_0"]
                local_avg_policy_1 = trainer.workers.local_worker().policy_map["average_policy_1"]
                exploitability = nfsp_measure_exploitability_nonlstm(
                    rllib_policies=[local_avg_policy_0, local_avg_policy_1],
                    poker_game_version=openspiel_game_version,
                    open_spiel_env_config=open_spiel_env_config
                )
                result["avg_policy_exploitability"] = exploitability
                logger.info(colored(
                    f"(Graph this in a notebook) Exploitability: {exploitability} - Saving exploitability stats "
                    f"to {os.path.join(trainer.logdir, 'result.json')}", "green"))

            if checkpoint_every_n_iters and (training_iteration % checkpoint_every_n_iters == 0 or training_iteration == 1):
                for player in range(2):
                    checkpoint_metadata = create_metadata_with_new_checkpoint(
                        policy_id_to_save=f"average_policy_{player}",
                        br_trainer=br_trainer,
                        save_dir=checkpoint_dir(trainer=br_trainer),
                        timesteps_training=result["timesteps_total"],
                        episodes_training=result["episodes_total"],
                        checkpoint_name=f"average_policy_player_{player}_iter_{training_iteration}.h5"
                    )
                    avg_pol_checkpoint_spec = StrategySpec(
                        strategy_id=f"avg_pol_player_{player}_iter_{training_iteration}",
                        metadata=checkpoint_metadata)
                    checkpoint_path = os.path.join(spec_checkpoint_dir(br_trainer),
                                                   f"average_policy_player_{player}_iter_{training_iteration}.json")
                    ensure_dir(checkpoint_path)
                    with open(checkpoint_path, "+w") as checkpoint_spec_file:
                        checkpoint_spec_file.write(avg_pol_checkpoint_spec.to_json())
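
The log message above points at result.json in the trainer's logdir. Below is a hedged sketch of "graph this in a notebook": it assumes RLlib's result.json contains one JSON object per training iteration (standard RLlib JSON logging) and that avg_policy_exploitability is the key set above; the path is a placeholder.

import json
import os

def read_exploitability_series(trainer_logdir: str):
    iterations, exploitability_values = [], []
    with open(os.path.join(trainer_logdir, "result.json"), "r") as result_file:
        for line in result_file:
            row = json.loads(line)
            if "avg_policy_exploitability" in row:
                iterations.append(row["training_iteration"])
                exploitability_values.append(row["avg_policy_exploitability"])
    return iterations, exploitability_values

# iters, expl = read_exploitability_series("/path/to/trainer/logdir")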
Code Example #7
File: remote.py Project: indylab/nxdo
    def SubmitEmpiricalPayoffResult(self, request: PayoffResult, context):
        policy_specs_for_each_player = tuple(
            StrategySpec.from_json(json_string=json_string)
            for json_string in request.json_policy_specs_for_each_player)
        # noinspection PyTypeChecker
        self._manager.submit_empirical_payoff_result(
            policy_specs_for_each_player=policy_specs_for_each_player,
            payoffs_for_each_player=request.payoffs_for_each_player,
            games_played=request.games_played,
            override_all_previous_results=request.override_all_previous_results
        )
        return Confirmation(result=True)
Code Example #8
File: remote.py Project: indylab/nxdo
    def claim_new_active_policy_for_player(
            self, player, new_policy_metadata_dict) -> StrategySpec:
        try:
            metadata_json = json.dumps(obj=new_policy_metadata_dict)
        except (TypeError, OverflowError) as json_err:
            raise ValueError(
                f"new_policy_metadata_dict must be JSON serializable. "
                f"When attempting to serialize, got this error:\n{json_err}")
        request = NewActivePolicyRequest(player=player,
                                         metadata_json=metadata_json)
        response: PolicySpecJson = self._stub.ClaimNewActivePolicyForPlayer(
            request)
        return StrategySpec.from_json(response.policy_spec_json)
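
The try/except around json.dumps above (and in the two similar client methods below) is a guard that fails fast when metadata is not JSON-serializable, before anything is sent over gRPC. A standard-library-only sketch of the same pattern in isolation:

import json

def require_json_serializable(metadata_dict: dict) -> str:
    try:
        return json.dumps(obj=metadata_dict)
    except (TypeError, OverflowError) as json_err:
        raise ValueError(
            "metadata_dict must be JSON serializable. "
            f"When attempting to serialize, got this error:\n{json_err}")

require_json_serializable({"timesteps_training_br": 1000})   # fine
# require_json_serializable({"policy": object()})            # raises ValueError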
Code Example #9
File: payoff_table.py Project: indylab/nxdo
    def add_new_pure_strategy(self,
                              player,
                              strategy_id,
                              metadata=None) -> StrategySpec:
        with self._modification_lock:
            existing_spec: StrategySpec = self._strat_ids_to_specs.get(
                strategy_id)
            if existing_spec is not None:
                existing_spec.update_metadata(new_metadata=metadata)
                spec = existing_spec
            else:
                spec = StrategySpec(strategy_id=strategy_id, metadata=metadata)

            new_strategy_index = self._payoff_matrices_per_player[0].shape[
                player]
            spec.assign_pure_strat_index(player=player,
                                         pure_strat_index=new_strategy_index)

            pad_size = 1
            pad_axis = player
            npad = [(0, 0)] * self._payoff_matrices_per_player[0].ndim
            npad[pad_axis] = (0, pad_size)

            for i in range(self._n_players):
                self._payoff_matrices_per_player[i] = np.pad(
                    self._payoff_matrices_per_player[i],
                    pad_width=npad,
                    mode='constant',
                    constant_values=0)
                self._games_played_matrices_per_player[i] = np.pad(
                    self._games_played_matrices_per_player[i],
                    pad_width=npad,
                    mode='constant',
                    constant_values=0)

            self._player_and_strat_index_to_strat_id[(
                player, new_strategy_index)] = spec.id
            self._strat_ids_to_specs[strategy_id] = spec
            return spec
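
A standalone numpy sketch of the padding step used above: when a player gains a new pure strategy, every payoff matrix is padded with one extra slot of zeros along that player's axis. The 2x2 values are made up for illustration.

import numpy as np

payoff_matrix = np.array([[1.0, -1.0],
                          [0.5, 0.0]])

player = 1                                  # new pure strategy for the column player
npad = [(0, 0)] * payoff_matrix.ndim
npad[player] = (0, 1)                       # append one slot along that player's axis

padded = np.pad(payoff_matrix, pad_width=npad, mode='constant', constant_values=0)
print(padded.shape)                         # (2, 3): the new column starts as all zeros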
Code Example #10
File: remote.py Project: indylab/nxdo
    def set_active_policy_as_fixed(self, player, policy_num,
                                   final_metadata_dict) -> StrategySpec:
        try:
            metadata_json = json.dumps(obj=final_metadata_dict,
                                       cls=SafeFallbackJSONEncoder)
        except (TypeError, OverflowError) as json_err:
            raise ValueError(
                f"final_metadata_dict must be JSON serializable. "
                f"When attempting to serialize, got this error:\n{json_err}")
        request = PolicyMetadataRequest(player=player,
                                        policy_num=policy_num,
                                        metadata_json=metadata_json)
        response: PolicySpecJson = self._stub.SetActivePolicyAsFixed(request)
        return StrategySpec.from_json(response.policy_spec_json)
Code Example #11
File: remote.py Project: indylab/nxdo
    def submit_new_active_policy_metadata(self, player, policy_num,
                                          metadata_dict) -> StrategySpec:
        try:
            metadata_json = json.dumps(obj=metadata_dict)
        except (TypeError, OverflowError) as json_err:
            raise ValueError(
                f"metadata_dict must be JSON serializable. "
                f"When attempting to serialize, got this error:\n{json_err}")
        request = PolicyMetadataRequest(player=player,
                                        policy_num=policy_num,
                                        metadata_json=metadata_json)
        response: PolicySpecJson = self._stub.SubmitNewActivePolicyMetadata(
            request)
        return StrategySpec.from_json(response.policy_spec_json)
Code Example #12
File: payoff_table.py Project: indylab/nxdo
    def from_json_string(cls, json_string):
        json_dict = json.loads(s=json_string)
        strat_ids_to_specs = {
            strat_id: StrategySpec.from_dict(serialized_dict=spec_dict)
            for strat_id, spec_dict in json_dict["strat_ids_to_specs"].items()
        }
        player_and_strat_index_to_strat_ids = {
            string_to_int_tuple(s=player_and_strat_index): strat_id
            for player_and_strat_index, strat_id in
            json_dict["player_and_strat_index_to_strat_id"].items()
        }
        return PayoffTable(
            n_players=json_dict["n_players"],
            exponential_average_coeff=json_dict["exponential_average_coeff"],
            restore_strat_ids_to_specs=strat_ids_to_specs,
            restore_player_and_strat_index_to_strat_ids=
            player_and_strat_index_to_strat_ids,
            restore_payoff_matrices_per_player=json_dict[
                "payoff_matrices_per_player"],
            restore_games_played_matrices_per_player=json_dict[
                "games_played_matrices_per_player"])
Code Example #13
                            ignore_reinit_error=True,
                            logging_level=logging.INFO,
                            log_to_driver=os.getenv("RAY_LOG_TO_DRIVER",
                                                    False))

    def select_policy(agent_id):
        if agent_id == br_player:
            return "best_response"
        else:
            return f"average_policy"

    avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(
        tmp_env)["model"]

    player_0_avg_pol_spec = StrategySpec.from_json_file(
        "/home/jblanier/git/grl/grl/data/1000_oshi_zumo_nfsp_larger_dqn_larger_sparse_10.56.11PM_Mar-24-20217lav0isx/avg_policy_checkpoint_specs/average_policy_player_0_iter_53000.json"
    )

    class HyperParamSearchCallbacks(DefaultCallbacks):
        def on_episode_start(self, *, worker: "RolloutWorker",
                             base_env: BaseEnv, policies: Dict[PolicyID,
                                                               Policy],
                             episode: MultiAgentEpisode, env_index: int,
                             **kwargs):
            super().on_episode_start(worker=worker,
                                     base_env=base_env,
                                     policies=policies,
                                     episode=episode,
                                     env_index=env_index,
                                     **kwargs)
            if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
                avg_policy = worker.policy_map["average_policy"]
                load_pure_strat(policy=avg_policy,
                                pure_strat_spec=player_0_avg_pol_spec)
                worker.avg_pol_loaded = True
Code Example #14
        ignore_reinit_error=True,
        logging_level=logging.INFO,
        log_to_driver=os.getenv("RAY_LOG_TO_DRIVER", False))


    def select_policy(agent_id):
        if agent_id == br_player:
            return "best_response"
        else:
            return f"average_policy"


    avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(tmp_env)["model"]

    player_0_avg_pol_spec = StrategySpec.from_json_file(
        "/home/jblanier/git/grl/grl/data/loss_game_nfsp_10_moves_alpha_2.9_sparse_12.07.15AM_May-18-202120bfveou/avg_policy_checkpoint_specs/average_policy_player_0_iter_30000.json"
    )


    class HyperParamSearchCallbacks(DefaultCallbacks):

        def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
                             episode: MultiAgentEpisode, env_index: int, **kwargs):
            super().on_episode_start(worker=worker, base_env=base_env, policies=policies, episode=episode,
                                     env_index=env_index, **kwargs)
            if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
                avg_policy = worker.policy_map["average_policy"]
                load_pure_strat(policy=avg_policy, pure_strat_spec=player_0_avg_pol_spec)
                worker.avg_pol_loaded = True

        def on_train_result(self, *, trainer, result: dict, **kwargs):
Code Example #15
    def submit_final_br_policy(self, player, policy_num, metadata_dict):
        with self.modification_lock:
            if player < 0 or player >= self._n_players:
                raise ValueError(
                    f"player {player} is out of range. Must be in [0, n_players)."
                )
            if policy_num != self._current_double_oracle_iteration:
                raise ValueError(
                    f"Policy {policy_num} isn't the same as the current double oracle iteration "
                    f"{self._current_double_oracle_iteration}.")

            br_policy_spec: StrategySpec = StrategySpec(
                strategy_id=self._strat_id(player=player,
                                           policy_num=policy_num),
                metadata=metadata_dict,
                pure_strategy_indexes={player: policy_num})

            self._br_episodes_this_iter += metadata_dict[
                "episodes_training_br"]
            self._br_timesteps_this_iter += metadata_dict[
                "timesteps_training_br"]

            self._next_iter_br_spec_lists_for_each_player[player].append(
                br_policy_spec)
            self._player_brs_are_finished_this_iter[player] = True

            all_players_finished_brs_this_iter = all(
                self._player_brs_are_finished_this_iter.values())
            if all_players_finished_brs_this_iter:
                print("Solving restricted game")
                restricted_game_solve_result = self._solve_restricted_game(
                    log_dir=self.log_dir,
                    br_spec_lists_for_each_player=self._next_iter_br_spec_lists_for_each_player,
                    manager_metadata=self.get_manager_metadata())
                self._latest_metanash_spec_for_each_player = restricted_game_solve_result.latest_metanash_spec_for_each_player

                self._restricted_game_episodes_this_iter += restricted_game_solve_result.episodes_spent_in_solve
                self._restricted_game_timesteps_this_iter += restricted_game_solve_result.timesteps_spent_in_solve

                self._episodes_total += (
                    self._br_episodes_this_iter +
                    self._restricted_game_episodes_this_iter)
                self._timesteps_total += (
                    self._br_timesteps_this_iter +
                    self._restricted_game_timesteps_this_iter)

                br_specs_added_this_iter = {
                    player: player_br_spec_list[-1]
                    for player, player_br_spec_list in
                    self._next_iter_br_spec_lists_for_each_player.items()
                }

                data_to_log = {
                    "episodes_total":
                    self._episodes_total,
                    "timesteps_total":
                    self._timesteps_total,
                    "br_episodes_this_iter":
                    self._br_episodes_this_iter,
                    "br_timesteps_this_iter":
                    self._br_timesteps_this_iter,
                    "restricted_game_episodes_this_iter":
                    self._restricted_game_episodes_this_iter,
                    "restricted_game_timesteps_this_iter":
                    self._restricted_game_timesteps_this_iter,
                    "br_specs_added_this_iter": {
                        player: spec.to_json()
                        for player, spec in br_specs_added_this_iter.items()
                    },
                    "metanash_specs": [
                        spec.to_json()
                        for spec in self._latest_metanash_spec_for_each_player
                    ],
                }
                if all("average_br_reward" in br_spec.metadata
                       for br_spec in br_specs_added_this_iter.values()):
                    data_to_log["player_br_rewards_vs_previous_metanash"] = {
                        player: br_spec.metadata["average_br_reward"]
                        for player, br_spec in
                        br_specs_added_this_iter.items()
                    }

                assert "episodes_total" not in restricted_game_solve_result.extra_data_to_log
                assert "timesteps_total" not in restricted_game_solve_result.extra_data_to_log
                data_to_log.update(
                    restricted_game_solve_result.extra_data_to_log)

                with open(self._json_log_path, "+a") as json_file:
                    json_file.writelines([json.dumps(data_to_log) + '\n'])
                print(
                    colored(
                        f"(Graph this in a notebook) Saved manager stats (including exploitability if applicable) "
                        f"to {self._json_log_path}", "green"))

                for checkpoint_player, player_metanash_spec in enumerate(
                        restricted_game_solve_result.latest_metanash_spec_for_each_player):
                    checkpoint_path = os.path.join(
                        self.log_dir, "xfdo_metanash_specs",
                        f"{checkpoint_player}_metanash_{self._current_double_oracle_iteration}.json"
                    )
                    ensure_dir(checkpoint_path)
                    with open(checkpoint_path, "+w") as checkpoint_spec_file:
                        checkpoint_spec_file.write(
                            player_metanash_spec.to_json())

                # Start the next double oracle iteration here.
                # A double oracle iteration is considered to be training BRs
                # followed by solving the new restricted game.
                self._current_double_oracle_iteration += 1
                self._br_episodes_this_iter = 0
                self._br_timesteps_this_iter = 0
                self._restricted_game_episodes_this_iter = 0
                self._restricted_game_timesteps_this_iter = 0
                self._player_brs_are_finished_this_iter = {
                    p: False
                    for p in range(self._n_players)
                }
                self._br_spec_lists_for_each_player = deepcopy(
                    self._next_iter_br_spec_lists_for_each_player)
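
submit_final_br_policy above appends one JSON object per double oracle iteration to a newline-delimited log. Here is a minimal sketch (not from the nxdo source) of reading that log back, e.g. to graph timesteps or exploitability in a notebook; the path is a placeholder.

import json

def load_manager_stats(json_log_path: str):
    rows = []
    with open(json_log_path, "r") as json_file:
        for line in json_file:
            line = line.strip()
            if line:
                rows.append(json.loads(line))
    return rows

# stats = load_manager_stats("/path/to/manager_log.json")
# timesteps = [row["timesteps_total"] for row in stats]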
Code Example #16
def train_off_policy_rl_nfsp_restricted_game(results_dir: str,
                                             scenario: NXDOScenario,
                                             player_to_base_game_action_specs: Dict[int, List[StrategySpec]],
                                             stopping_condition: StoppingCondition,
                                             manager_metadata: Union[dict, None],
                                             print_train_results: bool = True):

    use_openspiel_restricted_game: bool = scenario.use_openspiel_restricted_game
    get_restricted_game_custom_model = scenario.get_restricted_game_custom_model
    env_class = scenario.env_class
    base_env_config = scenario.env_config
    trainer_class = scenario.trainer_class_nfsp
    avg_trainer_class = scenario.avg_trainer_class_nfsp
    policy_classes: Dict[str, Type[Policy]] = scenario.policy_classes_nfsp
    anticipatory_param: float = scenario.anticipatory_param_nfsp
    get_trainer_config = scenario.get_trainer_config_nfsp
    get_avg_trainer_config = scenario.get_avg_trainer_config_nfsp
    get_trainer_config_br = scenario.get_trainer_config_br
    calculate_openspiel_metanash: bool = scenario.calculate_openspiel_metanash
    calculate_openspiel_metanash_at_end: bool = scenario.calculate_openspiel_metanash_at_end
    calc_metanash_every_n_iters: int = scenario.calc_metanash_every_n_iters
    should_log_result_fn = scenario.ray_should_log_result_filter
    metrics_smoothing_episodes_override: int = scenario.metanash_metrics_smoothing_episodes_override

    assert scenario.xdo_metanash_method == "nfsp"

    ray_head_address = manager_metadata.get("ray_head_address", None) if manager_metadata is not None else None
    init_ray_for_scenario(scenario=scenario, head_address=ray_head_address, logging_level=logging.INFO)

    def select_policy(agent_id):
        random_sample = np.random.random()
        if agent_id == 0:
            if random_sample < anticipatory_param:
                return "best_response_0"
            return "average_policy_0"
        elif agent_id == 1:
            if random_sample < anticipatory_param:
                return "best_response_1"
            return "average_policy_1"
        else:
            raise ValueError(f"unexpected agent_id: {agent_id}")

    def assert_not_called(agent_id):
        assert False, "This function should never be called."

    def _create_base_env():
        return env_class(env_config=base_env_config)

    tmp_base_env = _create_base_env()
    restricted_env_config = {"create_env_fn": _create_base_env}

    if use_openspiel_restricted_game:
        restricted_game_class = OpenSpielRestrictedGame
        tmp_env = restricted_game_class(env_config=restricted_env_config)
        restricted_game_action_spaces = [tmp_env.base_action_space for _ in range(2)]
    else:
        restricted_game_class = RestrictedGame
        restricted_env_config["use_delegate_policy_exploration"] = scenario.allow_stochastic_best_responses
        tmp_env = restricted_game_class(env_config=restricted_env_config)
        restricted_game_action_spaces = [Discrete(n=len(player_to_base_game_action_specs[p])) for p in range(2)]

    assert all(restricted_game_action_spaces[0] == space for space in restricted_game_action_spaces)

    print(f"\n\n\n\n\nRestricted game action spaces {restricted_game_action_spaces}\n\n\n\n\n\n")

    scenario_avg_trainer_config = get_avg_trainer_config(tmp_base_env)
    scenario_avg_trainer_config_exploration_config = scenario_avg_trainer_config.get("exploration_config", {})
    if scenario_avg_trainer_config_exploration_config:
        del scenario_avg_trainer_config["exploration_config"]

    scenario_trainer_config = get_trainer_config(tmp_base_env)
    scenario_trainer_config_exploration_config = scenario_trainer_config.get("exploration_config", {})
    if scenario_trainer_config_exploration_config:
        del scenario_trainer_config["exploration_config"]

    delegate_policy_config = merge_dicts(get_trainer_config_br(tmp_base_env), {"explore": scenario.allow_stochastic_best_responses})

    avg_trainer_config = merge_dicts({
        "log_level": "DEBUG",
        "framework": "torch",
        "env": restricted_game_class,
        "env_config": restricted_env_config,
        "num_gpus": 0.0,
        "num_gpus_per_worker": 0.0,
        "num_workers": 0,
        "num_envs_per_worker": 1,
        "multiagent": {
            "policies_to_train": ["average_policy_0", "average_policy_1"],
            "policies": {
                "average_policy_0": (
                policy_classes["average_policy"], tmp_env.observation_space, restricted_game_action_spaces[0],
                {"explore": False, "exploration_config": scenario_avg_trainer_config_exploration_config}),

                "average_policy_1": (
                policy_classes["average_policy"], tmp_env.observation_space, restricted_game_action_spaces[1],
                {"explore": False, "exploration_config": scenario_avg_trainer_config_exploration_config}),

                "delegate_policy": (
                policy_classes["delegate_policy"], tmp_base_env.observation_space, tmp_env.base_action_space,
                delegate_policy_config),
            },
            "policy_mapping_fn": assert_not_called,
        },

    }, scenario_avg_trainer_config)
    for _policy_id in ["average_policy_0", "average_policy_1"]:
        if get_restricted_game_custom_model is not None:
            avg_trainer_config["multiagent"]["policies"][_policy_id][3]["model"] = {
                "custom_model": get_restricted_game_custom_model(tmp_env)}

    avg_trainer = avg_trainer_class(config=avg_trainer_config,
                                    logger_creator=get_trainer_logger_creator(
                                        base_dir=results_dir,
                                        scenario_name=f"nfsp_restricted_game_avg_trainer",
                                        should_log_result_fn=should_log_result_fn))

    store_to_avg_policy_buffer = get_store_to_avg_policy_buffer_fn(nfsp_trainer=avg_trainer)

    class NFSPBestResponseCallbacks(DefaultCallbacks):

        def on_postprocess_trajectory(self, *, worker: "RolloutWorker", episode: MultiAgentEpisode, agent_id: AgentID,
                                      policy_id: PolicyID, policies: Dict[PolicyID, Policy],
                                      postprocessed_batch: SampleBatch,
                                      original_batches: Dict[Any, Tuple[Policy, SampleBatch]],
                                      **kwargs):
            super().on_postprocess_trajectory(worker=worker, episode=episode, agent_id=agent_id, policy_id=policy_id,
                                              policies=policies, postprocessed_batch=postprocessed_batch,
                                              original_batches=original_batches, **kwargs)

            postprocessed_batch.data["source_policy"] = [policy_id] * len(postprocessed_batch.data["rewards"])

            # All data from both policies will go into the best response's replay buffer.
            # Here we ensure policies not from the best response have the exact same preprocessing as the best response.
            for average_policy_id, br_policy_id in [("average_policy_0", "best_response_0"),
                                                    ("average_policy_1", "best_response_1")]:
                if policy_id == average_policy_id:

                    if "action_probs" in postprocessed_batch:
                        del postprocessed_batch.data["action_probs"]
                    if "behaviour_logits" in postprocessed_batch:
                        del postprocessed_batch.data["behaviour_logits"]

                    br_policy: Policy = policies[br_policy_id]

                    new_batch = br_policy.postprocess_trajectory(
                        sample_batch=postprocessed_batch,
                        other_agent_batches=original_batches,
                        episode=episode)
                    copy_attributes(src_obj=new_batch, dst_obj=postprocessed_batch)
                elif policy_id == br_policy_id:
                    if "q_values" in postprocessed_batch:
                        del postprocessed_batch.data["q_values"]
                    if "action_probs" in postprocessed_batch:
                        del postprocessed_batch.data["action_probs"]
                    del postprocessed_batch.data["action_dist_inputs"]

                if policy_id in ("average_policy_0", "best_response_0"):
                    assert agent_id == 0
                if policy_id in ("average_policy_1", "best_response_1"):
                    assert agent_id == 1

        def on_sample_end(self, *, worker: "RolloutWorker", samples: SampleBatch, **kwargs):
            super().on_sample_end(worker=worker, samples=samples, **kwargs)
            assert isinstance(samples, MultiAgentBatch)

            for policy_samples in samples.policy_batches.values():
                if "action_prob" in policy_samples.data:
                    del policy_samples.data["action_prob"]
                if "action_logp" in policy_samples.data:
                    del policy_samples.data["action_logp"]

            for average_policy_id, br_policy_id in [("average_policy_0", "best_response_0"),
                                                    ("average_policy_1", "best_response_1")]:
                for policy_id, policy_samples in samples.policy_batches.items():
                    if policy_id == br_policy_id:
                        store_to_avg_policy_buffer(MultiAgentBatch(policy_batches={
                            average_policy_id: policy_samples
                        }, env_steps=policy_samples.count))
                if average_policy_id in samples.policy_batches:

                    if br_policy_id in samples.policy_batches:
                        all_policies_samples = samples.policy_batches[br_policy_id].concat(
                            other=samples.policy_batches[average_policy_id])
                    else:
                        all_policies_samples = samples.policy_batches[average_policy_id]
                    del samples.policy_batches[average_policy_id]
                    samples.policy_batches[br_policy_id] = all_policies_samples

        def on_episode_end(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
                           episode: MultiAgentEpisode, env_index: int, **kwargs):
            super().on_episode_end(worker=worker, base_env=base_env, policies=policies, episode=episode,
                                   env_index=env_index, **kwargs)
            episode_policies = set(episode.agent_rewards.keys())
            if episode_policies == {(0, "average_policy_0"), (1, "best_response_1")}:
                worker.avg_br_reward_deque.add.remote(episode.agent_rewards[(1, "best_response_1")])
            elif episode_policies == {(1, "average_policy_1"), (0, "best_response_0")}:
                worker.avg_br_reward_deque.add.remote(episode.agent_rewards[(0, "best_response_0")])

        def on_train_result(self, *, trainer, result: dict, **kwargs):
            super().on_train_result(trainer=trainer, result=result, **kwargs)
            training_iteration = result["training_iteration"]

            result["avg_br_reward_both_players"] = ray.get(trainer.avg_br_reward_deque.get_mean.remote())

            if (calculate_openspiel_metanash and
                    (training_iteration == 1 or training_iteration % calc_metanash_every_n_iters == 0)):
                base_env = _create_base_env()
                open_spiel_env_config = base_env.open_spiel_env_config
                openspiel_game_version = base_env.game_version
                local_avg_policy_0 = trainer.workers.local_worker().policy_map["average_policy_0"]
                local_avg_policy_1 = trainer.workers.local_worker().policy_map["average_policy_1"]
                exploitability = nxdo_nfsp_measure_exploitability_nonlstm(
                    rllib_policies=[local_avg_policy_0, local_avg_policy_1],
                    poker_game_version=openspiel_game_version,
                    restricted_game_convertors=trainer.get_local_converters(),
                    open_spiel_env_config=open_spiel_env_config,
                    use_delegate_policy_exploration=scenario.allow_stochastic_best_responses
                )
                result["avg_policy_exploitability"] = exploitability

    br_trainer_config = {
        "log_level": "DEBUG",
        "callbacks": NFSPBestResponseCallbacks,
        "env": restricted_game_class,
        "env_config": restricted_env_config,
        "gamma": 1.0,
        "num_gpus": 0.0,
        "num_workers": 0,
        "num_gpus_per_worker": 0.0,
        "num_envs_per_worker": 1,
        "multiagent": {
            "policies_to_train": ["best_response_0", "best_response_1"],
            "policies": {
                "average_policy_0": (
                policy_classes["average_policy"], tmp_env.observation_space, restricted_game_action_spaces[0],
                {"explore": False, "exploration_config": scenario_avg_trainer_config_exploration_config}),

                "best_response_0": (
                policy_classes["best_response"], tmp_env.observation_space, restricted_game_action_spaces[0],
                {"exploration_config": scenario_trainer_config_exploration_config}),

                "average_policy_1": (
                policy_classes["average_policy"], tmp_env.observation_space, restricted_game_action_spaces[1],
                {"explore": False, "exploration_config": scenario_avg_trainer_config_exploration_config}),

                "best_response_1": (
                policy_classes["best_response"], tmp_env.observation_space, restricted_game_action_spaces[1],
                {"exploration_config": scenario_trainer_config_exploration_config}),

                "delegate_policy": (
                policy_classes["delegate_policy"], tmp_base_env.observation_space, tmp_env.base_action_space,
                delegate_policy_config),
            },
            "policy_mapping_fn": select_policy,
        },
    }
    assert all(restricted_game_action_spaces[0] == space for space in restricted_game_action_spaces), \
        "If not true, the line below with \"get_trainer_config\" may need to be changed to a better solution."
    br_trainer_config = merge_dicts(br_trainer_config, scenario_trainer_config)
    for _policy_id in ["average_policy_0", "average_policy_1", "best_response_0", "best_response_1"]:
        if get_restricted_game_custom_model is not None:
            br_trainer_config["multiagent"]["policies"][_policy_id][3]["model"] = {
                "custom_model": get_restricted_game_custom_model(tmp_env)}

    br_trainer_config["metrics_smoothing_episodes"] = metrics_smoothing_episodes_override

    br_trainer = trainer_class(config=br_trainer_config,
                               logger_creator=get_trainer_logger_creator(
                                   base_dir=results_dir,
                                   scenario_name="nfsp_restricted_game_trainer",
                                   should_log_result_fn=should_log_result_fn))

    avg_br_reward_deque = StatDeque.remote(max_items=br_trainer_config["metrics_smoothing_episodes"])

    def _set_avg_br_rew_deque(worker: RolloutWorker):
        worker.avg_br_reward_deque = avg_br_reward_deque

    br_trainer.workers.foreach_worker(_set_avg_br_rew_deque)
    br_trainer.avg_br_reward_deque = avg_br_reward_deque

    if use_openspiel_restricted_game:
        local_delegate_policy = br_trainer.workers.local_worker().policy_map["delegate_policy"]
        player_converters = []
        for p in range(2):
            print("Creating restricted game obs conversions...")
            convertor = get_restricted_game_obs_conversions(player=p, delegate_policy=local_delegate_policy,
                                                            policy_specs=player_to_base_game_action_specs[p],
                                                            load_policy_spec_fn=create_get_pure_strat_cached(cache={}),
                                                            tmp_base_env=tmp_base_env)
            player_converters.append(convertor)
        for _trainer in [br_trainer, avg_trainer]:
            def _set_worker_converters(worker: RolloutWorker):
                worker_delegate_policy = worker.policy_map["delegate_policy"]
                for p in range(2):
                    worker.foreach_env(lambda env: env.set_obs_conversion_dict(p, player_converters[p]))
                worker_delegate_policy.player_converters = player_converters

            _trainer.workers.foreach_worker(_set_worker_converters)
            _trainer.get_local_converters = lambda: _trainer.workers.local_worker().policy_map[
                "delegate_policy"].player_converters
    else:
        weights_cache = {}
        for _trainer in [br_trainer, avg_trainer]:
            def _set_worker_converters(worker: RolloutWorker):
                worker_delegate_policy = worker.policy_map["delegate_policy"]
                player_converters = []
                for p in range(2):
                    player_converter = RestrictedToBaseGameActionSpaceConverter(
                        delegate_policy=worker_delegate_policy, policy_specs=player_to_base_game_action_specs[p],
                        load_policy_spec_fn=create_get_pure_strat_cached(cache=weights_cache))
                    player_converters.append(player_converter)
                    worker.foreach_env(lambda env: env.set_action_conversion(p, player_converter))
                worker_delegate_policy.player_converters = player_converters

            _trainer.workers.foreach_worker(_set_worker_converters)
            _trainer.get_local_converters = lambda: _trainer.workers.local_worker().policy_map[
                "delegate_policy"].player_converters

    br_trainer.latest_avg_trainer_result = None
    train_iter_count = 0

    for _trainer in [br_trainer, avg_trainer]:
        for policy_id, policy in _trainer.workers.local_worker().policy_map.items():
            policy.policy_id = policy_id

    if len(player_to_base_game_action_specs[0]) == 1:
        final_train_result = {"episodes_total": 0, "timesteps_total": 0, "training_iteration": 0}
        tmp_callback = NFSPBestResponseCallbacks()
        tmp_callback.on_train_result(trainer=br_trainer, result=final_train_result)
    else:
        avg_weights = avg_trainer.get_weights(["average_policy_0", "average_policy_1"])
        br_trainer.workers.foreach_worker(lambda worker: worker.set_weights(avg_weights))
        while True:
            avg_train_results = avg_trainer.train()
            avg_weights = avg_trainer.get_weights(["average_policy_0", "average_policy_1"])
            br_trainer.workers.foreach_worker(lambda worker: worker.set_weights(avg_weights))
            br_trainer.latest_avg_trainer_result = copy.deepcopy(avg_train_results)
            train_iter_results = br_trainer.train()  # do a step (or several) in the main RL loop

            train_iter_count += 1
            if print_train_results:
                # Delete verbose debugging info before printing
                if "hist_stats" in train_iter_results:
                    del train_iter_results["hist_stats"]
                if "td_error" in train_iter_results["info"]["learner"]["best_response_0"]:
                    del train_iter_results["info"]["learner"]["best_response_0"]["td_error"]
                if "td_error" in train_iter_results["info"]["learner"]["best_response_1"]:
                    del train_iter_results["info"]["learner"]["best_response_1"]["td_error"]
                print(pretty_dict_str(train_iter_results))
                print(f"Trainer logdir is {br_trainer.logdir}")

            if stopping_condition.should_stop_this_iter(latest_trainer_result=train_iter_results):
                print("stopping condition met.")
                final_train_result = deepcopy(train_iter_results)
                break

    if calculate_openspiel_metanash_at_end:
        base_env = _create_base_env()
        open_spiel_env_config = base_env.open_spiel_env_config
        openspiel_game_version = base_env.game_version
        local_avg_policy_0 = br_trainer.workers.local_worker().policy_map["average_policy_0"]
        local_avg_policy_1 = br_trainer.workers.local_worker().policy_map["average_policy_1"]
        exploitability = nxdo_nfsp_measure_exploitability_nonlstm(
            rllib_policies=[local_avg_policy_0, local_avg_policy_1],
            poker_game_version=openspiel_game_version,
            restricted_game_convertors=br_trainer.get_local_converters(),
            open_spiel_env_config=open_spiel_env_config,
            use_delegate_policy_exploration=scenario.allow_stochastic_best_responses
        )
        final_train_result["avg_policy_exploitability"] = exploitability

    if "avg_policy_exploitability" in final_train_result:
        print(f"\n\nexploitability: {final_train_result['avg_policy_exploitability']}\n\n")

    avg_policy_specs = []
    for player in range(2):
        strategy_id = f"avg_policy_player_{player}_{datetime_str()}"

        checkpoint_path = save_nfsp_avg_policy_checkpoint(trainer=br_trainer,
                                                          policy_id_to_save=f"average_policy_{player}",
                                                          save_dir=checkpoint_dir(trainer=br_trainer),
                                                          timesteps_training=final_train_result["timesteps_total"],
                                                          episodes_training=final_train_result["episodes_total"],
                                                          checkpoint_name=f"{strategy_id}.h5")

        avg_policy_spec = StrategySpec(
            strategy_id=strategy_id,
            metadata={"checkpoint_path": checkpoint_path,
                      "delegate_policy_specs": [spec.to_json() for spec in player_to_base_game_action_specs[player]]
                      })
        avg_policy_specs.append(avg_policy_spec)

    ray.kill(avg_trainer.workers.local_worker().replay_buffer_actor)
    avg_trainer.cleanup()
    br_trainer.cleanup()
    del avg_trainer
    del br_trainer
    del avg_br_reward_deque

    time.sleep(10)

    assert final_train_result is not None
    return avg_policy_specs, final_train_result
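
The select_policy function near the top of the example above implements NFSP's anticipatory mixing: each episode an agent acts with its best response with probability anticipatory_param and with its average policy otherwise. Below is a standalone sketch of that mixing with a quick frequency check; the redraw-per-call structure and the 0.1 value are illustrative, not taken from the scenario configs.

import numpy as np

def select_policy_for(agent_id: int, anticipatory_param: float) -> str:
    if np.random.random() < anticipatory_param:
        return f"best_response_{agent_id}"
    return f"average_policy_{agent_id}"

anticipatory_param = 0.1
choices = [select_policy_for(0, anticipatory_param) for _ in range(100_000)]
br_fraction = choices.count("best_response_0") / len(choices)
print(f"best response chosen ~{br_fraction:.3f} of the time (expected ~{anticipatory_param})")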
Code Example #17
        dashboard_port=find_free_port(),
        ignore_reinit_error=True,
        logging_level=logging.INFO,
        log_to_driver=os.getenv("RAY_LOG_TO_DRIVER", False))


    def select_policy(agent_id):
        if agent_id == br_player:
            return "best_response"
        else:
            return f"average_policy"


    avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(tmp_env)["model"]

    player_0_avg_pol_spec = StrategySpec.from_json_file(
        "/home/jblanier/git/grl/grl/data/oshi_zumo_tiny_nfsp_dqn_sparse_01.54.50PM_Apr-08-20218z_hf4wq/avg_policy_checkpoint_specs/average_policy_player_0_iter_214000.json")


    class HyperParamSearchCallbacks(DefaultCallbacks):

        def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
                             episode: MultiAgentEpisode, env_index: int, **kwargs):
            super().on_episode_start(worker=worker, base_env=base_env, policies=policies, episode=episode,
                                     env_index=env_index, **kwargs)
            if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
                avg_policy = worker.policy_map["average_policy"]
                load_pure_strat(policy=avg_policy, pure_strat_spec=player_0_avg_pol_spec)
                worker.avg_pol_loaded = True

        def on_train_result(self, *, trainer, result: dict, **kwargs):
            super().on_train_result(trainer=trainer, result=result, **kwargs)
Code Example #18
                            ignore_reinit_error=True,
                            logging_level=logging.INFO,
                            log_to_driver=os.getenv("RAY_LOG_TO_DRIVER",
                                                    False))

    def select_policy(agent_id):
        if agent_id == br_player:
            return "best_response"
        else:
            return f"average_policy"

    avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(
        tmp_env)["model"]

    player_0_avg_pol_spec = StrategySpec.from_json_file(
        "/home/jblanier/git/grl/grl/data/oshi_zumo_medium_nfsp_dqn_sparse_01.55.05PM_Apr-08-2021ta6arraq/avg_policy_checkpoint_specs/average_policy_player_0_iter_221000.json"
    )

    class HyperParamSearchCallbacks(DefaultCallbacks):
        def on_episode_start(self, *, worker: "RolloutWorker",
                             base_env: BaseEnv, policies: Dict[PolicyID,
                                                               Policy],
                             episode: MultiAgentEpisode, env_index: int,
                             **kwargs):
            super().on_episode_start(worker=worker,
                                     base_env=base_env,
                                     policies=policies,
                                     episode=episode,
                                     env_index=env_index,
                                     **kwargs)
            if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
                avg_policy = worker.policy_map["average_policy"]
                load_pure_strat(policy=avg_policy,
                                pure_strat_spec=player_0_avg_pol_spec)
                worker.avg_pol_loaded = True
Code Example #19
                            ignore_reinit_error=True,
                            logging_level=logging.INFO,
                            log_to_driver=os.getenv("RAY_LOG_TO_DRIVER",
                                                    False))

    def select_policy(agent_id):
        if agent_id == br_player:
            return "best_response"
        else:
            return f"average_policy"

    avg_policy_model_config = avg_pol_scenario.get_avg_trainer_config(
        tmp_env)["model"]

    player_0_avg_pol_spec = StrategySpec.from_json_file(
        "/home/jblanier/git/grl/grl/data/leduc_nfsp_dqn_sparse_02.34.06PM_Apr-08-2021bt5ym0l8/avg_policy_checkpoint_specs/average_policy_player_0_iter_263000.json"
    )

    class HyperParamSearchCallbacks(DefaultCallbacks):
        def on_episode_start(self, *, worker: "RolloutWorker",
                             base_env: BaseEnv, policies: Dict[PolicyID,
                                                               Policy],
                             episode: MultiAgentEpisode, env_index: int,
                             **kwargs):
            super().on_episode_start(worker=worker,
                                     base_env=base_env,
                                     policies=policies,
                                     episode=episode,
                                     env_index=env_index,
                                     **kwargs)
            if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
                avg_policy = worker.policy_map["average_policy"]
                load_pure_strat(policy=avg_policy,
                                pure_strat_spec=player_0_avg_pol_spec)
                worker.avg_pol_loaded = True