def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
                     episode: MultiAgentEpisode, env_index: int, **kwargs):
    super().on_episode_start(worker=worker, base_env=base_env, policies=policies, episode=episode,
                             env_index=env_index, **kwargs)
    # Load player 0's average-policy pure strategy onto this worker exactly once.
    if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
        avg_policy = worker.policy_map["average_policy"]
        load_pure_strat(policy=avg_policy, pure_strat_spec=player_0_avg_pol_spec)
        worker.avg_pol_loaded = True
Example #2
def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
                     episode: MultiAgentEpisode, env_index: int, **kwargs):
    super().on_episode_start(worker=worker, base_env=base_env, policies=policies, episode=episode,
                             env_index=env_index, **kwargs)
    # Sample a fresh pure-strategy spec from the opponent distribution and load it into the metanash policy.
    metanash_policy = worker.policy_map["metanash"]
    load_pure_strat(policy=metanash_policy,
                    pure_strat_spec=opponent_policy_distribution.sample_policy_spec())
Example #3
def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
                     episode: MultiAgentEpisode, env_index: int, **kwargs):
    super().on_episode_start(worker=worker, base_env=base_env, policies=policies, episode=episode,
                             env_index=env_index, **kwargs)
    # Sample an opponent spec and load its weights from checkpoint, rewriting the stored path prefix.
    metanash_policy = worker.policy_map["metanash"]
    sampled_spec = opponent_policy_distribution.sample_policy_spec()
    load_pure_strat(policy=metanash_policy,
                    checkpoint_path=sampled_spec.metadata["checkpoint_path"].replace("deploy", "jblanier"))
Example #4
def on_episode_start(self, *, worker: RolloutWorker, base_env: BaseEnv,
                     policies: Dict[str, Policy],
                     episode: MultiAgentEpisode, env_index: int, **kwargs):

    # Sample new pure strategy policy weights from the opponent strategy distribution for the best response to
    # train against. For better runtime performance, this function can be modified to load new weights
    # only every few episodes instead.
    resample_pure_strat_every_n_episodes = 1
    metanash_policy: Policy = policies["metanash"]
    opponent_policy_distribution: PolicySpecDistribution = worker.opponent_policy_distribution
    time_for_resample = (not hasattr(metanash_policy, "episodes_since_resample") or
                         metanash_policy.episodes_since_resample >= resample_pure_strat_every_n_episodes)
    if time_for_resample and opponent_policy_distribution is not None:
        new_pure_strat_spec: StrategySpec = opponent_policy_distribution.sample_policy_spec()
        # noinspection PyTypeChecker
        load_pure_strat(policy=metanash_policy, pure_strat_spec=new_pure_strat_spec)
        metanash_policy.episodes_since_resample = 1
    elif opponent_policy_distribution is not None:
        metanash_policy.episodes_since_resample += 1
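The on_episode_start bodies above belong to a callbacks class that RLlib instantiates on every rollout worker. Below is a minimal wiring sketch, assuming the ray 1.x RLlib API these examples target; the class name and config dict are illustrative and not part of the original code:

from ray.rllib.agents.callbacks import DefaultCallbacks

class ResampleOpponentCallbacks(DefaultCallbacks):
    def on_episode_start(self, *, worker, base_env, policies, episode, env_index, **kwargs):
        super().on_episode_start(worker=worker, base_env=base_env, policies=policies,
                                 episode=episode, env_index=env_index, **kwargs)
        # ... resampling logic from Example #4 would go here ...

trainer_config = {
    # ... other trainer settings ...
    "callbacks": ResampleOpponentCallbacks,  # RLlib constructs one instance per rollout worker
}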
Example #5
def run_poker_evaluation_loop(scenario_name: str, eval_dispatcher_port: int, eval_dispatcher_host: str):
    scenario: PSROScenario = scenario_catalog.get(scenario_name=scenario_name)
    if not isinstance(scenario, PSROScenario):
        raise TypeError(f"Only instances of {PSROScenario} can be used here. {scenario.name} is a {type(scenario)}.")

    eval_dispatcher = RemoteEvalDispatcherClient(port=eval_dispatcher_port, remote_server_host=eval_dispatcher_host)

    env = scenario.env_class(env_config=scenario.env_config)
    num_players = 2

    trainer_config = scenario.get_trainer_config(env)
    trainer_config["explore"] = scenario.allow_stochastic_best_responses

    policies = [scenario.policy_classes["eval"](env.observation_space,
                                                env.action_space,
                                                with_common_config(trainer_config))
                for _ in range(num_players)]

    while True:
        policy_specs_for_each_player, required_games_to_play = eval_dispatcher.take_eval_job()

        if policy_specs_for_each_player is None:
            time.sleep(2)
        else:
            if len(policy_specs_for_each_player) != 2:
                raise NotImplementedError(f"This evaluation code only supports two player games. "
                                          f"{len(policy_specs_for_each_player)} players were requested.")

            # print(f"Got eval matchup:")
            # for spec in policy_specs_for_each_player:
            #     print(f"spec: {spec.to_json()}")

            for policy, spec in zip(policies, policy_specs_for_each_player):
                load_pure_strat(policy=policy, pure_strat_spec=spec)

            total_payoffs_per_player = np.zeros(shape=num_players, dtype=np.float64)

            # max_reward = None
            # min_reward = None
            # time_since_last_output = time.time()
            for game in range(required_games_to_play):
                # if game % 1000 == 0:
                #     now = time.time()
                #     print(f"{policy_specs_for_each_player[0].id} vs "
                #           f"{policy_specs_for_each_player[1].id}: "
                #           f"{game}/{required_games_to_play} games played, {now - time_since_last_output} seconds")
                #     time_since_last_output = now

                payoffs_per_player_this_episode = run_episode(env=env, policies_for_each_player=policies)
                total_payoffs_per_player += payoffs_per_player_this_episode

                # if max_reward is None or max(payoffs_per_player_this_episode) > max_reward:
                #     max_reward = max(payoffs_per_player_this_episode)
                # if min_reward is None or min(payoffs_per_player_this_episode) < min_reward:
                #     min_reward = min(payoffs_per_player_this_episode)

            payoffs_per_player = total_payoffs_per_player / required_games_to_play

            print(f"payoffs per player:"
                  f"{policy_specs_for_each_player[0].id} vs "
                  f"{policy_specs_for_each_player[1].id}: "
                  f"{payoffs_per_player}")

            eval_dispatcher.submit_eval_job_result(
                policy_specs_for_each_player_tuple=policy_specs_for_each_player,
                payoffs_for_each_player=payoffs_per_player,
                games_played=required_games_to_play
            )
Example #6
def _set_br_initial_weights(worker: RolloutWorker):
    # Initialize the best-response policy on this worker from a previous BR checkpoint.
    br_policy = worker.policy_map["best_response"]
    load_pure_strat(policy=br_policy, checkpoint_path=previous_br_checkpoint_path)
Example #7
def _set_worker_metanash(worker: RolloutWorker):
    # Point this worker's metanash policy at the other player's current metanash strategy spec.
    if metanash_specs_for_players is not None:
        metanash_policy = worker.policy_map["metanash"]
        metanash_strategy_spec: StrategySpec = metanash_specs_for_players[other_player]
        load_pure_strat(policy=metanash_policy, pure_strat_spec=metanash_strategy_spec)
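Helper functions such as _set_br_initial_weights and _set_worker_metanash are written to be mapped over a trainer's rollout workers. A hedged sketch of that pattern using RLlib's WorkerSet.foreach_worker; the trainer variable here is an assumption and does not appear in the examples above:

# Sketch only: `trainer` stands for whatever RLlib Trainer instance owns these workers.
trainer.workers.foreach_worker(_set_worker_metanash)     # applied on the local and all remote workers
trainer.workers.foreach_worker(_set_br_initial_weights)  # same pattern for BR weight initialization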