Example #1
def set_policy_weights(weights_key):
    weights_file_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=weights_key,
        force_download=False)
    policy.load_model_weights(weights_file_path)
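A minimal usage sketch based on Example #11 below, where this same helper appears: the function is handed to the exploitability measurement as a callback so weights can be swapped in for each policy key of a mixture (policy, POKER_GAME_VERSION, and policy_dict are assumed to be in scope as in that example).

# Sketch assuming the surrounding names from Example #11 are defined.
exploitability = measure_exploitability_nonlstm(
    rllib_policy=policy,
    poker_game_version=POKER_GAME_VERSION,
    policy_mixture_dict=policy_dict,
    set_policy_weights_fn=set_policy_weights)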
Example #2
        def sample_new_static_policy_weights_for_each_worker_on_episode_start(
                params):
            policies = params['policy']

            static_policy = policies[STATIC_POLICY]

            if static_policy.static_policy_selection_probs is None:
                return

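            # Sample an index into the payoff table according to the static policy selection probabilities.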
            selected_policy_index = np.random.choice(
                a=list(range(len(
                    static_policy.static_policy_selection_probs))),
                p=static_policy.static_policy_selection_probs)
            selected_policy_spec: PolicySpec = static_policy.payoff_table.get_policy_for_index(
                selected_policy_index)
            assert selected_policy_spec.class_name in ACCEPTED_OPPONENT_POLICY_CLASS_NAMES
            assert selected_policy_spec.config_key in ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS

            if static_policy.current_policy_key != selected_policy_spec.key:
                # print(f"sampled policy {selected_policy_spec.key} (loading weights)")
                storage_client = connect_storage_client()
                weights_local_path, _ = maybe_download_object(
                    storage_client=storage_client,
                    bucket_name=BUCKET_NAME,
                    object_name=selected_policy_spec.key,
                    force_download=False)
                static_policy.load_model_weights(
                    load_file_path=weights_local_path,
                    add_scope_prefix=STATIC_POLICY)
                static_policy.current_policy_key = selected_policy_spec.key
Example #3
    def get_latest_payoff_table(self, infinite_retry_on_error: bool = True):
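        # Ask the manager over gRPC for the latest payoff table key, optionally retrying forever on RPC errors.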
        while True:
            try:
                request = Empty()
                response: PayoffTableKey = self._stub.GetLatestPayoffTableKey(
                    request)
                break
            except grpc.RpcError as err:
                if infinite_retry_on_error:
                    logger.warning(
                        f"grpc.RPCError raised while getting latest payoff table:\n{err}\n"
                        f"(retrying in {_INFINITE_RETRY_INTERVAL_SECONDS} seconds)"
                    )
                    time.sleep(_INFINITE_RETRY_INTERVAL_SECONDS)
                else:
                    raise
        if response.payoff_table_is_empty:
            logger.debug("Latest payoff table is empty (None)")
            return None, None

        payoff_table_local_path, _ = maybe_download_object(
            storage_client=self._storage_client,
            bucket_name=self._minio_bucket_name,
            object_name=response.key,
            local_directory=self._minio_local_dir,
            force_download=False)

        latest_payoff_table = PayoffTable.from_dill_file(
            dill_file_path=payoff_table_local_path)

        return latest_payoff_table, response.key
Example #4
def set_policy_weights(weights_key):
    print(f"weights are {weights_key}")
    storage_client = connect_storage_client()
    weights_file_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=weights_key,
        force_download=False)
    print("got weights")
    local_exploit_rllib_policy.load_model_weights(
        weights_file_path, add_scope_prefix=STATIC_POLICY)
Example #5
def sample_new_policy_weights_from_population():
    new_policy_key = np.random.choice(
        a=list(population_policy_keys_to_selection_probs.keys()),
        p=list(population_policy_keys_to_selection_probs.values()))
    if new_policy_key != policy.current_model_weights_key:
        with download_lock:
            weights_file_path, _ = maybe_download_object(
                storage_client=storage_client,
                bucket_name=minio_bucket_name,
                object_name=new_policy_key,
                force_download=False)
            policy.load_model_weights(weights_file_path)
            logger.debug(f"Sampling new population weights from {new_policy_key}")
        policy.current_model_weights_key = new_policy_key
Example #6
    def __init__(self,
                 stop_event,
                 payoff_table_save_key_prefix_dir,
                 storage_client,
                 bucket_name,
                 max_ping_interval_seconds_to_track_workers,
                 num_games_to_play_for_matchup_evals,
                 restore_from_payoff_table_key=None):

        self._stop_event = stop_event
        self.payoff_table_save_key_prefix_dir = payoff_table_save_key_prefix_dir
        self._storage_client = storage_client
        self._bucket_name = bucket_name
        self._max_ping_interval_seconds_to_track_workers = max_ping_interval_seconds_to_track_workers
        self._num_games_to_play_for_matchup_evals = num_games_to_play_for_matchup_evals

        self._payoff_table_modification_lock = Lock()
        self._recent_worker_pings = PriorityQueue()
        self._worker_ping_modification_lock = Lock()
        self._start_time = time.time()

        self._eval_matchup_cache_lock = RLock()
        self._eval_matchup_cache = {}
        self._externally_requested_eval_queue = Queue()

        self._recent_eval_match_requests_lock = RLock()
        self._recent_eval_match_requests = {}

        self._latest_checkpoint_key = os.path.join(
            self.payoff_table_save_key_prefix_dir, "latest.dill")
        logger.info(
            colored(
                f"Latest Manager Payoff Table Checkpoint will always be at {self._latest_checkpoint_key} "
                f"(local file path: {get_default_path_on_disk_for_minio_key(self._latest_checkpoint_key)})",
                "yellow"))

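        # Restore an existing payoff table checkpoint from object storage if a key was given; otherwise start with an empty table.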
        if restore_from_payoff_table_key is not None:
            payoff_table_local_path, _ = maybe_download_object(
                storage_client=self._storage_client,
                bucket_name=self._bucket_name,
                object_name=restore_from_payoff_table_key,
                force_download=False)
            logger.info(
                f"restoring payoff table from {payoff_table_local_path}")
            self._payoff_table = PayoffTable.from_dill_file(
                dill_file_path=payoff_table_local_path)
            self._latest_payoff_table_key = restore_from_payoff_table_key
            self._log_policies_in_payoff_matrix()
        else:
            logger.info(f"creating new empty payoff table with no policies")
            self._payoff_table = PayoffTable()
            self._latest_payoff_table_key = None
Example #7
    def get_weights_by_key(self, policy_key):
        weights = self.get_from_cache(policy_key=policy_key)

        if weights is None:
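            # Cache miss: download the serialized weights object from the bucket and deserialize it.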
            load_file_path, _ = maybe_download_object(
                storage_client=self.storage_client,
                bucket_name=self.bucket_name,
                object_name=policy_key)

            with open(load_file_path, "rb") as dill_file:
                weights = load(file=dill_file)

        return weights
Example #8
    def init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(trainer):

        storage_client = connect_storage_client()
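        # Download one specific (hard-coded) policy checkpoint and set it as the train policy weights on every rollout worker.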
        weights_local_path, _ = maybe_download_object(storage_client=storage_client,
                                                      bucket_name=BUCKET_NAME,
                                                      object_name="learner_leduc_poker_sac_arch1_psro_sequential_explore_coeff_0.0/learner_leduc_poker_sac_arch1_psro_sequential_explore_coeff_0.0_sage_pid_29557_11.47.05PM_May-20-2020/policy_submissions/12.00.49AM_May-21-2020_iter_2263.dill",
                                                      force_download=False)

        def worker_set_train_policy_weights(worker):
            train_policy = worker.policy_map[TRAIN_POLICY]
            train_policy.load_model_weights(load_file_path=weights_local_path,
                                            add_scope_prefix=TRAIN_POLICY)

        trainer.workers.foreach_worker(worker_set_train_policy_weights)
Example #9
        def init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(
                trainer):
            local_static_policy = trainer.workers.local_worker(
            ).policy_map[STATIC_POLICY]
            local_train_policy = trainer.workers.local_worker(
            ).policy_map[TRAIN_POLICY]
            if not hasattr(local_static_policy, 'static_policy_selection_probs') or \
                    local_static_policy.static_policy_selection_probs is None:
                print(
                    colored(
                        f"Policy {trainer.claimed_policy_num}: Payoff table is empty so Initializing train policy to random",
                        "white"))
                local_train_policy.init_tag = "init from random"
                return

            selected_policy_index = np.random.choice(
                a=list(
                    range(
                        len(local_static_policy.static_policy_selection_probs))
                ),
                p=local_static_policy.static_policy_selection_probs)
            selected_policy_spec: PolicySpec = local_static_policy.payoff_table.get_policy_for_index(
                selected_policy_index)
            local_train_policy.init_tag = f"full init from {selected_policy_spec.key}"

            # may not necessarily be true in all scripts
            assert selected_policy_spec.class_name == TRAIN_POLICY_CLASS.__name__
            assert selected_policy_spec.config_key == TRAIN_POLICY_MODEL_CONFIG_KEY
            storage_client = connect_storage_client()
            weights_local_path, _ = maybe_download_object(
                storage_client=storage_client,
                bucket_name=BUCKET_NAME,
                object_name=selected_policy_spec.key,
                force_download=False)

            print(
                colored(
                    f"Policy {trainer.claimed_policy_num}: Initializing train policy to {selected_policy_spec.key}",
                    "white"))

            # TODO: Here
            def worker_set_train_policy_weights(worker):
                train_policy = worker.policy_map[TRAIN_POLICY]
                train_policy.load_model_weights(
                    load_file_path=weights_local_path,
                    add_scope_prefix=TRAIN_POLICY)

            trainer.workers.foreach_worker(worker_set_train_policy_weights)
Example #10
                                          full_experiment_name,
                                          "policy_submissions",
                                          checkpoint_name)
                storage_client = connect_storage_client()
                upload_file(storage_client=storage_client,
                            bucket_name=BUCKET_NAME,
                            object_key=policy_key,
                            local_source_path=checkpoint_save_path)
                trainer.manager_interface.submit_new_policy_for_population(
                    policy_weights_key=policy_key,
                    policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY,
                    policy_class_name=TRAIN_POLICY_CLASS.__name__,
                    policy_tags=tags)

        train_model_config_local_file_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=TRAIN_POLICY_MODEL_CONFIG_KEY)
        with open(train_model_config_local_file_path, 'r') as config_file:
            train_model_config = json.load(fp=config_file)

        static_model_config_local_file_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=STATIC_POLICY_MODEL_CONFIG_KEY)
        with open(static_model_config_local_file_path, 'r') as config_file:
            static_model_config = json.load(fp=config_file)

        def train_policy_mapping_fn(agent_id):
            if agent_id == 1:
                return TRAIN_POLICY
            elif agent_id == 0 or agent_id == -1:
Example #11
def measure_exploitability_of_metanashes_as_they_become_available():
    logger = get_logger()

    storage_client = connect_storage_client()

    worker_id = f"Exploitability_Tracker_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"

    manager_interface = ConsoleManagerInterface(
        server_host=MANAGER_SEVER_HOST,
        port=MANAGER_PORT,
        worker_id=worker_id,
        storage_client=storage_client,
        minio_bucket_name=BUCKET_NAME,
        minio_local_dir=DEFAULT_LOCAL_SAVE_PATH)

    logger.info(f"Started worker \'{worker_id}\'")

    # If you use ray for more than just this single example fn, you'll need to move ray.init to the top of your main()
    ray.init(address=os.getenv('RAY_HEAD_NODE'),
             ignore_reinit_error=True,
             local_mode=True)

    model_config_file_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=MODEL_CONFIG_KEY,
        force_download=False)

    with open(model_config_file_path, 'r') as config_file:
        model_config = json.load(fp=config_file)

    example_env = PokerMultiAgentEnv(env_config=POKER_ENV_CONFIG)

    logger.info("\n\n\n\n\n__________________________________________\n"
                f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                f"__________________________________________\n\n\n\n\n")

    obs_space = example_env.observation_space
    act_space = example_env.action_space

    preprocessor = StrategoDictFlatteningPreprocessor(obs_space=obs_space)
    graph = tf.Graph()
    sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}),
                      graph=graph)

    def fetch_logits(policy):
        return {
            "behaviour_logits": policy.model.last_output(),
        }

    _policy_cls = POLICY_CLASS.with_updates(
        extra_action_fetches_fn=fetch_logits)

    with graph.as_default():
        with sess.as_default():
            policy = _policy_cls(obs_space=preprocessor.observation_space,
                                 action_space=act_space,
                                 config=with_common_config({
                                     'model':
                                     with_base_config(
                                         base_config=MODEL_DEFAULTS,
                                         extra_config=model_config),
                                     'env':
                                     POKER_ENV,
                                     'env_config':
                                     POKER_ENV_CONFIG,
                                     'custom_preprocessor':
                                     STRATEGO_PREPROCESSOR
                                 }))

    def set_policy_weights(weights_key):
        weights_file_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=weights_key,
            force_download=False)
        policy.load_model_weights(weights_file_path)

    print("(Started Successfully)")

    last_payoff_table_key = None
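    # Poll the manager for new payoff tables; for each new one, compute a fictitious-play metanash and measure its exploitability.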
    while True:
        payoff_table, payoff_table_key = manager_interface.get_latest_payoff_table(
            infinite_retry_on_error=True)
        if payoff_table_key == last_payoff_table_key:
            time.sleep(20)
            continue
        last_payoff_table_key = payoff_table_key

        metanash_probs, _, _ = get_fp_metanash_for_latest_payoff_table(
            manager_interface=manager_interface,
            fp_iters=20000,
            accepted_opponent_policy_class_names=[POLICY_CLASS_NAME],
            accepted_opponent_model_config_keys=[POKER_ENV_CONFIG],
            add_payoff_matrix_noise_std_dev=0.000,
            mix_with_uniform_dist_coeff=None,
            p_or_lower_rounds_to_zero=0.0)

        if metanash_probs is not None:
            policy_weights_keys = payoff_table.get_ordered_keys_in_payoff_matrix(
            )

            policy_dict = {
                key: prob
                for key, prob in zip(policy_weights_keys, metanash_probs)
            }

            exploitability = measure_exploitability_nonlstm(
                rllib_policy=policy,
                poker_game_version=POKER_GAME_VERSION,
                policy_mixture_dict=policy_dict,
                set_policy_weights_fn=set_policy_weights)
            print(f"Exploitability: {exploitabilitly}")
Example #12
    def get_policy_fn(stratego_env_config):

        from mprl.utility_services.cloud_storage import maybe_download_object
        from mprl.rl.sac.sac_policy import SACDiscreteTFPolicy
        from mprl.rl.ppo.ppo_stratego_model_policy import PPOStrategoModelTFPolicy
        from mprl.rl.common.stratego_preprocessor import STRATEGO_PREPROCESSOR, StrategoDictFlatteningPreprocessor
        from ray.rllib.agents.trainer import with_common_config, with_base_config
        from ray.rllib.models.catalog import MODEL_DEFAULTS
        from mprl.rl.common.sac_spatial_stratego_model import SAC_SPATIAL_STRATEGO_MODEL
        import ray
        from ray.rllib.utils import try_import_tf
        import json
        import os
        tf = try_import_tf()

        from tensorflow.python.client import device_lib

        def get_available_gpus():
            local_device_protos = device_lib.list_local_devices()
            return [x.name for x in local_device_protos if x.device_type == 'GPU']


        # If you use ray for more than just this single example fn, you'll need to move ray.init to the top of your main()
        ray.init(address=os.getenv('RAY_HEAD_NODE'), ignore_reinit_error=True, local_mode=True)

        if policy_class_name == 'PPOStrategoModelTFPolicy':
            _policy_class = PPOStrategoModelTFPolicy
        elif policy_class_name == 'SACDiscreteTFPolicy':
            _policy_class = SACDiscreteTFPolicy
        else:
            raise NotImplementedError(f"Eval for policy class \'{policy_class_name}\' not implemented.")

        if model_config_object_key:
            with download_lock:
                model_config_file_path, _ = maybe_download_object(storage_client=storage_client,
                                                                  bucket_name=minio_bucket_name,
                                                                  object_name=model_config_object_key,
                                                                  force_download=False)

                with open(model_config_file_path, 'r') as config_file:
                    model_config = json.load(fp=config_file)
        else:
            model_config = manual_config

        example_env = stratego_env_config['env_class'](env_config=stratego_env_config)
        obs_space = example_env.observation_space
        act_space = example_env.action_space

        preprocessor = StrategoDictFlatteningPreprocessor(obs_space=obs_space)


        graph = tf.Graph()

        if os.getenv("EVALUATOR_USE_GPU") == 'true':
            gpu = 1
        else:
            gpu = 0

        config = tf.ConfigProto(device_count={'GPU': gpu})
        if gpu:
            config.gpu_options.allow_growth = True
        sess = tf.Session(config=config, graph=graph)

        with graph.as_default():
            with sess.as_default():
                policy = _policy_class(
                    obs_space=preprocessor.observation_space,
                    action_space=act_space,
                    config=with_common_config({
                        'model': with_base_config(base_config=MODEL_DEFAULTS, extra_config=model_config),
                        'env': POKER_ENV,
                        'env_config': stratego_env_config,
                        'custom_preprocessor': STRATEGO_PREPROCESSOR,
                    }))

                if model_weights_object_key:
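                    # Optionally load initial weights for the policy from a specific object in storage.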
                    with download_lock:
                        weights_file_path, _ = maybe_download_object(storage_client=storage_client,
                                                                     bucket_name=minio_bucket_name,
                                                                     object_name=model_weights_object_key,
                                                                     force_download=False)
                        policy.load_model_weights(weights_file_path)
                    policy.current_model_weights_key = weights_file_path
                else:
                    policy.current_model_weights_key = None

        def policy_fn(observation, policy_state=None):
            if policy_state is None:
                policy_state = policy.get_initial_state()

            current_player_perspective_action_index, policy_state, _ = policy.compute_single_action(
                obs=preprocessor.transform(observation),
                state=policy_state)

            return current_player_perspective_action_index, policy_state

        if population_policy_keys_to_selection_probs is not None:

            def sample_new_policy_weights_from_population():
                new_policy_key = np.random.choice(a=list(population_policy_keys_to_selection_probs.keys()),
                                                  p=list(population_policy_keys_to_selection_probs.values()))
                if new_policy_key != policy.current_model_weights_key:
                    with download_lock:
                        weights_file_path, _ = maybe_download_object(storage_client=storage_client,
                                                                     bucket_name=minio_bucket_name,
                                                                     object_name=new_policy_key,
                                                                     force_download=False)
                        policy.load_model_weights(weights_file_path)
                        logger.debug(f"Sampling new population weights from {new_policy_key}")
                    policy.current_model_weights_key = new_policy_key

            return policy_name, policy_fn, sample_new_policy_weights_from_population

        # policy name must be unique
        return policy_name, policy_fn
Example #13
    storage_client = connect_storage_client()

    manager_host = "localhost"
    manager_port = 2828

    new_manager_interface = LearnerManagerInterface(
        server_host=manager_host,
        port=manager_port,
        worker_id="rebuild_payoff_learner",
        storage_client=storage_client,
        minio_bucket_name="stratego")

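    # Download an old payoff table checkpoint and re-submit each of its policies to a freshly started manager.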
    old_payoff_table_local_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name="stratego",
        object_name=
        "population_server/sage_pid_31932_06_48_20PM_Apr-24-2020/payoff_tables/payoff_table_13_polices_1_pending_sage_pid_31932_07_35_09PM_Apr-25-2020.dill"
    )

    old_payoff_table = PayoffTable.from_dill_file(old_payoff_table_local_path)

    if input(
            f"You're about to add a bunch of policies to the manager at {manager_host}:{manager_port}\n"
            f"Are you sure? Type \'y\' to go through with this: ") != 'y':
        print("(doing nothing and exiting)")
        exit(0)

    for index in range(old_payoff_table.size()):
        policy: PolicySpec = old_payoff_table.get_policy_for_index(index=index)

        new_manager_interface.submit_new_policy_for_population(