Example #1
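    # Trainer-init callback: connect to the storage backend, build the learner's manager
    # interface, compute fictitious-play metanash selection probs over the latest payoff table,
    # and broadcast them (with the dill-pickled table) to each rollout worker's static policy.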
    def init_static_policy_distribution_after_trainer_init_callback(trainer):
        trainer.storage_client = connect_storage_client()

        logger.info("Initializing trainer manager interface")
        trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SERVER_HOST,
                                                            port=MANAGER_PORT,
                                                            worker_id=full_experiment_name,
                                                            storage_client=trainer.storage_client,
                                                            minio_bucket_name=BUCKET_NAME)

        selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
            manager_interface=trainer.manager_interface,
            fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
            accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
            accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
            add_payoff_matrix_noise_std_dev=0.0,
            mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF
        )

        if selection_probs is None:
            assert payoff_table is None
            assert payoff_table_key is None
            print("Payoff table is empty so using random weights for static policy.")
        else:
            print(f"Payoff table loaded from {payoff_table_key}")
            print(f"Policy selection probs: {selection_probs}")

        payoff_table_dill_str = dill.dumps(payoff_table)
        def worker_set_static_policy_distribution(worker):
            worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
            worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
            worker.policy_map[STATIC_POLICY].current_policy_key = None

        trainer.workers.foreach_worker(worker_set_static_policy_distribution)
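
    # Variant of the same callback that fetches the raw payoff table directly from the manager
    # rather than requesting a metanash result.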
    def init_static_policy_distribution_after_trainer_init_callback(trainer):
        trainer.storage_client = connect_storage_client()

        logger.info("Initializing trainer manager interface")
        trainer.manager_interface = LearnerManagerInterface(
            server_host=MANAGER_SERVER_HOST,
            port=MANAGER_PORT,
            worker_id=full_experiment_name,
            storage_client=trainer.storage_client,
            minio_bucket_name=BUCKET_NAME)
        payoff_table, payoff_table_key = trainer.manager_interface.get_latest_payoff_table()
        # This excerpt omitted how selection_probs was derived; as an assumption, fall back to a
        # uniform distribution over the payoff table's policies (None when the table is empty).
        if payoff_table is None:
            selection_probs = None
        else:
            num_policies = len(payoff_table.get_ordered_keys_in_payoff_matrix())
            selection_probs = [1.0 / num_policies] * num_policies

        if selection_probs is None:
            assert payoff_table is None
            assert payoff_table_key is None
            print(
                "Payoff table is empty so using random weights for static policy."
            )
        else:
            print(f"Payoff table loaded from {payoff_table_key}")
            print(f"Policy selection probs: {selection_probs}")

        payoff_table_dill_str = dill.dumps(payoff_table)

        def worker_set_static_policy_distribution(worker):
            worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
            worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
            worker.policy_map[STATIC_POLICY].current_policy_key = None

        trainer.workers.foreach_worker(worker_set_static_policy_distribution)
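
    # Pipeline-learner trainer-init callback: set a warmup entropy target on the train policy,
    # claim a policy number from the live payoff-table tracker, checkpoint, optionally wait at a
    # barrier for the other learners, then fetch the live payoff table and (if configured)
    # initialize the train policy from the static-policy population.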
    def claim_new_active_policy_after_trainer_init_callback(trainer):
        def set_train_policy_warmup_target_entropy_proportion(worker):
            worker.policy_map[TRAIN_POLICY].set_target_entropy_proportion(
                PIPELINE_WARMUP_ENTROPY_TARGET_PROPORTION)

        trainer.workers.foreach_worker(
            set_train_policy_warmup_target_entropy_proportion)

        trainer.storage_client = connect_storage_client()

        logger.info("Initializing trainer manager interface")
        trainer.manager_interface = LearnerManagerInterface(
            server_host=MANAGER_SERVER_HOST,
            port=MANAGER_PORT,
            worker_id=full_experiment_name,
            storage_client=trainer.storage_client,
            minio_bucket_name=BUCKET_NAME)

        trainer.live_table_tracker = LivePolicyPayoffTracker.remote(
            minio_endpoint=MINIO_ENDPOINT,
            minio_access_key=MINIO_ACCESS_KEY,
            minio_secret_key=MINIO_SECRET_KEY,
            minio_bucket=BUCKET_NAME,
            manager_host=MANAGER_SERVER_HOST,
            manager_port=MANAGER_PORT,
            lock_server_host=LOCK_SERVER_HOST,
            lock_server_port=LOCK_SERVER_PORT,
            worker_id=full_experiment_name,
            policy_class_name=TRAIN_POLICY_CLASS.__name__,
            policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY,
            provide_payoff_barrier_sync=not PIPELINE_LIVE_PAYOFF_TABLE_CALC_IS_ASYNCHRONOUS)
        trainer.claimed_policy_num = ray_get_and_free(
            trainer.live_table_tracker.get_claimed_policy_num.remote())
        trainer.are_all_lower_policies_finished = False
        trainer.payoff_table_needs_update_started = False
        trainer.payoff_table = None
        _do_live_policy_checkpoint(trainer=trainer, training_iteration=0)

        if not PIPELINE_LIVE_PAYOFF_TABLE_CALC_IS_ASYNCHRONOUS:
            # wait for all other learners to also reach this point before continuing
            ray_get_and_free(
                trainer.live_table_tracker.wait_at_barrier_for_other_learners.remote())

        trainer.new_payoff_table_promise = trainer.live_table_tracker.get_live_payoff_table_dill_pickled.remote(
            first_wait_for_n_seconds=2)
        _process_new_live_payoff_table_result_if_ready(
            trainer=trainer, block_until_result_is_ready=True)

        if INIT_FROM_POPULATION:
            init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(
                trainer=trainer)
        else:
            print(colored(
                f"Policy {trainer.claimed_policy_num}: (Initializing train policy to random)",
                "white"))
Example #4
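        # Sets up the storage client and the learner's manager interface.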
        def init_static_policy_distribution_after_trainer_init_callback(
                trainer):
            trainer.storage_client = connect_storage_client()

            logger.info("Initializing trainer manager interface")
            trainer.manager_interface = LearnerManagerInterface(
                server_host=MANAGER_SERVER_HOST,
                port=MANAGER_PORT,
                worker_id=full_experiment_name,
                storage_client=trainer.storage_client,
                minio_bucket_name=BUCKET_NAME)
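
        # Rectified-PSRO variant: pull the latest payoff table, derive rectified selection probs
        # for this job's assigned policy key, and push them to the workers' static policy.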
        def init_static_policy_distribution_after_trainer_init_callback(trainer):
            trainer.storage_client = connect_storage_client()

            logger.info("Initializing trainer manager interface")
            trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SERVER_HOST,
                                                                port=MANAGER_PORT,
                                                                worker_id=full_experiment_name,
                                                                storage_client=trainer.storage_client,
                                                                minio_bucket_name=BUCKET_NAME)

            trainer.lock_server_interface = LockServerInterface(server_host=LOCK_SERVER_HOST,
                                                                port=LOCK_SERVER_PORT,
                                                                worker_id=f"rectified_psro_learner_{gethostname()}_pid_{os.getpid()}")

            payoff_table, payoff_table_key = trainer.manager_interface.get_latest_payoff_table(infinite_retry_on_error=True)
            if payoff_table is None:
                assert job_init_policy_key == 'random'
                assert payoff_table_key is None
                selection_probs = None
                print(colored(
                    f"Payoff table is empty so using random weights for static policy.", "white"))
            else:
                assert job_init_policy_key != 'random'
                policies_str = ""
                for policy_key in payoff_table.get_ordered_keys_in_payoff_matrix():
                    policies_str += f"{policy_key}"
                print(colored(
                    f"Payoff Table Policies: {colored(policies_str, 'white')}\n",
                    "white"))

                selection_probs = get_rectified_selection_probs_for_policy_key(payoff_table=payoff_table,
                                                                               policy_key=job_init_policy_key,
                                                                               fp_iters=METANASH_FICTITIOUS_PLAY_ITERS)
                print(colored(f"Rectified Policy selection probs: {selection_probs}", "white"))

            if selection_probs is None:
                assert payoff_table is None
                assert payoff_table_key is None
                print("Payoff table is empty so using random weights for static policy.")
            else:
                print(f"Payoff table loaded from {payoff_table_key}")
                print(f"Policy selection probs: {selection_probs}")

            payoff_table_dill_str = dill.dumps(payoff_table)

            def worker_set_static_policy_distribution(worker):
                worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
                worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
                worker.policy_map[STATIC_POLICY].current_policy_key = None

            trainer.workers.foreach_worker(worker_set_static_policy_distribution)
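
        # Lock-server variant: compute the metanash, reserve one policy key to leave out via the
        # lock server, recompute selection probs over the remaining policies, and distribute the
        # result to the workers' static policy.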
        def init_static_policy_distribution_after_trainer_init_callback(
                trainer):
            trainer.storage_client = connect_storage_client()

            logger.info("Initializing trainer manager interface")
            trainer.manager_interface = LearnerManagerInterface(
                server_host=MANAGER_SERVER_HOST,
                port=MANAGER_PORT,
                worker_id=full_experiment_name,
                storage_client=trainer.storage_client,
                minio_bucket_name=BUCKET_NAME)

            logger.info("Initializing trainer lock server interface")
            trainer.lock_server_interface = LockServerInterface(
                server_host=LOCK_SERVER_HOST,
                port=LOCK_SERVER_PORT,
                worker_id=full_experiment_name)

            orig_selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
                manager_interface=trainer.manager_interface,
                fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
                accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
                accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
                add_payoff_matrix_noise_std_dev=0.0,
                mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

            if orig_selection_probs is None:
                assert payoff_table is None
                assert payoff_table_key is None
                selection_probs = None
                print(
                    "Payoff table is empty so using random weights for static policy."
                )
            else:
                print(f"Payoff table loaded from {payoff_table_key}")
                print(f"Original Selection Probs: {orig_selection_probs}")

                policy_key_to_leave_out = get_unreserved_policy_key_with_priorities(
                    lock_server_interface=trainer.lock_server_interface,
                    policy_keys=payoff_table.get_ordered_keys_in_payoff_matrix(),
                    policy_priorities=orig_selection_probs)

                if policy_key_to_leave_out is None:
                    selection_probs = orig_selection_probs
                    print(
                        "No policy keys available to reserve so using unaltered selection probs"
                    )
                else:
                    chosen_policy_selection_prob = orig_selection_probs[
                        payoff_table.get_policy_spec_for_key(
                            policy_key_to_leave_out).get_payoff_matrix_index()]
                    print(f"\n\nLeaving out {policy_key_to_leave_out}\n"
                          f"(Had selection prob of {chosen_policy_selection_prob})\n\n")

                    selection_probs = get_fp_metanash_for_payoff_table(
                        payoff_table=payoff_table,
                        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
                        accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
                        accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
                        add_payoff_matrix_noise_std_dev=0.0,
                        leave_out_indexes=[
                            payoff_table.get_policy_spec_for_key(
                                policy_key_to_leave_out).get_payoff_matrix_index()
                        ],
                        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)
                    print(f"Subset Selection Probs: {selection_probs}")

            if selection_probs is None:
                assert payoff_table is None
                assert payoff_table_key is None
                print(
                    "Payoff table is empty so using random weights for static policy."
                )
            else:
                print(f"Payoff table loaded from {payoff_table_key}")
                print(f"Policy selection probs: {selection_probs}")

            payoff_table_dill_str = dill.dumps(payoff_table)

            def worker_set_static_policy_distribution(worker):
                worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
                worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
                worker.policy_map[STATIC_POLICY].current_policy_key = None

            trainer.workers.foreach_worker(
                worker_set_static_policy_distribution)
                f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                f"__________________________________________\n\n\n\n\n")

    storage_client = connect_storage_client()

    ray.init(address=os.getenv('RAY_HEAD_NODE'), ignore_reinit_error=True)
    logger.info("Ray Web UI at {}".format(ray.get_webui_url()))

    base_experiment_name = f"{CLOUD_PREFIX}learner_{POKER_GAME_VERSION}_sac_arch1_hparam_search_multexp"
    full_experiment_name = f"{base_experiment_name}_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"
    experiment_save_dir = os.path.join(DEFAULT_RESULTS_DIR,
                                       full_experiment_name)

    manager_interface = LearnerManagerInterface(server_host=MANAGER_SERVER_HOST,
                                                port=MANAGER_PORT,
                                                worker_id=full_experiment_name,
                                                storage_client=storage_client,
                                                minio_bucket_name=BUCKET_NAME)

    selection_probs, _, _ = get_fp_metanash_for_latest_payoff_table(
        manager_interface=manager_interface,
        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
        accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
        accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
        add_payoff_matrix_noise_std_dev=0.0,
        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

    def init_static_policy_distribution_after_trainer_init_callback(trainer):
        trainer.storage_client = connect_storage_client()