def init_static_policy_distribution_after_trainer_init_callback(trainer):
    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=full_experiment_name,
        storage_client=trainer.storage_client,
        minio_bucket_name=BUCKET_NAME)

    selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
        manager_interface=trainer.manager_interface,
        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
        accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
        accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
        add_payoff_matrix_noise_std_dev=0.0,
        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

    if selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Policy selection probs: {selection_probs}")

    # Serialize once on the driver; each worker deserializes its own copy.
    payoff_table_dill_str = dill.dumps(payoff_table)

    def worker_set_static_policy_distribution(worker):
        worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
        worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
        worker.policy_map[STATIC_POLICY].current_policy_key = None

    trainer.workers.foreach_worker(worker_set_static_policy_distribution)
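# Hypothetical wiring sketch (not from the original file): this callback matches the
# signature of the `after_init` hook in RLlib's trainer-template API (ray 0.8.x-era).
# The trainer factory below is an assumed example, not the project's actual one.
#
# from ray.rllib.agents.trainer_template import build_trainer
# MyPSROTrainer = build_trainer(
#     name="MyPSROTrainer",
#     default_policy=TRAIN_POLICY_CLASS,
#     after_init=init_static_policy_distribution_after_trainer_init_callback)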
def _do_live_policy_checkpoint(trainer, training_iteration):
    local_train_policy = trainer.workers.local_worker().policy_map[TRAIN_POLICY]
    checkpoints_dir = os.path.join(experiment_save_dir, "policy_checkpoints")
    checkpoint_name = f"policy_{trainer.claimed_policy_num}_{datetime_str()}_iter_{training_iteration}.dill"
    checkpoint_save_path = os.path.join(checkpoints_dir, checkpoint_name)

    # Save the train policy's weights locally, then upload them to shared storage.
    local_train_policy.save_model_weights(save_file_path=checkpoint_save_path,
                                          remove_scope_prefix=TRAIN_POLICY)
    policy_key = os.path.join(base_experiment_name, full_experiment_name,
                              "policy_checkpoints", checkpoint_name)
    storage_client = connect_storage_client()
    upload_file(storage_client=storage_client,
                bucket_name=BUCKET_NAME,
                object_key=policy_key,
                local_source_path=checkpoint_save_path)

    # Publish the new checkpoint key to the live payoff-table tracker.
    locks_checkpoint_name = f"dch_population_checkpoint_{datetime_str()}"
    ray_get_and_free(trainer.live_table_tracker.set_latest_key_for_claimed_policy.remote(
        new_key=policy_key,
        request_locks_checkpoint_with_name=locks_checkpoint_name))
def __init__(self, config):
    # Stop event for server handler threads to signal this thread that it's time to shut down.
    self._stop_event = threading.Event()

    self._grpc_port = config['grpc_port']
    self._grpc_server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=config['num_thread_workers']))

    self._storage_client = connect_storage_client()

    self._root_save_dir = config['logs_and_payoff_table_save_key_prefix'] \
        .replace("DATETIMESTR", datetime_str()) \
        .replace("HOSTNAME", gethostname()) \
        .replace("PID", str(os.getpid()))
    self._root_save_dir = f"{CLOUD_PREFIX}{self._root_save_dir}"
    logger.info(f"root save key prefix is {self._root_save_dir}")
    self._payoff_table_save_dir = os.path.join(self._root_save_dir, "payoff_tables")

    servicer = _PopulationServerServicerImpl(
        stop_event=self._stop_event,
        payoff_table_save_key_prefix_dir=self._payoff_table_save_dir,
        storage_client=self._storage_client,
        bucket_name=BUCKET_NAME,
        max_ping_interval_seconds_to_track_workers=config['max_ping_interval_seconds_to_track_workers'],
        num_games_to_play_for_matchup_evals=config['games_per_eval_matchup'],
        restore_from_payoff_table_key=config['restore_from_payoff_table_key'])
    add_PopulationServerServicer_to_server(servicer=servicer, server=self._grpc_server)
    self._grpc_server.add_insecure_port(f'[::]:{self._grpc_port}')
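# Hypothetical usage sketch (an assumed helper, not part of the original class):
# start serving and block until a servicer thread sets the stop event created
# in __init__ above.
def run_until_stopped(self):
    self._grpc_server.start()  # begin accepting RPCs on the insecure port
    self._stop_event.wait()  # block until a handler thread requests shutdown
    self._grpc_server.stop(grace=2).wait()  # drain in-flight RPCs, then return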
def sample_new_static_policy_weights_for_each_worker_on_episode_start(params):
    policies = params['policy']
    static_policy = policies[STATIC_POLICY]

    if static_policy.static_policy_selection_probs is None:
        return

    selected_policy_index = np.random.choice(
        a=list(range(len(static_policy.static_policy_selection_probs))),
        p=static_policy.static_policy_selection_probs)
    selected_policy_spec: PolicySpec = static_policy.payoff_table.get_policy_for_index(
        selected_policy_index)
    assert selected_policy_spec.class_name in ACCEPTED_OPPONENT_POLICY_CLASS_NAMES
    assert selected_policy_spec.config_key in ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS

    # Only download and load new weights if the sampled policy differs from the one
    # currently loaded on this worker.
    if static_policy.current_policy_key != selected_policy_spec.key:
        # print(f"sampled policy {selected_policy_spec.key} (loading weights)")
        storage_client = connect_storage_client()
        weights_local_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=selected_policy_spec.key,
            force_download=False)
        static_policy.load_model_weights(load_file_path=weights_local_path,
                                         add_scope_prefix=STATIC_POLICY)
        static_policy.current_policy_key = selected_policy_spec.key
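# Hypothetical wiring sketch, assuming the legacy RLlib callbacks-dict API
# (ray 0.8.x-era), where "on_episode_start" callbacks receive a params dict
# containing the worker's policy map under the 'policy' key:
#
# trainer_config = {
#     "callbacks": {
#         "on_episode_start":
#             sample_new_static_policy_weights_for_each_worker_on_episode_start,
#     },
# }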
def init_static_policy_distribution_after_trainer_init_callback(trainer):
    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=full_experiment_name,
        storage_client=trainer.storage_client,
        minio_bucket_name=BUCKET_NAME)

    selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
        manager_interface=trainer.manager_interface,
        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
        accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
        accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
        add_payoff_matrix_noise_std_dev=0.0,
        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

    if selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Policy selection probs: {selection_probs}")

    payoff_table_dill_str = dill.dumps(payoff_table)

    def worker_set_static_policy_distribution(worker):
        worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
        worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
        worker.policy_map[STATIC_POLICY].current_policy_key = None

    trainer.workers.foreach_worker(worker_set_static_policy_distribution)
def claim_new_active_policy_after_trainer_init_callback(trainer):
    def set_train_policy_warmup_target_entropy_proportion(worker):
        worker.policy_map[TRAIN_POLICY].set_target_entropy_proportion(
            PIPELINE_WARMUP_ENTROPY_TARGET_PROPORTION)

    trainer.workers.foreach_worker(set_train_policy_warmup_target_entropy_proportion)

    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=full_experiment_name,
        storage_client=trainer.storage_client,
        minio_bucket_name=BUCKET_NAME)

    trainer.live_table_tracker = LivePolicyPayoffTracker.remote(
        minio_endpoint=MINIO_ENDPOINT,
        minio_access_key=MINIO_ACCESS_KEY,
        minio_secret_key=MINIO_SECRET_KEY,
        minio_bucket=BUCKET_NAME,
        manager_host=MANAGER_SERVER_HOST,
        manager_port=MANAGER_PORT,
        lock_server_host=LOCK_SERVER_HOST,
        lock_server_port=LOCK_SERVER_PORT,
        worker_id=full_experiment_name,
        policy_class_name=TRAIN_POLICY_CLASS.__name__,
        policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY,
        provide_payoff_barrier_sync=not PIPELINE_LIVE_PAYOFF_TABLE_CALC_IS_ASYNCHRONOUS)
    trainer.claimed_policy_num = ray_get_and_free(
        trainer.live_table_tracker.get_claimed_policy_num.remote())
    trainer.are_all_lower_policies_finished = False
    trainer.payoff_table_needs_update_started = False
    trainer.payoff_table = None

    _do_live_policy_checkpoint(trainer=trainer, training_iteration=0)

    if not PIPELINE_LIVE_PAYOFF_TABLE_CALC_IS_ASYNCHRONOUS:
        # Wait for all other learners to also reach this point before continuing.
        ray_get_and_free(
            trainer.live_table_tracker.wait_at_barrier_for_other_learners.remote())

    trainer.new_payoff_table_promise = trainer.live_table_tracker.get_live_payoff_table_dill_pickled.remote(
        first_wait_for_n_seconds=2)
    _process_new_live_payoff_table_result_if_ready(trainer=trainer,
                                                   block_until_result_is_ready=True)

    if INIT_FROM_POPULATION:
        init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(
            trainer=trainer)
    else:
        print(colored(
            f"Policy {trainer.claimed_policy_num}: (Initializing train policy to random)",
            "white"))
def init_static_policy_distribution_after_trainer_init_callback(trainer):
    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=full_experiment_name,
        storage_client=trainer.storage_client,
        minio_bucket_name=BUCKET_NAME)
def set_policy_weights(weights_key):
    print(f"weights are {weights_key}")
    storage_client = connect_storage_client()
    weights_file_path, _ = maybe_download_object(storage_client=storage_client,
                                                 bucket_name=BUCKET_NAME,
                                                 object_name=weights_key,
                                                 force_download=False)
    print("got weights")
    local_exploit_rllib_policy.load_model_weights(weights_file_path,
                                                  add_scope_prefix=STATIC_POLICY)
def init_static_policy_distribution_after_trainer_init_callback(trainer):
    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=full_experiment_name,
        storage_client=trainer.storage_client,
        minio_bucket_name=BUCKET_NAME)

    trainer.lock_server_interface = LockServerInterface(
        server_host=LOCK_SERVER_HOST,
        port=LOCK_SERVER_PORT,
        worker_id=f"rectified_psro_learner_{gethostname()}_pid_{os.getpid()}")

    payoff_table, payoff_table_key = trainer.manager_interface.get_latest_payoff_table(
        infinite_retry_on_error=True)
    if payoff_table is None:
        assert job_init_policy_key == 'random'
        assert payoff_table_key is None
        selection_probs = None
        print(colored(
            "Payoff table is empty so using random weights for static policy.",
            "white"))
    else:
        assert job_init_policy_key != 'random'
        policies_str = ""
        for policy_key in payoff_table.get_ordered_keys_in_payoff_matrix():
            policies_str += f"{policy_key}\n"
        print(colored(f"Payoff Table Policies: {colored(policies_str, 'white')}\n", "white"))
        selection_probs = get_rectified_selection_probs_for_policy_key(
            payoff_table=payoff_table,
            policy_key=job_init_policy_key,
            fp_iters=METANASH_FICTITIOUS_PLAY_ITERS)
        print(colored(f"Rectified Policy selection probs: {selection_probs}", "white"))

    if selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Policy selection probs: {selection_probs}")

    payoff_table_dill_str = dill.dumps(payoff_table)

    def worker_set_static_policy_distribution(worker):
        worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
        worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
        worker.policy_map[STATIC_POLICY].current_policy_key = None

    trainer.workers.foreach_worker(worker_set_static_policy_distribution)
def init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(trainer):
    storage_client = connect_storage_client()
    weights_local_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name="learner_leduc_poker_sac_arch1_psro_sequential_explore_coeff_0.0/"
                    "learner_leduc_poker_sac_arch1_psro_sequential_explore_coeff_0.0_sage_pid_29557_11.47.05PM_May-20-2020/"
                    "policy_submissions/12.00.49AM_May-21-2020_iter_2263.dill",
        force_download=False)

    def worker_set_train_policy_weights(worker):
        train_policy = worker.policy_map[TRAIN_POLICY]
        train_policy.load_model_weights(load_file_path=weights_local_path,
                                        add_scope_prefix=TRAIN_POLICY)

    trainer.workers.foreach_worker(worker_set_train_policy_weights)
def init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(trainer):
    local_static_policy = trainer.workers.local_worker().policy_map[STATIC_POLICY]
    local_train_policy = trainer.workers.local_worker().policy_map[TRAIN_POLICY]
    if not hasattr(local_static_policy, 'static_policy_selection_probs') or \
            local_static_policy.static_policy_selection_probs is None:
        print(colored(
            f"Policy {trainer.claimed_policy_num}: Payoff table is empty so initializing train policy to random",
            "white"))
        local_train_policy.init_tag = "init from random"
        return

    selected_policy_index = np.random.choice(
        a=list(range(len(local_static_policy.static_policy_selection_probs))),
        p=local_static_policy.static_policy_selection_probs)
    selected_policy_spec: PolicySpec = local_static_policy.payoff_table.get_policy_for_index(
        selected_policy_index)
    local_train_policy.init_tag = f"full init from {selected_policy_spec.key}"

    # May not necessarily be true in all scripts.
    assert selected_policy_spec.class_name == TRAIN_POLICY_CLASS.__name__
    assert selected_policy_spec.config_key == TRAIN_POLICY_MODEL_CONFIG_KEY

    storage_client = connect_storage_client()
    weights_local_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=selected_policy_spec.key,
        force_download=False)

    print(colored(
        f"Policy {trainer.claimed_policy_num}: Initializing train policy to {selected_policy_spec.key}",
        "white"))

    def worker_set_train_policy_weights(worker):
        train_policy = worker.policy_map[TRAIN_POLICY]
        train_policy.load_model_weights(load_file_path=weights_local_path,
                                        add_scope_prefix=TRAIN_POLICY)

    trainer.workers.foreach_worker(worker_set_train_policy_weights)
def __init__(self,
             cache_size=0,
             record_file_path=None,
             new_record_entry_every_n_seconds=DEFAULT_RECORD_ENTRY_INTERVAL_SECONDS,
             extra_data_keys=None):
    self.catalog = {}
    self.storage_client = connect_storage_client()
    self.bucket_name = BUCKET_NAME
    self.cache = OrderedDict()
    self.cache_size = cache_size
    self.record_file_path = record_file_path
    self.new_record_entry_every_n_seconds = new_record_entry_every_n_seconds
    self.start_time = time.time()
    self.last_record_entry_time = self.start_time
    self.extra_data_keys = extra_data_keys or []
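# Hypothetical companion method (not in the original __init__): a minimal LRU
# sketch showing how the OrderedDict `cache` and `cache_size` fields above could
# be used; `load_fn` is an assumed loader invoked on cache misses.
def _cache_get_or_load(self, object_key, load_fn):
    if object_key in self.cache:
        self.cache.move_to_end(object_key)  # mark as most recently used
        return self.cache[object_key]
    value = load_fn(object_key)
    if self.cache_size > 0:
        self.cache[object_key] = value
        if len(self.cache) > self.cache_size:
            self.cache.popitem(last=False)  # evict the least recently used entry
    return value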
def __init__(self,
             minio_bucket,
             manager_host,
             manager_port,
             lock_server_host,
             lock_server_port,
             worker_id,
             policy_class_name,
             policy_config_key,
             provide_payoff_barrier_sync=False):
    worker_id = f"live_pop_tracker_{worker_id[worker_id.find('pid'):]}"
    self._storage_client = connect_storage_client()
    self._minio_bucket = minio_bucket
    self._manager_interface = ConsoleManagerInterface(
        server_host=manager_host,
        port=manager_port,
        worker_id=worker_id,
        storage_client=self._storage_client,
        minio_bucket_name=self._minio_bucket)
    self._lock_interface = LockServerInterface(
        server_host=lock_server_host,
        port=lock_server_port,
        worker_id=worker_id)
    self._policy_class_name = policy_class_name
    self._policy_config_key = policy_config_key

    self._claimed_policy_num = None
    self._claim_new_active_policy()
    assert self._claimed_policy_num is not None

    self._locally_cached_matchup_results = {}

    self._provide_payoff_barrier_sync = provide_payoff_barrier_sync
    if self._provide_payoff_barrier_sync:
        self._wait_at_payoff_table_barrier_fn, self._leave_barrier_group_fn = \
            self._lock_interface.join_barrier_group(
                barrier_name="pt_barrier",
                member_name=str(self._claimed_policy_num),
                grace_period_for_others_to_join_s=20.0)
    else:
        self._wait_at_payoff_table_barrier_fn = None
        self._leave_barrier_group_fn = None
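# Hypothetical sketch of the barrier method that the trainer-init callback calls
# remotely (wait_at_barrier_for_other_learners); the body is an assumption based
# on the join_barrier_group return convention captured in __init__ above.
def wait_at_barrier_for_other_learners(self):
    assert self._provide_payoff_barrier_sync, \
        "barrier sync must be enabled at construction time"
    self._wait_at_payoff_table_barrier_fn()  # blocks until all group members arrive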
def measure_exploitability_of_metanashes_as_they_become_available():
    logger = get_logger()
    storage_client = connect_storage_client()
    worker_id = f"Exploitability_Tracker_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"
    manager_interface = ConsoleManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=worker_id,
        storage_client=storage_client,
        minio_bucket_name=BUCKET_NAME,
        minio_local_dir=DEFAULT_LOCAL_SAVE_PATH)
    logger.info(f"Started worker '{worker_id}'")

    # If you use ray for more than just this single example fn, you'll need to
    # move ray.init to the top of your main().
    ray.init(address=os.getenv('RAY_HEAD_NODE'),
             ignore_reinit_error=True,
             local_mode=True)

    model_config_file_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=MODEL_CONFIG_KEY,
        force_download=False)
    with open(model_config_file_path, 'r') as config_file:
        model_config = json.load(fp=config_file)

    example_env = PokerMultiAgentEnv(env_config=POKER_ENV_CONFIG)

    logger.info("\n\n\n\n\n__________________________________________\n"
                f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                f"__________________________________________\n\n\n\n\n")

    obs_space = example_env.observation_space
    act_space = example_env.action_space

    preprocessor = StrategoDictFlatteningPreprocessor(obs_space=obs_space)

    graph = tf.Graph()
    sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}), graph=graph)

    def fetch_logits(policy):
        return {
            "behaviour_logits": policy.model.last_output(),
        }

    _policy_cls = POLICY_CLASS.with_updates(extra_action_fetches_fn=fetch_logits)

    with graph.as_default():
        with sess.as_default():
            policy = _policy_cls(
                obs_space=preprocessor.observation_space,
                action_space=act_space,
                config=with_common_config({
                    'model': with_base_config(base_config=MODEL_DEFAULTS,
                                              extra_config=model_config),
                    'env': POKER_ENV,
                    'env_config': POKER_ENV_CONFIG,
                    'custom_preprocessor': STRATEGO_PREPROCESSOR,
                }))

    def set_policy_weights(weights_key):
        weights_file_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=weights_key,
            force_download=False)
        policy.load_model_weights(weights_file_path)

    print("(Started Successfully)")

    last_payoff_table_key = None
    while True:
        payoff_table, payoff_table_key = manager_interface.get_latest_payoff_table(
            infinite_retry_on_error=True)
        if payoff_table_key == last_payoff_table_key:
            # Nothing new to measure yet; poll again shortly.
            time.sleep(20)
            continue
        last_payoff_table_key = payoff_table_key

        metanash_probs, _, _ = get_fp_metanash_for_latest_payoff_table(
            manager_interface=manager_interface,
            fp_iters=20000,
            accepted_opponent_policy_class_names=[POLICY_CLASS_NAME],
            accepted_opponent_model_config_keys=[POKER_ENV_CONFIG],
            add_payoff_matrix_noise_std_dev=0.000,
            mix_with_uniform_dist_coeff=None,
            p_or_lower_rounds_to_zero=0.0)

        if metanash_probs is not None:
            policy_weights_keys = payoff_table.get_ordered_keys_in_payoff_matrix()
            policy_dict = {key: prob for key, prob in
                           zip(policy_weights_keys, metanash_probs)}

            exploitability = measure_exploitability_nonlstm(
                rllib_policy=policy,
                poker_game_version=POKER_GAME_VERSION,
                policy_mixture_dict=policy_dict,
                set_policy_weights_fn=set_policy_weights)
            print(f"Exploitability: {exploitability}")
def init_static_policy_distribution_after_trainer_init_callback(trainer):
    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=full_experiment_name,
        storage_client=trainer.storage_client,
        minio_bucket_name=BUCKET_NAME)

    logger.info("Initializing trainer lock server interface")
    trainer.lock_server_interface = LockServerInterface(
        server_host=LOCK_SERVER_HOST,
        port=LOCK_SERVER_PORT,
        worker_id=full_experiment_name)

    orig_selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
        manager_interface=trainer.manager_interface,
        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
        accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
        accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
        add_payoff_matrix_noise_std_dev=0.0,
        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

    if orig_selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        selection_probs = None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Original Selection Probs: {orig_selection_probs}")

        # Try to reserve one policy key to leave out of this learner's metanash.
        policy_key_to_leave_out = get_unreserved_policy_key_with_priorities(
            lock_server_interface=trainer.lock_server_interface,
            policy_keys=payoff_table.get_ordered_keys_in_payoff_matrix(),
            policy_priorities=orig_selection_probs)

        if policy_key_to_leave_out is None:
            selection_probs = orig_selection_probs
            print("No policy keys available to reserve so using unaltered selection probs")
        else:
            chosen_policy_selection_prob = orig_selection_probs[
                payoff_table.get_policy_spec_for_key(
                    policy_key_to_leave_out).get_payoff_matrix_index()]
            print(f"\n\nLeaving out {policy_key_to_leave_out}\n"
                  f"(Had selection prob of {chosen_policy_selection_prob})\n\n")
            selection_probs = get_fp_metanash_for_payoff_table(
                payoff_table=payoff_table,
                fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
                accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
                accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
                add_payoff_matrix_noise_std_dev=0.0,
                leave_out_indexes=[payoff_table.get_policy_spec_for_key(
                    policy_key_to_leave_out).get_payoff_matrix_index()],
                mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)
            print(f"Subset Selection Probs: {selection_probs}")

    if selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Policy selection probs: {selection_probs}")

    payoff_table_dill_str = dill.dumps(payoff_table)

    def worker_set_static_policy_distribution(worker):
        worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
        worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
        worker.policy_map[STATIC_POLICY].current_policy_key = None

    trainer.workers.foreach_worker(worker_set_static_policy_distribution)
SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS = POKER_SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS
SUBMISSION_THRESHOLD_STEPS_START = POKER_SUBMISSION_THRESHOLD_STEPS_START
SUBMISSION_MIN_TIMESTEPS = POKER_SUBMISSION_MIN_TIMESTEPS
SUBMISSION_MAX_TIMESTEPS = POKER_SUBMISSION_MAX_TIMESTEPS

CLOUD_PREFIX = os.getenv("CLOUD_PREFIX", "")

if __name__ == "__main__":
    expected_payoff_matrix_size = 0
    while True:
        logging.basicConfig(level=logging.DEBUG)
        logger.info("\n\n\n\n\n__________________________________________\n"
                    f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                    f"__________________________________________\n\n\n\n\n")

        storage_client = connect_storage_client()

        size_checker = ConsoleManagerInterface(
            server_host=MANAGER_SERVER_HOST,
            port=MANAGER_PORT,
            worker_id=f"size_checker_{gethostname()}_pid_{os.getpid()}",
            storage_client=storage_client,
            minio_bucket_name=BUCKET_NAME)

        while True:
            current_payoff_matrix_size = size_checker.get_size_of_current_payoff_table()
            if current_payoff_matrix_size < expected_payoff_matrix_size:
                logger.info(f"waiting for payoff matrix to reach size "
                            f"{expected_payoff_matrix_size} "
                            f"(currently {current_payoff_matrix_size})...")
def perform_eval_matchups_as_they_are_available(i):
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.DEBUG)

    # if os.getenv("EVALUATOR_USE_GPU") == 'true':
    #     os.environ['CUDA_VISIBLE_DEVICES'] = str(i % len(''.join(i for i in os.environ['CUDA_VISIBLE_DEVICES'] if i.isdigit())))

    storage_client = connect_storage_client()
    worker_id = f"evaluator_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"
    manager_interface = EvaluatorManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=worker_id,
        storage_client=storage_client,
        minio_bucket_name=BUCKET_NAME,
        minio_local_dir=DEFAULT_LOCAL_SAVE_PATH)
    logger.info(f"Started worker '{worker_id}'")

    env = ENV_CLASS(env_config=POKER_ENV_CONFIG)

    while True:
        matchup = manager_interface.get_eval_matchup(infinite_retry_on_error=True)
        if matchup is None:
            # No matchups available right now; wait a bit and try again.
            time.sleep(WAIT_SECONDS_BEFORE_TRYING_AGAIN_IF_NO_MATCHUPS)
            continue

        logger.info(f"[{worker_id}] Evaluating Matchup:\n{pretty_print(matchup)}")

        as_policy: PolicySpec = matchup['as_policy']
        against_policy: PolicySpec = matchup['against_policy']
        num_games_to_play = matchup['num_games']

        get_as_policy_fn = make_get_policy_fn(
            model_weights_object_key=as_policy.key,
            model_config_object_key=as_policy.config_key,
            policy_name=as_policy.key,
            policy_class_name=as_policy.class_name,
            storage_client=storage_client,
            minio_bucket_name=BUCKET_NAME,
            download_lock=download_lock,
            manual_config=None)
        get_against_policy_fn = make_get_policy_fn(
            model_weights_object_key=against_policy.key,
            model_config_object_key=against_policy.config_key,
            policy_name=against_policy.key,
            policy_class_name=against_policy.class_name,
            storage_client=storage_client,
            minio_bucket_name=BUCKET_NAME,
            download_lock=download_lock,
            manual_config=None)

        as_policy_payoff, tie_percentage = eval_policy_matchup(
            get_policy_fn_a=get_as_policy_fn,
            get_policy_fn_b=get_against_policy_fn,
            env=env,
            stratego_env_config=POKER_ENV_CONFIG,
            games_per_matchup=num_games_to_play)

        logger.info(f"\n\nFinal Result for {as_policy.key}\nvs\n"
                    f"{against_policy.key}\n{as_policy_payoff}\n\n")
        try:
            manager_interface.submit_eval_matchup_result(
                as_policy_key=as_policy.key,
                against_policy_key=against_policy.key,
                as_policy_avg_payoff=as_policy_payoff,
                games_played=num_games_to_play,
                infinite_retry_on_error=True)
        except FalseConfirmationError as err:
            logger.warning(f"[{worker_id}] Got False confirmation from manager:\n{err}")
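# Hypothetical launcher sketch (assumed, not from the original file): the index
# argument and the shared download_lock suggest one evaluator loop per process;
# NUM_EVAL_WORKERS is an assumed constant.
#
# from multiprocessing import Pool
# if __name__ == "__main__":
#     with Pool(NUM_EVAL_WORKERS) as pool:
#         pool.map(perform_eval_matchups_as_they_are_available, range(NUM_EVAL_WORKERS))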
def stop_and_submit_if_not_improving_on_train_result_callback(params):
    trainer = params['trainer']
    result = params['result']
    result['stop_signal'] = False

    should_submit = False
    submit_reason = None

    if not hasattr(trainer, 'previous_threshold_check_reward'):
        trainer.previous_threshold_check_reward = -100.0
        trainer.next_threshold_check_timesteps = \
            SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS + SUBMISSION_THRESHOLD_STEPS_START
        print(f"first threshold check at {trainer.next_threshold_check_timesteps} timesteps")

    if result['timesteps_total'] >= SUBMISSION_THRESHOLD_STEPS_START and \
            SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS is not None and \
            SUBMISSION_IMPROVEMENT_THRESHOLD_REWARD is not None:
        if result['timesteps_total'] >= trainer.next_threshold_check_timesteps:
            trainer.next_threshold_check_timesteps = max(
                trainer.next_threshold_check_timesteps + SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS,
                result['timesteps_total'] + 1)

            target_reward = trainer.previous_threshold_check_reward + SUBMISSION_IMPROVEMENT_THRESHOLD_REWARD
            result['target_reward'] = target_reward
            measured_reward = result['policy_reward_mean'][TRAIN_POLICY]
            print(f"{result['timesteps_total']} timesteps: {TRAIN_POLICY} reward: "
                  f"{measured_reward}, target reward: {target_reward}")

            if measured_reward < target_reward and \
                    (SUBMISSION_MIN_TIMESTEPS is None or
                     result['timesteps_total'] >= SUBMISSION_MIN_TIMESTEPS):
                should_submit = True
                submit_reason = f"plateaued at {measured_reward} reward"
                print(f"{result['timesteps_total']} timesteps: {TRAIN_POLICY} "
                      f"didn't reach target reward. Submitting policy.")
            else:
                print(f"next threshold check at {trainer.next_threshold_check_timesteps} timesteps")

            trainer.previous_threshold_check_reward = measured_reward

    if SUBMISSION_MAX_TIMESTEPS is not None and \
            result['timesteps_total'] >= SUBMISSION_MAX_TIMESTEPS:
        should_submit = True
        submit_reason = f"hit max timesteps of {SUBMISSION_MAX_TIMESTEPS}"
        print("Trainer hit max timesteps. Submitting policy.")

    if should_submit:
        assert submit_reason is not None
        result['stop_signal'] = True
        local_train_policy = trainer.workers.local_worker().policy_map[TRAIN_POLICY]

        tags = [*SUBMISSION_POLICY_TAGS,
                submit_reason,
                f"timesteps: {result['timesteps_total']}",
                f"episodes: {result['episodes_total']}"]
        if hasattr(local_train_policy, "init_tag"):
            tags.append(local_train_policy.init_tag)

        checkpoints_dir = os.path.join(experiment_save_dir, "policy_submissions")
        checkpoint_name = f"{datetime_str()}_iter_{result['training_iteration']}.dill"
        checkpoint_save_path = os.path.join(checkpoints_dir, checkpoint_name)
        local_train_policy.save_model_weights(save_file_path=checkpoint_save_path,
                                              remove_scope_prefix=TRAIN_POLICY)
        policy_key = os.path.join(base_experiment_name, full_experiment_name,
                                  "policy_submissions", checkpoint_name)
        storage_client = connect_storage_client()
        upload_file(storage_client=storage_client,
                    bucket_name=BUCKET_NAME,
                    object_key=policy_key,
                    local_source_path=checkpoint_save_path)
        trainer.manager_interface.submit_new_policy_for_population(
            policy_weights_key=policy_key,
            policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY,
            policy_class_name=TRAIN_POLICY_CLASS.__name__,
            policy_tags=tags)
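# Hypothetical wiring sketch (legacy RLlib callbacks-dict API assumed): the
# `result` dict mutated above is the same one RLlib passes to "on_train_result",
# so a driver loop can watch result['stop_signal'] to decide when to stop training.
#
# trainer_config["callbacks"]["on_train_result"] = \
#     stop_and_submit_if_not_improving_on_train_result_callback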
def submit_occasionally_on_train_result_callback(params):
    trainer = params['trainer']
    result = params['result']

    should_submit = False
    submit_reason = None

    if not hasattr(trainer, 'next_submit'):
        trainer.next_submit = SUBMISSION_IMPROVEMENT_THRESHOLD_PER_STEPS + SUBMISSION_THRESHOLD_STEPS_START

    if result['timesteps_total'] >= trainer.next_submit:
        trainer.next_submit = max(
            trainer.next_submit + SUBMISSION_IMPROVEMENT_THRESHOLD_PER_STEPS + SUBMISSION_THRESHOLD_STEPS_START,
            result['timesteps_total'] + 1)

        if SUBMISSION_MIN_STEPS is None or result['timesteps_total'] >= SUBMISSION_MIN_STEPS:
            should_submit = True
            submit_reason = "periodic_checkpoint"
            print(colored(
                f"{result['timesteps_total']} steps: submitting {TRAIN_POLICY} as a periodic checkpoint.",
                "white"))
    else:
        print(colored(f"next submit at {trainer.next_submit} steps", "white"))

    if should_submit:
        assert submit_reason is not None
        local_train_policy = trainer.workers.local_worker().policy_map[TRAIN_POLICY]

        tags = [*SUBMISSION_POLICY_TAGS,
                submit_reason,
                f"timesteps: {result['timesteps_total']}",
                f"episodes: {result['episodes_total']}",
                f"iter: {result['training_iteration']}"]
        if hasattr(local_train_policy, "init_tag"):
            tags.append(local_train_policy.init_tag)

        checkpoints_dir = os.path.join(experiment_save_dir, "policy_submissions")
        checkpoint_name = f"{datetime_str()}_iter_{result['training_iteration']}.dill"
        checkpoint_save_path = os.path.join(checkpoints_dir, checkpoint_name)
        local_train_policy.save_model_weights(save_file_path=checkpoint_save_path,
                                              remove_scope_prefix=TRAIN_POLICY)
        policy_key = os.path.join(base_experiment_name, full_experiment_name,
                                  "policy_submissions", checkpoint_name)
        storage_client = connect_storage_client()
        upload_file(storage_client=storage_client,
                    bucket_name=BUCKET_NAME,
                    object_key=policy_key,
                    local_source_path=checkpoint_save_path)
        trainer.manager_interface.submit_new_policy_for_population(
            policy_weights_key=policy_key,
            policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY,
            policy_class_name=TRAIN_POLICY_CLASS.__name__,
            policy_tags=tags)