act_space = temp_env.action_space

trainer_config = with_updates(
    base_dict=POKER_TRAINER_BASE_CONFIG,
    updates_dict={
        "multiagent": {
            "policies": {
                TRAIN_POLICY: (TRAIN_POLICY_CLASS, obs_space, act_space, {
                    'model': train_model_config,
                }),
                STATIC_POLICY: (STATIC_POLICY_CLASS, obs_space, act_space, {
                    'model': static_model_config,
                }),
            },
            "policy_mapping_fn": train_policy_mapping_fn,
            "policies_to_train": [TRAIN_POLICY],
        },
        "callbacks_after_trainer_init": [
            init_static_policy_distribution_after_trainer_init_callback,
        ],
        "callbacks": {
            "on_train_result": stop_and_submit_if_not_improving_on_train_result_callback,
            'on_episode_start': sample_new_static_policy_weights_for_each_worker_on_episode_start,
        },
    })

# save running script to file
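# `with_updates` is used throughout these configs but is not defined in this
# snippet. A minimal sketch, assuming it returns a copy of `base_dict` with
# `updates_dict` recursively merged on top (nested dicts merged key-by-key,
# everything else overwritten); the actual helper in the repo may differ.
import copy

def with_updates(base_dict, updates_dict):
    merged = copy.deepcopy(base_dict)
    for key, value in updates_dict.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            # Merge nested dicts instead of replacing them wholesale.
            merged[key] = with_updates(merged[key], value)
        else:
            merged[key] = value
    return merged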
"INFO", "metrics_smoothing_episodes": 10000, "memory_per_worker": 7019430400, "num_envs_per_worker": 1, "num_workers": num_workers, "num_gpus_per_worker": 0, "env": POKER_ENV, "env_config": with_updates(base_dict=poker_env_config, updates_dict={ 'num_envs': 64, }), "multiagent": { "policies": { TRAIN_POLICY: (SACDiscreteTFPolicy, obs_space, act_space, { 'model': model_config, }), STATIC_POLICY: (SACDiscreteTFPolicy, obs_space, act_space, { 'model': model_config, }), }, "policy_mapping_fn": train_policy_mapping_fn, "policies_to_train": [TRAIN_POLICY], }, "callbacks_after_trainer_init": [ lambda trainer: trainer.save_policy_model_configs_to_json(),
trainer_config = with_updates(
    base_dict=POKER_TRAINER_BASE_CONFIG,
    updates_dict={
        "multiagent": {
            "policies": {
                TRAIN_POLICY: (TRAIN_POLICY_CLASS, obs_space, act_space, {
                    'model': train_model_config,
                }),
                STATIC_POLICY: (STATIC_POLICY_CLASS, obs_space, act_space, {
                    'model': static_model_config,
                }),
            },
            "policy_mapping_fn": train_policy_mapping_fn,
            "policies_to_train": [TRAIN_POLICY],
        },
        "callbacks_after_trainer_init": [
            claim_new_active_policy_after_trainer_init_callback,
            # evo_update,
        ],
        # "callbacks_after_optim_step": [
        #     evo_update,
        # ],
        "callbacks": {
            "on_train_result": all_on_train_result_callbacks,
            'on_episode_start': sample_new_static_policy_weights_for_each_worker_on_episode_start,
        },
    })
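# `train_policy_mapping_fn` is referenced by every config above but not shown.
# A hypothetical sketch, assuming a two-seat poker env with agent ids 0 and 1,
# where one seat uses the learning policy and the other is played by the
# static (frozen) policy; the real mapping in the repo may differ, e.g. it may
# randomize which seat the learner takes each episode.
def train_policy_mapping_fn(agent_id):
    return TRAIN_POLICY if agent_id == 0 else STATIC_POLICY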
if POKER_GAME_VERSION == LEDUC_POKER:
    POKER_ENV_CONFIG = {
        'version': POKER_GAME_VERSION,
    }
    SELECTED_CONFIG_KEY = POKER_ARCH1_MODEL_CONFIG_KEY
    ENV_CLASS = PokerMultiAgentEnv
    POKER_TRAINER_BASE_CONFIG = {
        "log_level": "DEBUG",
        "metrics_smoothing_episodes": 10000,
        "memory_per_worker": 1019430400,
        "num_envs_per_worker": 1,
        "num_workers": 2,
        "num_gpus_per_worker": 0.0,
        "env": POKER_ENV,
        "env_config": with_updates(base_dict=POKER_ENV_CONFIG, updates_dict={
            'num_envs': 1,
        }),
        "buffer_size": int(20000),
        "learning_starts": 10000,
        "tau": 0.01,
        "gamma": 1.0,
        "train_batch_size": 1024,
        "optimization": {
            "actor_learning_rate": 0.01,
            "critic_learning_rate": 0.01,
            "entropy_learning_rate": 0.01,
        },
        "max_entropy_target_proportion": 0.0,
        "batch_mode": 'complete_episodes',
        "num_gpus": 0,
"DEBUG", "metrics_smoothing_episodes": 10000, "memory_per_worker": 1019430400, "num_envs_per_worker": 1, "num_workers": 2, "num_gpus_per_worker": 0.0, "env": POKER_ENV, "env_config": with_updates(base_dict=POKER_ENV_CONFIG, updates_dict={ 'num_envs': 1, }), "buffer_size": int(20000), "learning_starts": 10000, "tau": 0.01, "gamma": 1.0, "train_batch_size": 1024, "optimization": { "actor_learning_rate": 0.01, "critic_learning_rate": 0.01, "entropy_learning_rate": 0.01,