def test_add_RewardUncertaintyEnvClassWrapper():
    max_steps, grid_size = 20, 3
    n_steps = int(max_steps * 8.25)
    reward_uncertainty_mean, reward_uncertainty_std = 10, 1
    MyCoinGame = add_RewardUncertaintyEnvClassWrapper(
        CoinGame, reward_uncertainty_std, reward_uncertainty_mean)
    MyAsymCoinGame = add_RewardUncertaintyEnvClassWrapper(
        AsymCoinGame, reward_uncertainty_std, reward_uncertainty_mean)
    coin_game = init_env(max_steps, MyCoinGame, grid_size)
    asymm_coin_game = init_env(max_steps, MyAsymCoinGame, grid_size)

    all_rewards = []
    for env in [coin_game, asymm_coin_game]:
        obs = env.reset()
        step_i = 0
        for _ in range(n_steps):
            step_i += 1
            actions = {
                policy_id: random.randint(0, env.NUM_ACTIONS - 1)
                for policy_id in env.players_ids
            }
            obs, reward, done, info = env.step(actions)
            print("reward", reward)
            all_rewards.append(reward[env.player_red_id])
            all_rewards.append(reward[env.player_blue_id])

            if done["__all__"]:
                obs = env.reset()
                step_i = 0

    # The sampled rewards should match the injected Gaussian noise
    # (mean ~ reward_uncertainty_mean, std ~ reward_uncertainty_std).
    assert np.array(all_rewards).mean() > reward_uncertainty_mean - 1.0
    assert np.array(all_rewards).mean() < reward_uncertainty_mean + 1.0
    assert np.array(all_rewards).std() > reward_uncertainty_std - 0.1
    assert np.array(all_rewards).std() < reward_uncertainty_std + 0.1
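

# Illustrative sketch only (not the actual marltoolbox implementation): the
# assertions above only hold if add_RewardUncertaintyEnvClassWrapper perturbs
# every reward with Gaussian noise of the given mean and std. A minimal
# version of such a wrapper, assuming a multi-agent env whose step() returns
# a reward dict keyed by player id and that numpy is imported as np, could
# look like this.
def _sketch_reward_uncertainty_wrapper(
        env_class, reward_uncertainty_std, reward_uncertainty_mean=0.0):
    class NoisyRewardEnv(env_class):
        def step(self, actions):
            obs, rewards, done, info = super().step(actions)
            # Add i.i.d. Gaussian noise to each player's reward.
            rewards = {
                player_id: r + np.random.normal(
                    loc=reward_uncertainty_mean,
                    scale=reward_uncertainty_std)
                for player_id, r in rewards.items()
            }
            return obs, rewards, done, info

    return NoisyRewardEnv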
def modify_hyperparams_for_the_selected_env(hp):
    hp["plot_keys"] = (
        amTFT.PLOT_KEYS + aggregate_and_plot_tensorboard_data.PLOT_KEYS
    )
    hp["plot_assemblage_tags"] = (
        amTFT.PLOT_ASSEMBLAGE_TAGS
        + aggregate_and_plot_tensorboard_data.PLOT_ASSEMBLAGE_TAGS
    )

    mul_temp = 1.0

    hp["punishment_multiplier"] = 3.0
    hp["buf_frac"] = 0.125
    hp["training_intensity"] = 10
    # hp["rollout_length"] = 40
    # hp["n_rollout_replicas"] = 20
    hp["rollout_length"] = 4
    hp["n_rollout_replicas"] = 5

    if "CoinGame" in hp["env_name"]:
        hp["plot_keys"] += vectorized_coin_game.PLOT_KEYS
        hp["plot_assemblage_tags"] += vectorized_coin_game.PLOT_ASSEMBLAGE_TAGS

        hp["n_steps_per_epi"] = 20 if hp["debug"] else 100
        hp["n_epi"] = 10 if hp["debug"] else 4000
        hp["base_lr"] = 0.1
        hp["bs_epi_mul"] = 1
        hp["both_players_can_pick_the_same_coin"] = False
        hp["sgd_momentum"] = 0.9

        hp["lambda"] = 0.96
        hp["alpha"] = 0.0
        hp["beta"] = 0.5

        hp["debit_threshold"] = 30.0
        hp["jitter"] = 0.02
        hp["filter_utilitarian"] = False

        hp["target_network_update_freq"] = 100 * hp["n_steps_per_epi"]
        hp["last_exploration_temp_value"] = 0.03 * mul_temp

        hp["temperature_schedule"] = PiecewiseSchedule(
            endpoints=[
                (0, 2.0 * mul_temp),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.20),
                    0.5 * mul_temp,
                ),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.60),
                    hp["last_exploration_temp_value"],
                ),
            ],
            outside_value=hp["last_exploration_temp_value"],
            framework="torch",
        )

        if "AsymCoinGame" in hp["env_name"]:
            hp["x_limits"] = (-0.5, 3.0)
            hp["y_limits"] = (-1.1, 0.6)
            hp["env_class"] = vectorized_coin_game.AsymVectorizedCoinGame
        elif "MixedMotiveCoinGame" in hp["env_name"]:
            if "SSDMixedMotiveCoinGame" in hp["env_name"]:
                hp["debit_threshold"] = 3.0
                hp["x_limits"] = (-0.25, 1.0)
                hp["y_limits"] = (-0.25, 1.5)
                hp["env_class"] = \
                    ssd_mixed_motive_coin_game.SSDMixedMotiveCoinGame
            else:
                hp["x_limits"] = (-2.0, 2.0)
                hp["y_limits"] = (-0.5, 3.0)
                hp["env_class"] = \
                    vectorized_mixed_motive_coin_game.VectMixedMotiveCG
            hp["both_players_can_pick_the_same_coin"] = True
        else:
            hp["x_limits"] = (-0.5, 0.6)
            hp["y_limits"] = (-0.5, 0.6)
            hp["env_class"] = vectorized_coin_game.VectorizedCoinGame
    else:
        hp["plot_keys"] += matrix_sequential_social_dilemma.PLOT_KEYS
        hp["plot_assemblage_tags"] += \
            matrix_sequential_social_dilemma.PLOT_ASSEMBLAGE_TAGS

        hp["base_lr"] = 0.03
        hp["bs_epi_mul"] = 1
        hp["n_steps_per_epi"] = 20
        hp["n_epi"] = 10 if hp["debug"] else 800

        hp["lambda"] = 0.96
        hp["alpha"] = 0.0
        hp["beta"] = 1.0
        hp["sgd_momentum"] = 0.0

        hp["debit_threshold"] = 10.0

        hp["target_network_update_freq"] = 30 * hp["n_steps_per_epi"]
        hp["last_exploration_temp_value"] = 0.1 * mul_temp

        hp["temperature_schedule"] = PiecewiseSchedule(
            endpoints=[
                (0, 2.0 * mul_temp),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33),
                    0.5 * mul_temp,
                ),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66),
                    hp["last_exploration_temp_value"],
                ),
            ],
            outside_value=hp["last_exploration_temp_value"],
            framework="torch",
        )

        if "IteratedPrisonersDilemma" in hp["env_name"]:
            hp["filter_utilitarian"] = False
            hp["x_limits"] = (-3.5, 0.5)
            hp["y_limits"] = (-3.5, 0.5)
            hp["utilitarian_filtering_threshold"] = -2.5
            hp["env_class"] = \
                matrix_sequential_social_dilemma.IteratedPrisonersDilemma
        elif "IteratedAsymBoS" in hp["env_name"]:
            hp["x_limits"] = (-0.1, 4.1)
            hp["y_limits"] = (-0.1, 4.1)
            hp["utilitarian_filtering_threshold"] = 3.2
            hp["env_class"] = matrix_sequential_social_dilemma.IteratedAsymBoS
        else:
            raise NotImplementedError(f'hp["env_name"]: {hp["env_name"]}')

    hp["lr_schedule"] = [
        (0, 0.0),
        (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.05), hp["base_lr"]),
        (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9),
    ]

    hp["plot_axis_scale_multipliers"] = (
        (1 / hp["n_steps_per_epi"]),  # for x axis
        (1 / hp["n_steps_per_epi"]),  # for y axis
    )

    hp["env_class"] = add_RewardUncertaintyEnvClassWrapper(
        env_class=hp["env_class"],
        reward_uncertainty_std=hp["reward_uncertainty"],
    )

    return hp
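

# Hedged usage sketch: the full set of keys in `hp` is defined by the
# surrounding training script, but before filling in the rest this function
# only reads "env_name", "debug" and "reward_uncertainty", so a minimal call
# could look like this ("IteratedPrisonersDilemma" is just one of the
# supported env names).
def _example_modify_hyperparams():
    hp = {
        "env_name": "IteratedPrisonersDilemma",
        "debug": True,
        "reward_uncertainty": 0.1,
    }
    hp = modify_hyperparams_for_the_selected_env(hp)
    # The returned dict now contains the (reward-noise wrapped) env class,
    # the exploration temperature schedule and the learning rate schedule.
    return hp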
def get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],  # 4000 steps in 200 epi
    }

    env_config = {
        "players_ids": ["player_row", "player_col"],
        "max_steps": hp["n_steps_per_epi"],
    }

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        optimizer_fn=sgd_optimizer_dqn,
        stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))

    ltft_config = merge_dicts(
        LTFT_DEFAULT_CONFIG_UPDATE,
        {
            "sgd_momentum": 0.9,
            'nested_policies': [
                # Here the trainer needs to be a DQNTrainer to provide the
                # config for the 3 DQNTorchPolicy.
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": SPLTorchPolicy.with_updates(
                    optimizer_fn=sgd_optimizer_spl),
                 "config_update": {
                     "learn_action": True,
                     "learn_reward": False,
                     "sgd_momentum": 0.75,
                     "explore": False,
                     "timesteps_per_iteration": hp["n_steps_per_epi"],
                     # === Optimization ===
                     # Learning rate for adam optimizer
                     "lr": hp["base_lr"] * hp["spl_lr_mul"],
                     # Learning rate schedule
                     "lr_schedule": [
                         (0, hp["base_lr"] * hp["spl_lr_mul"]),
                         (int(hp["n_steps_per_epi"] * hp["n_epi"]),
                          hp["base_lr"] / 1e9)],
                     "loss_fn": torch.nn.CrossEntropyLoss(
                         weight=None,
                         size_average=None,
                         ignore_index=-100,
                         reduce=None,
                         reduction='mean')
                 }},
            ],
        }
    )

    MyUncertainIPD = add_RewardUncertaintyEnvClassWrapper(
        IteratedPrisonersDilemma,
        reward_uncertainty_std=0.1)

    rllib_config = {
        "env": MyUncertainIPD,
        "env_config": env_config,

        "multiagent": {
            "policies": {
                "player_row": (
                    # The default policy is DQNTorchPolicy defined in
                    # DQNTrainer, but we overwrite it to use the LTFT policy.
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
                "player_col": (
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]),
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each the advantage branch and the value branch
        # in a dueling architecture.
        "hiddens": [4],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [4, 2],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },
        "gamma": 0.5,
        "min_iter_time_s": 0.33,
        "seed": tune.grid_search(hp["seeds"]),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [(0, hp["base_lr"]),
                        (int(hp["n_steps_per_epi"] * hp["n_epi"]),
                         hp["base_lr"] / 1e9)],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the
            # name (str) of any class present in the `rllib.utils.exploration`
            # package. You can also provide the python class directly or the
            # full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": PiecewiseSchedule(
                endpoints=[
                    (0, 1.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.75), 0.1)],
                outside_value=0.1,
                framework="torch"),
        },

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFT supports only 1 worker, otherwise it would mix the
        # trajectories of several opponents.
        "num_workers": 0,
        # LTFT supports only 1 env per worker, otherwise several episodes
        # would be played at the same time.
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",

        # # === Debug Settings ===
        # # Whether to write episode stats and videos to the agent log dir.
        # # This is typically located in ~/ray_results.
        # "monitor": True,
        # # Set the ray.rllib.* log level for the agent process and its
        # # workers. Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG
        # # level will also periodically print out summaries of relevant
        # # internal dataflow (this is also printed out once at startup at
        # # the INFO level). When using the `rllib train` command, you can
        # # also use the `-v` and `-vv` flags as shorthand for INFO and DEBUG.
        # "log_level": "INFO",
        # Callbacks that will be run during various phases of training. See
        # the `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py` for more usage
        # information.
        # "callbacks": DefaultCallbacks,
        "callbacks": miscellaneous.merge_callbacks(
            LTFTCallbacks,
            log.get_logging_callbacks_class()),
        # # Whether to attempt to continue training if a worker crashes. The
        # # number of currently healthy workers is reported as the
        # # "num_healthy_workers" metric.
        # "ignore_worker_failures": False,
        # # Log system resource metrics to results. This requires `psutil` to
        # # be installed for sys stats, and `gputil` for GPU metrics.
        # "log_sys_usage": True,
        # # Use fake (infinite speed) sampler. For testing only.
        # "fake_sampler": False,
    }

    return rllib_config, env_config, stop
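

# Hedged usage sketch: one plausible way to launch this experiment with Ray
# Tune. The choice of DQNTrainer follows the nested-policies comment above,
# but the trainer class, the local imports and the "exp_name" key are
# assumptions, not taken from this file.
def _example_run_ltft_ipd(hp):
    import ray
    from ray import tune
    from ray.rllib.agents.dqn import DQNTrainer

    ray.init(num_cpus=1, num_gpus=0)
    rllib_config, env_config, stop = get_rllib_config(hp)
    tune.run(
        DQNTrainer,
        config=rllib_config,
        stop=stop,
        checkpoint_at_end=True,
        name=hp.get("exp_name", "LTFT_IPD"),
    )
    ray.shutdown()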
def _get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],
    }

    env_config = _get_env_config(hp)

    my_uncertain_env_class = add_RewardUncertaintyEnvClassWrapper(
        hp["env_class"],
        reward_uncertainty_std=hp["reward_uncertainty_std"])

    rllib_config = copy.deepcopy(ltft.DEFAULT_CONFIG)
    rllib_config.update({
        "env": my_uncertain_env_class,
        "env_config": env_config,

        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    None,
                    hp["env_class"]({}).OBSERVATION_SPACE,
                    hp["env_class"].ACTION_SPACE,
                    {},
                ),
                env_config["players_ids"][1]: (
                    None,
                    hp["env_class"]({}).OBSERVATION_SPACE,
                    hp["env_class"].ACTION_SPACE,
                    {},
                ),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
            # When replay_mode=lockstep, RLlib will replay all the agent
            # transitions at a particular timestep together in a batch.
            # This allows the policy to implement differentiable shared
            # computations between agents it controls at that timestep.
            # When replay_mode=independent, transitions are replayed
            # independently per policy.
            # "replay_mode": "lockstep",
            "observation_fn": ltft.observation_fn,
        },

        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": 30 * hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": max(
            int(hp["n_steps_per_epi"] * hp["n_epi"] * hp["buf_frac"]), 5),
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each the advantage branch and the value branch
        # in a dueling architecture.
        "hiddens": hp["hiddens"],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": hp["hiddens"],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQScheduleWtClustering,
            # Add constructor kwargs here (if any).
            "temperature_schedule": hp["temperature_schedule"],
            "clustering_distance": hp["clustering_distance"],
        },

        "gamma": hp["gamma"],

        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"]
        if hp["debug"]
        else int(hp["n_steps_per_epi"] * hp["n_epi"] / hp["log_n_points"]),
        "min_iter_time_s": 0.0,
        "seed": tune.grid_search(hp["seeds"]),

        # === Optimization ===
        "optimizer": {
            "sgd_momentum": hp["sgd_momentum"],
        },
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": hp["lr_schedule"],
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        "training_intensity": hp["training_intensity"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFTTorchPolicy supports only 1 worker, otherwise it would mix the
        # trajectories of several opponents.
        "num_workers": 0,
        # LTFTTorchPolicy supports only 1 env per worker, otherwise several
        # episodes would be played at the same time.
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",

        "logger_config": {
            "wandb": {
                "project": "LTFT",
                "group": hp["exp_name"],
                "api_key_file": os.path.join(
                    os.path.dirname(__file__), "../../../api_key_wandb"),
                "log_config": True,
            },
        },

        # === Debug Settings ===
        "log_level": "INFO",
        # Callbacks that will be run during various phases of training. See
        # the `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py` for more usage
        # information.
        "callbacks": callbacks.merge_callbacks(
            ltft.LTFTCallbacks,
            log.get_logging_callbacks_class(log_full_epi=True),
        ),
    })

    hp, rllib_config, env_config, stop = _modify_config_for_coin_game(
        hp, rllib_config, env_config, stop)

    nested_policies_config = rllib_config["nested_policies"]
    nested_spl_policy_config = nested_policies_config[3]["config_update"]
    nested_spl_policy_config["train_batch_size"] = int(
        hp["n_steps_per_epi"] * hp["bs_epi_mul_spl"])
    rllib_config["nested_policies"] = nested_policies_config

    return rllib_config, env_config, stop
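

# Hedged sketch of the kind of dict _get_env_config() is expected to return,
# inferred only from how env_config is consumed above (indexing into
# "players_ids") and from the player ids used elsewhere in this codebase;
# the exact keys, ids and grid_size handling are assumptions.
def _sketch_get_env_config(hp):
    if "CoinGame" in hp["env_name"]:
        env_config = {
            "players_ids": ["player_red", "player_blue"],
            "max_steps": hp["n_steps_per_epi"],
            "grid_size": 3,
        }
    else:
        env_config = {
            "players_ids": ["player_row", "player_col"],
            "max_steps": hp["n_steps_per_epi"],
        }
    return env_config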