Example 1
def modify_conf_for_lvl1_training(hp_lvl1, env_config, rllib_config_lvl1,
                                  lvl0_checkpoints):
    lvl0_policy_idx = 1
    lvl1_policy_idx = 0

    lvl0_policy_id = env_config["players_ids"][lvl0_policy_idx]
    lvl1_policy_id = env_config["players_ids"][lvl1_policy_idx]

    # Use a simple DQN as lvl1 agent (instead of amTFT with nested DQN)
    rllib_config_lvl1["multiagent"]["policies"][lvl1_policy_id] = (
        DQNTorchPolicy, hp_lvl1["env"](env_config).OBSERVATION_SPACE,
        hp_lvl1["env"].ACTION_SPACE, {})

    rllib_config_lvl1["callbacks"] = amTFT.get_amTFTCallBacks(
        additionnal_callbacks=[
            log.get_logging_callbacks_class(),
            postprocessing.OverwriteRewardWtWelfareCallback,
            population.PopulationOfIdenticalAlgoCallBacks
        ])

    l1br_configuration_helper = lvl1_best_response.L1BRConfigurationHelper(
        rllib_config_lvl1, lvl0_policy_id, lvl1_policy_id)
    l1br_configuration_helper.define_exp(
        use_n_lvl0_agents_in_each_population=hp_lvl1["n_seeds_lvl0"] //
        hp_lvl1["n_seeds_lvl1"],
        train_n_lvl1_agents=hp_lvl1["n_seeds_lvl1"],
        lvl0_checkpoints=lvl0_checkpoints)
    rllib_config_lvl1 = (
        l1br_configuration_helper.prepare_config_for_lvl1_training())

    # rllib_config_lvl1["multiagent"]["policies"][lvl0_policy_id][3]["explore"] = False
    rllib_config_lvl1["multiagent"]["policies"][lvl0_policy_id][3][
        "working_state"] = "eval_amtft"
    return rllib_config_lvl1
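
The helper above only rewrites the config; a minimal sketch of how it might be wired into a level-1 run follows. The checkpoint list, the "n_epi" key, the stop criterion, and the choice of DQNTrainer are assumptions for illustration, not part of the original example.

# Hypothetical wiring (sketch): hp_lvl1, env_config, rllib_config_lvl1 and
# lvl0_checkpoints are assumed to be produced by the level-0 training phase.
import ray
from ray import tune
from ray.rllib.agents.dqn import DQNTrainer  # assumed trainer choice

rllib_config_lvl1 = modify_conf_for_lvl1_training(
    hp_lvl1, env_config, rllib_config_lvl1, lvl0_checkpoints)

ray.init()
tune.run(
    DQNTrainer,
    config=rllib_config_lvl1,
    stop={"episodes_total": hp_lvl1["n_epi"]},  # "n_epi" is an assumed key
    checkpoint_at_end=True)
ray.shutdown()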
Example 2
def main(debug):
    ray.init(num_cpus=os.cpu_count(), num_gpus=0)

    stop = {"episodes_total": 10 if debug else 400}

    env_config = {
        "max_steps": 10,
        "players_ids": ["player_row", "player_col"],
    }

    policies = {
        env_config["players_ids"][0]:
        (None, IteratedBoSAndPD.OBSERVATION_SPACE,
         IteratedBoSAndPD.ACTION_SPACE, {}),
        env_config["players_ids"][1]:
        (None, IteratedBoSAndPD.OBSERVATION_SPACE,
         IteratedBoSAndPD.ACTION_SPACE, {})
    }

    rllib_config = {
        "env":
        IteratedBoSAndPD,
        "env_config":
        env_config,
        "num_gpus":
        0,
        "num_workers":
        1,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": (lambda agent_id: agent_id),
        },
        "framework":
        "torch",
        "gamma":
        0.5,
        "callbacks":
        miscellaneous.merge_callbacks(
            log.get_logging_callbacks_class(),
            postprocessing.OverwriteRewardWtWelfareCallback),
    }

    MyPGTorchPolicy = PGTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.get_postprocessing_welfare_function(
                add_inequity_aversion_welfare=True,
                inequity_aversion_beta=1.0,
                inequity_aversion_alpha=0.0,
                inequity_aversion_gamma=1.0,
                inequity_aversion_lambda=0.5),
            pg_torch_policy.post_process_advantages))
    MyPGTrainer = PGTrainer.with_updates(default_policy=MyPGTorchPolicy,
                                         get_policy_class=None)
    tune_analysis = tune.run(MyPGTrainer,
                             stop=stop,
                             checkpoint_freq=10,
                             config=rllib_config)
    ray.shutdown()
    return tune_analysis
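
A typical entry point for the example above would look like the sketch below; only the debug flag comes from the function signature, the rest is standard boilerplate.

if __name__ == "__main__":
    # Run in debug mode (10 episodes) or full mode (400 episodes).
    debug_mode = False
    main(debug_mode)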
Example 3
def main(debug, stop_iters=2000, tf=False):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("PPO_AsymCG")

    ray.init()

    stop = {
        "training_iteration": 2 if debug else stop_iters,
    }

    env_config = {
        "players_ids": ["player_red", "player_blue"],
        "max_steps": 20,
        "grid_size": 3,
        "get_additional_info": True,
    }

    rllib_config = {
        "env": AsymCoinGame,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]:
                (None, AsymCoinGame(env_config).OBSERVATION_SPACE,
                 AsymCoinGame.ACTION_SPACE, {}),
                env_config["players_ids"][1]:
                (None, AsymCoinGame(env_config).OBSERVATION_SPACE,
                 AsymCoinGame.ACTION_SPACE, {}),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        # Size of batches collected from each worker.
        "rollout_fragment_length": 20,
        # Number of timesteps collected for each SGD round. This defines the size
        # of each SGD epoch.
        "train_batch_size": 512,
        "model": {
            "dim": env_config["grid_size"],
            "conv_filters": [[16, [3, 3], 1],
                             [32, [3, 3],
                              1]]  # [Channel, [Kernel, Kernel], Stride]]
        },
        "lr": 5e-3,
        "seed": tune.grid_search(seeds),
        "callbacks": log.get_logging_callbacks_class(),
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": "tf" if tf else "torch",
    }

    tune_analysis = tune.run(PPOTrainer,
                             config=rllib_config,
                             stop=stop,
                             checkpoint_freq=0,
                             checkpoint_at_end=True,
                             name=exp_name)
    ray.shutdown()
    return tune_analysis
Example 4
def get_rllib_config(seeds, debug=False, stop_iters=200, tf=False):
    stop_config = {
        "training_iteration": 2 if debug else stop_iters,
    }

    env_config = {
        "players_ids": ["player_row", "player_col"],
        "max_steps": 20,
        "get_additional_info": True,
    }

    rllib_config = {
        "env": IteratedPrisonersDilemma,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    None,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    {}
                ),
                env_config["players_ids"][1]: (
                    None,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    {}
                ),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        "seed": tune.grid_search(seeds),
        "callbacks": log.get_logging_callbacks_class(),
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": "tf" if tf else "torch",
    }

    return rllib_config, stop_config
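
Since the policy class is left as None above, the trainer chosen at launch time supplies its default policy. The sketch below shows one way the returned pair might be consumed; the PGTrainer choice and the seed values are placeholders, not taken from the source.

# Sketch of consuming (rllib_config, stop_config); assumptions are marked.
import ray
from ray import tune
from ray.rllib.agents.pg import PGTrainer  # placeholder trainer

if __name__ == "__main__":
    ray.init()
    seeds = [0, 1]  # placeholder seed values
    rllib_config, stop_config = get_rllib_config(seeds, debug=False)
    tune.run(PGTrainer, config=rllib_config, stop=stop_config,
             checkpoint_at_end=True)
    ray.shutdown()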
Example 5
def _modify_policy_to_use_welfare(rllib_config, welfare):
    MyCoopDQNTorchPolicy = augmented_dqn.MyDQNTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.welfares_postprocessing_fn(),
            postprocess_nstep_and_prio,
        ))

    policies = rllib_config["multiagent"]["policies"]
    new_policies = {}
    for policies_id, policy_tuple in policies.items():
        new_policies[policies_id] = list(policy_tuple)
        new_policies[policies_id][0] = MyCoopDQNTorchPolicy
        if welfare == postprocessing.WELFARE_UTILITARIAN:
            new_policies[policies_id][3].update(
                {postprocessing.ADD_UTILITARIAN_WELFARE: True})
        elif welfare == postprocessing.WELFARE_INEQUITY_AVERSION:
            add_ia_w = True
            ia_alpha = 0.0
            ia_beta = 0.5
            ia_gamma = 0.96
            ia_lambda = 0.96
            inequity_aversion_parameters = (
                add_ia_w,
                ia_alpha,
                ia_beta,
                ia_gamma,
                ia_lambda,
            )
            new_policies[policies_id][3].update({
                postprocessing.ADD_INEQUITY_AVERSION_WELFARE:
                inequity_aversion_parameters
            })
    rllib_config["multiagent"]["policies"] = new_policies
    rllib_config["callbacks"] = callbacks.merge_callbacks(
        log.get_logging_callbacks_class(),
        postprocessing.OverwriteRewardWtWelfareCallback,
    )

    return rllib_config
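
The sketch below shows how the helper might be applied to an existing multiagent config; the welfare constants are the ones referenced in the function body, while the surrounding rllib_config is assumed to come from one of the other examples.

# Sketch: switch every policy of an existing config (assumed to be defined
# elsewhere) to the cooperative DQN policy with utilitarian welfare.
rllib_config = _modify_policy_to_use_welfare(
    rllib_config, welfare=postprocessing.WELFARE_UTILITARIAN)
# Alternatively, using the hard-coded inequity-aversion parameters above:
# rllib_config = _modify_policy_to_use_welfare(
#     rllib_config, welfare=postprocessing.WELFARE_INEQUITY_AVERSION)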
Example 6
def modify_conf_for_lvl1_training(hp_lvl1, env_config, rllib_config_lvl1,
                                  lvl0_checkpoints):
    lvl0_policy_idx = 1
    lvl1_policy_idx = 0

    lvl0_policy_id = env_config["players_ids"][lvl0_policy_idx]
    lvl1_policy_id = env_config["players_ids"][lvl1_policy_idx]

    # Use a simple DQN as lvl1 agent (instead of amTFT with nested DQN)
    rllib_config_lvl1["multiagent"]["policies"][lvl1_policy_id] = (
        DQNTorchPolicy,
        hp_lvl1["env_class"](env_config).OBSERVATION_SPACE,
        hp_lvl1["env_class"].ACTION_SPACE,
        {},
    )

    rllib_config_lvl1["callbacks"] = callbacks.merge_callbacks(
        amTFT.AmTFTCallbacks,
        log.get_logging_callbacks_class(log_full_epi=False,
                                        log_full_epi_interval=100),
    )

    l1br_configuration_helper = lvl1_best_response.L1BRConfigurationHelper(
        rllib_config_lvl1, lvl0_policy_id, lvl1_policy_id)
    l1br_configuration_helper.define_exp(
        use_n_lvl0_agents_in_each_population=hp_lvl1["n_seeds_lvl0"] //
        hp_lvl1["n_seeds_lvl1"],
        train_n_lvl1_agents=hp_lvl1["n_seeds_lvl1"],
        lvl0_checkpoints=lvl0_checkpoints,
    )
    rllib_config_lvl1 = (
        l1br_configuration_helper.prepare_config_for_lvl1_training())

    rllib_config_lvl1["multiagent"]["policies"][lvl0_policy_id][3][
        "working_state"] = "eval_amtft"
    return rllib_config_lvl1
Example 7
def get_rllib_config(hp: dict, lvl1_idx: list, lvl1_training: bool):
    assert lvl1_training

    tune_config, _, env_config = get_tune_config(hp=hp)
    tune_config["seed"] = 2020

    stop = {"episodes_total": hp["n_epi"]}

    after_init_fn = functools.partial(
        miscellaneous.sequence_of_fn_wt_same_args,
        function_list=[restore.after_init_load_policy_checkpoint, after_init],
    )

    def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
        return torch.optim.SGD(
            policy.q_func_vars,
            lr=policy.cur_lr,
            momentum=config["sgd_momentum"],
        )

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        stats_fn=log.augment_stats_fn_wt_additionnal_logs(build_q_stats),
        optimizer_fn=sgd_optimizer_dqn,
        after_init=after_init_fn,
    )

    if tune_config["env_class"] in (
            IteratedPrisonersDilemma,
            IteratedBoS,
            IteratedAsymChicken,
            IteratedAsymBoS,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
        })

    elif tune_config["env_class"] in (
            VectorizedCoinGame,
            AsymVectorizedCoinGame,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
            "batch_size": 1,
        })

    else:
        raise ValueError(
            f"Unsupported env_class: {tune_config['env_class']}")

    tune_config["TuneTrainerClass"] = hp["tune_class"]
    tune_config["TuneTrainerClass"] = hp["tune_class"]
    tune_config["env_config"] = env_config
    policies = {}
    for policy_idx, policy_id in enumerate(env_config["players_ids"]):
        if policy_idx not in lvl1_idx:
            policies[policy_id] = (
                policy.get_tune_policy_class(DQNTorchPolicy),
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {
                    "sgd_momentum": hp["sgd_momentum"],
                    "tune_config": tune_config,
                },
            )
        else:
            policies[policy_id] = (
                MyDQNTorchPolicy,
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {
                    "sgd_momentum": hp["sgd_momentum"]
                },
            )

    rllib_config = {
        "env":
        tune_config["env_class"],
        "env_config":
        env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration":
        hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq":
        hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size":
        int(hp["n_steps_per_epi"] * hp["n_epi"]) // 4,
        # Whether to use dueling dqn
        "dueling":
        False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [64],
        # Whether to use double dqn
        "double_q":
        True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay":
        False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [64],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },
        "gamma":
        hp["gamma"],
        "min_iter_time_s":
        3.0,
        # Can't restore stuff with search
        # "seed": hp["seed"],
        "seed":
        tune.grid_search(
            hp["lvl1_seeds"] if lvl1_training else hp["lvl0_seeds"]),
        # "evaluation_num_episodes": 100,
        # "evaluation_interval": hparams["n_epi"],
        # === Optimization ===
        # Learning rate for adam optimizer
        "lr":
        hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [
            (0, hp["base_lr"]),
            (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9),
        ],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip":
        1,
        # How many steps of the model to sample before learning starts.
        "learning_starts":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length":
        hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore":
        True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type":
            exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule":
            hp["temperature_schedule"] or PiecewiseSchedule(
                endpoints=[
                    (0, 10.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33), 1.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66), 0.1),
                ],
                outside_value=0.1,
                framework="torch",
            ),
        },
        # General config
        "framework":
        "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus":
        int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LE supports only 1 worker,
        # otherwise it would mix several opponents' trajectories
        "num_workers":
        0,
        # LE supports only 1 env per worker,
        # otherwise several episodes would be played at the same time
        "num_envs_per_worker":
        1,
        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        "callbacks":
        callbacks.merge_callbacks(
            log.get_logging_callbacks_class(), callbacks.PolicyCallbacks
            # population.PopulationOfIdenticalAlgoCallBacks
        ),
        "log_level":
        "INFO",
    }

    if "CoinGame" in hp["env_name"]:
        rllib_config["model"] = {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
        }

    return stop, env_config, rllib_config
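
A minimal sketch of launching the level-1 run with the returned objects follows; the hp dict is assumed to be built elsewhere with the keys used above, and DQNTrainer is a placeholder for whichever trainer the experiment actually uses.

# Sketch of launching training from the returned configs (assumed wiring).
import ray
from ray import tune
from ray.rllib.agents.dqn import DQNTrainer  # placeholder trainer

# hp is assumed to be built elsewhere with the keys used above.
stop, env_config, rllib_config = get_rllib_config(
    hp=hp, lvl1_idx=[0], lvl1_training=True)

ray.init()
tune_analysis = tune.run(DQNTrainer, config=rllib_config, stop=stop,
                         checkpoint_at_end=True)
ray.shutdown()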
Example 8
def _generate_eval_config(tune_hp, debug):
    rllib_hp = copy.deepcopy(tune_hp)
    rllib_hp["seed"] = 2020
    rllib_hp["num_episodes"] = 1 if debug else 100
    tune_config, stop, env_config = _get_tune_config(rllib_hp,
                                                     stop_on_epi_number=True)
    rllib_hp["env_class"] = tune_config["env_class"]

    if "CoinGame" in tune_config["env_name"]:
        env_config["batch_size"] = 1
        tune_config["TuneTrainerClass"] = train_cg_tune_class_API.LOLAPGCG
    else:
        tune_config["TuneTrainerClass"] = LOLAPGMatrice

    rllib_config_eval = {
        "env": rllib_hp["env_class"],
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    # The default policy is DQN defined in DQNTrainer
                    # but we overwrite it to use the LE policy
                    policy.get_tune_policy_class(DQNTorchPolicy),
                    rllib_hp["env_class"](env_config).OBSERVATION_SPACE,
                    rllib_hp["env_class"].ACTION_SPACE,
                    {
                        "tune_config": tune_config
                    },
                ),
                env_config["players_ids"][1]: (
                    policy.get_tune_policy_class(DQNTorchPolicy),
                    rllib_hp["env_class"](env_config).OBSERVATION_SPACE,
                    rllib_hp["env_class"].ACTION_SPACE,
                    {
                        "tune_config": tune_config
                    },
                ),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
            "policies_to_train": ["None"],
        },
        "seed": rllib_hp["seed"],
        "min_iter_time_s": 3.0,
        "callbacks": log.get_logging_callbacks_class(log_full_epi=True, ),
    }

    policies_to_load = copy.deepcopy(env_config["players_ids"])

    if "CoinGame" in rllib_hp["env_name"]:
        trainable_class = train_cg_tune_class_API.LOLAPGCG
        rllib_config_eval["model"] = {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
        }
    else:
        trainable_class = LOLAPGMatrice

    return (
        rllib_hp,
        rllib_config_eval,
        policies_to_load,
        trainable_class,
        stop,
        env_config,
    )
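
A rough sketch of consuming the returned tuple is shown below. Because "policies_to_train" is set to ["None"], no policy is updated, so the run only collects evaluation episodes. The tune_hp dict is assumed to come from the training phase, DQNTrainer is assumed because the frozen policies wrap DQNTorchPolicy, and restoring the trained weights from trainable_class checkpoints is intentionally omitted.

# Sketch of an evaluation run (assumed wiring; weight restoring omitted).
import ray
from ray import tune
from ray.rllib.agents.dqn import DQNTrainer  # assumed evaluation trainer

# tune_hp is assumed to be the hyperparameter dict of the training phase.
(rllib_hp, rllib_config_eval, policies_to_load,
 trainable_class, stop, env_config) = _generate_eval_config(tune_hp, debug=False)

ray.init()
tune.run(DQNTrainer, config=rllib_config_eval, stop=stop)
ray.shutdown()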
Example 9
def _get_rllib_configs(hp, env_class=None):
    stop_config = {
        "episodes_total": 2 if hp["debug"] else hp["n_epi"],
    }

    env_config = {
        "players_ids": ["player_red", "player_blue"],
        "max_steps": hp["n_steps_per_epi"],
        "grid_size": 3,
        "get_additional_info": True,
    }

    env_class = coin_game.CoinGame if env_class is None else env_class
    rllib_config = {
        "env": env_class,
        "env_config": env_config,

        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    augmented_dqn.MyDQNTorchPolicy,
                    env_class(env_config).OBSERVATION_SPACE,
                    env_class.ACTION_SPACE,
                    {}),
                env_config["players_ids"][1]: (
                    augmented_dqn.MyDQNTorchPolicy,
                    env_class(env_config).OBSERVATION_SPACE,
                    env_class.ACTION_SPACE,
                    {}),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===

        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": tune.sample_from(
            lambda spec: int(spec.config["env_config"]["max_steps"] * 30)),
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": tune.sample_from(
            lambda spec: int(spec.config["env_config"]["max_steps"] *
                             spec.stop["episodes_total"] * hp["buf_frac"])),
        # Whether to use dueling dqn
        "dueling": False,
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,

        "rollout_fragment_length": tune.sample_from(
            lambda spec: spec.config["env_config"]["max_steps"]),
        "training_intensity": 10,
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": tune.sample_from(
            lambda spec: int(spec.config["env_config"]["max_steps"] *
                             hp["bs_epi_mul"])),
        "batch_mode": "complete_episodes",

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            # "type": exploration.SoftQSchedule,
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": tune.sample_from(
                lambda spec: PiecewiseSchedule(
                    endpoints=[
                        (0,
                         2.0),
                        (int(spec.config["env_config"]["max_steps"] *
                             spec.stop["episodes_total"] * 0.20),
                         0.5),
                        (int(spec.config["env_config"]["max_steps"] *
                             spec.stop["episodes_total"] * 0.60),
                         hp["last_exploration_temp_value"])],
                    outside_value=hp["last_exploration_temp_value"],
                    framework="torch")),
        },

        # Convolutional model for the CoinGame grid observations.
        "model": {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]]
        },
        "gamma": 0.96,
        "optimizer": {"sgd_momentum": 0.9, },
        "lr": 0.1,
        "lr_schedule": tune.sample_from(
            lambda spec: [
                (0, 0.0),
                (int(spec.config["env_config"]["max_steps"] *
                     spec.stop["episodes_total"] * 0.05),
                 spec.config.lr),
                (int(spec.config["env_config"]["max_steps"] *
                     spec.stop["episodes_total"]),
                 spec.config.lr / 1e9)
            ]
        ),

        "seed": tune.grid_search(hp["seeds"]),
        "callbacks": log.get_logging_callbacks_class(),
        "framework": "torch",

        "logger_config": {
            "wandb": {
                "project": "DQN_CG",
                "group": hp["exp_name"],
                "api_key_file":
                    os.path.join(os.path.dirname(__file__),
                                 "../../../api_key_wandb"),
                "log_config": True
            },
        },

    }

    return rllib_config, stop_config
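
The config above declares a wandb block under "logger_config"; the sketch below shows one way to run it with Tune's legacy WandbLogger picking that block up. The DQNTrainer choice, the hp dict, and the exact Ray/Tune version providing WandbLogger are assumptions.

# Sketch of launching the run with wandb logging enabled (assumed wiring).
import ray
from ray import tune
from ray.rllib.agents.dqn import DQNTrainer  # assumed trainer
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.integration.wandb import WandbLogger  # assumed to read logger_config["wandb"]

# hp is assumed to be built elsewhere with the keys used above.
rllib_config, stop_config = _get_rllib_configs(hp)

ray.init()
tune.run(DQNTrainer, config=rllib_config, stop=stop_config,
         loggers=DEFAULT_LOGGERS + (WandbLogger,),
         name=hp["exp_name"])
ray.shutdown()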
Example 10
def get_rllib_config(hp, welfare_fn):
    stop = {
        "episodes_total": hp["n_epi"],
    }

    env_config = get_env_config(hp)
    policies = get_policies(hp, env_config, welfare_fn)

    selected_seeds = hp["seeds"][:hp["train_n_replicates"]]
    hp["seeds"] = hp["seeds"][hp["train_n_replicates"]:]

    trainer_config_update = {
        "env":
        hp["env"],
        "env_config":
        env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        "gamma":
        hp["gamma"],
        "min_iter_time_s":
        hp["min_iter_time_s"],
        "seed":
        tune.grid_search(selected_seeds),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr":
        hp["base_lr"],
        # Learning rate schedule
        "lr_schedule":
        [(0, hp["base_lr"]),
         (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip":
        1,
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length":
        hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration":
        hp["n_steps_per_epi"],

        # General config
        "framework":
        "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus":
        int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LE supports only 1 worker, otherwise it would mix
        # several opponents' trajectories
        "num_workers":
        0,
        # LE supports only 1 env per worker, otherwise several
        # episodes would be played at the same time
        "num_envs_per_worker":
        1,

        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        "callbacks":
        amTFT.get_amTFTCallBacks(additionnal_callbacks=[
            log.get_logging_callbacks_class(),
            # This only overwrites the reward used for training,
            # not the one reported in the metrics
            postprocessing.OverwriteRewardWtWelfareCallback
        ]),
        # "log_level": "INFO",
    }

    trainer_config_update.update({
        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq":
        hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size":
        int(hp["n_steps_per_epi"] * hp["n_epi"]) // 4,
        # Whether to use dueling dqn
        "dueling":
        False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens":
        hp["hiddens"],
        # Whether to use double dqn
        "double_q":
        True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay":
        False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": hp["hiddens"],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        # How many steps of the model to sample before learning starts.
        "learning_starts":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore":
        True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the name
            # (str) of any class present in the `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full location
            # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
            # EpsilonGreedy").
            "type":
            exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule":
            hp["temperature_schedule"] or PiecewiseSchedule(endpoints=[
                (0, 10.0),
                (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33), 1.0),
                (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66), 0.1)
            ],
                                                            outside_value=0.1,
                                                            framework="torch"),
        },
    })

    if hp["env"] in [coin_game.CoinGame, coin_game.AsymCoinGame]:
        trainer_config_update["model"] = {
            "dim": env_config["grid_size"],
            "conv_filters": [[16, [3, 3], 1],
                             [32, [3, 3],
                              1]],  # [Channel, [Kernel, Kernel], Stride]]
        }

    return stop, env_config, trainer_config_update
Example 11
def get_rllib_config(hp, welfare_fn, eval=False):
    stop = {
        "episodes_total": hp["n_epi"],
    }

    env_config = get_env_config(hp)
    policies = get_policies(hp, env_config, welfare_fn, eval)

    selected_seeds = hp["seeds"][:hp["train_n_replicates"]]
    hp["seeds"] = hp["seeds"][hp["train_n_replicates"]:]

    rllib_config = {
        "env":
        hp["env_class"],
        "env_config":
        env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
            # When replay_mode=lockstep, RLlib will replay all the agent
            # transitions at a particular timestep together in a batch.
            # This allows the policy to implement differentiable shared
            # computations between agents it controls at that timestep.
            # When replay_mode=independent,
            # transitions are replayed independently per policy.
            # "replay_mode": "lockstep",
            "observation_fn": amTFT.observation_fn,
        },
        "gamma":
        hp["gamma"],
        "seed":
        tune.grid_search(selected_seeds),
        # === Optimization ===
        # Learning rate for adam optimizer
        "lr":
        hp["base_lr"],
        # Learning rate schedule
        "lr_schedule":
        hp["lr_schedule"],
        # If not None, clip gradients during optimization at this value
        "grad_clip":
        1,
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length":
        hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        "training_intensity":
        hp["training_intensity"],
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration":
        hp["n_steps_per_epi"] if hp["debug"] else int(hp["n_steps_per_epi"] *
                                                      hp["n_epi"] /
                                                      hp["log_n_points"]),
        "min_iter_time_s":
        0.0,
        # General config
        "framework":
        "torch",
        # LE supports only 1 worker, otherwise
        # it would mix several opponents' trajectories
        "num_workers":
        0,
        # LE supports only 1 env per worker, otherwise
        # several episodes would be played at the same time
        "num_envs_per_worker":
        1,
        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py` for more usage
        # information.
        "callbacks":
        callbacks.merge_callbacks(
            amTFT.AmTFTCallbacks,
            log.get_logging_callbacks_class(log_full_epi=True,
                                            log_full_epi_interval=100),
        ),
        "logger_config": {
            "wandb": {
                "project":
                "amTFT",
                "group":
                hp["exp_name"],
                "api_key_file":
                os.path.join(os.path.dirname(__file__),
                             "../../../api_key_wandb"),
                "log_config":
                True,
            },
        },
        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq":
        hp["target_network_update_freq"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size":
        max(int(hp["n_steps_per_epi"] * hp["n_epi"] * hp["buf_frac"]), 5),
        # Whether to use dueling dqn
        "dueling":
        True,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens":
        hp["hiddens"],
        # Whether to use double dqn
        "double_q":
        True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay":
        False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": hp["hiddens"],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },
        # How many steps of the model to sample before learning starts.
        "learning_starts":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore":
        True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.
            # EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": hp["temperature_schedule"],
        },
    }

    if "CoinGame" in hp["env_name"]:
        rllib_config["model"] = {
            "dim": env_config["grid_size"],
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
            # [Channel, [Kernel, Kernel], Stride]
        }

    return stop, env_config, rllib_config
Example 12
def get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],  # 4000 steps in 200 epi
    }

    env_config = {
        "players_ids": ["player_row", "player_col"],
        "max_steps": hp["n_steps_per_epi"],
    }

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        optimizer_fn=sgd_optimizer_dqn,
        stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))

    ltft_config = merge_dicts(
        LTFT_DEFAULT_CONFIG_UPDATE,
        {
            "sgd_momentum": 0.9,
            'nested_policies': [
                # Here the trainer needs to be a DQNTrainer to provide
                # the config for the 3 DQNTorchPolicy instances
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": SPLTorchPolicy.with_updates(optimizer_fn=sgd_optimizer_spl), "config_update": {
                    "learn_action": True,
                    "learn_reward": False,
                    "sgd_momentum": 0.75,
                    "explore": False,
                    "timesteps_per_iteration": hp["n_steps_per_epi"],
                    # === Optimization ===
                    # Learning rate for adam optimizer
                    "lr": hp["base_lr"] * hp["spl_lr_mul"],
                    # Learning rate schedule
                    "lr_schedule": [(0, hp["base_lr"] * hp["spl_lr_mul"]),
                                    (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
                    "loss_fn": torch.nn.CrossEntropyLoss(
                        weight=None,
                        size_average=None,
                        ignore_index=-100,
                        reduce=None,
                        reduction='mean')
                }},
            ],
        }
    )

    MyUncertainIPD = add_RewardUncertaintyEnvClassWrapper(
        IteratedPrisonersDilemma,
        reward_uncertainty_std=0.1)

    rllib_config = {
        "env": MyUncertainIPD,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "player_row": (
                    # The default policy is DQNTorchPolicy defined in DQNTrainer but we overwrite it to use the LTFT policy
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
                "player_col": (
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]),
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [4],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [4, 2],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        "gamma": 0.5,
        "min_iter_time_s": 0.33,
        "seed": tune.grid_search(hp["seeds"]),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [(0, hp["base_lr"]),
                        (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the name
            # (str) of any class present in the `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full location
            # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
            # EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": PiecewiseSchedule(
                endpoints=[
                    (0, 1.0), (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.75), 0.1)],
                outside_value=0.1,
                framework="torch")
        },

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFT supports only 1 worker, otherwise it would mix
        # several opponents' trajectories
        "num_workers": 0,
        # LTFT supports only 1 env per worker, otherwise several
        # episodes would be played at the same time
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",

        # # === Debug Settings ===
        # # Whether to write episode stats and videos to the agent log dir. This is
        # # typically located in ~/ray_results.
        # "monitor": True,
        # # Set the ray.rllib.* log level for the agent process and its workers.
        # # Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level will also
        # # periodically print out summaries of relevant internal dataflow (this is
        # # also printed out once at startup at the INFO level). When using the
        # # `rllib train` command, you can also use the `-v` and `-vv` flags as
        # # shorthand for INFO and DEBUG.
        # "log_level": "INFO",
        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        # "callbacks": DefaultCallbacks,
        "callbacks": miscellaneous.merge_callbacks(LTFTCallbacks,
                                                   log.get_logging_callbacks_class()),
        # # Whether to attempt to continue training if a worker crashes. The number
        # # of currently healthy workers is reported as the "num_healthy_workers"
        # # metric.
        # "ignore_worker_failures": False,
        # # Log system resource metrics to results. This requires `psutil` to be
        # # installed for sys stats, and `gputil` for GPU metrics.
        # "log_sys_usage": True,
        # # Use fake (infinite speed) sampler. For testing only.
        # "fake_sampler": False,
    }

    return rllib_config, env_config, stop
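
A minimal launch sketch follows; per the comment inside ltft_config, the trainer has to be a DQNTrainer so it supplies the base config expected by the nested DQNTorchPolicy instances. The hp dict is assumed to be built elsewhere with the keys used above.

# Sketch of launching the LTFT experiment (assumed wiring).
import ray
from ray import tune
from ray.rllib.agents.dqn import DQNTrainer

# hp is assumed to be built elsewhere with the keys used above.
rllib_config, env_config, stop = get_rllib_config(hp)

ray.init()
tune.run(DQNTrainer, config=rllib_config, stop=stop, checkpoint_at_end=True)
ray.shutdown()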
Example 13
def _get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],
    }

    env_config = _get_env_config(hp)

    my_uncertain_env_class = add_RewardUncertaintyEnvClassWrapper(
        hp["env_class"], reward_uncertainty_std=hp["reward_uncertainty_std"])

    rllib_config = copy.deepcopy(ltft.DEFAULT_CONFIG)
    rllib_config.update({
        "env":
        my_uncertain_env_class,
        "env_config":
        env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    None,
                    hp["env_class"]({}).OBSERVATION_SPACE,
                    hp["env_class"].ACTION_SPACE,
                    {},
                ),
                env_config["players_ids"][1]: (
                    None,
                    hp["env_class"]({}).OBSERVATION_SPACE,
                    hp["env_class"].ACTION_SPACE,
                    {},
                ),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
            # When replay_mode=lockstep, RLlib will replay all the agent
            # transitions at a particular timestep together in a batch.
            # This allows the policy to implement differentiable shared
            # computations between agents it controls at that timestep. When
            # replay_mode=independent,
            # transitions are replayed independently per policy.
            # "replay_mode": "lockstep",
            "observation_fn": ltft.observation_fn,
        },
        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq":
        30 * hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size":
        max(int(hp["n_steps_per_epi"] * hp["n_epi"] * hp["buf_frac"]), 5),
        # Whether to use dueling dqn
        "dueling":
        False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens":
        hp["hiddens"],
        # Whether to use double dqn
        "double_q":
        True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay":
        False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": hp["hiddens"],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },
        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore":
        True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQScheduleWtClustering,
            # Add constructor kwargs here (if any).
            "temperature_schedule": hp["temperature_schedule"],
            "clustering_distance": hp["clustering_distance"],
        },
        "gamma":
        hp["gamma"],
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration":
        hp["n_steps_per_epi"] if hp["debug"] else int(hp["n_steps_per_epi"] *
                                                      hp["n_epi"] /
                                                      hp["log_n_points"]),
        "min_iter_time_s":
        0.0,
        "seed":
        tune.grid_search(hp["seeds"]),
        # === Optimization ===
        "optimizer": {
            "sgd_momentum": hp["sgd_momentum"],
        },
        # Learning rate for adam optimizer
        "lr":
        hp["base_lr"],
        # Learning rate schedule
        "lr_schedule":
        hp["lr_schedule"],
        # If not None, clip gradients during optimization at this value
        "grad_clip":
        1,
        # How many steps of the model to sample before learning starts.
        "learning_starts":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length":
        hp["n_steps_per_epi"],
        "training_intensity":
        hp["training_intensity"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # General config
        "framework":
        "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus":
        int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFTTorchPolicy supports only 1 worker, otherwise
        # it would mix several opponents' trajectories
        "num_workers":
        0,
        # LTFTTorchPolicy supports only 1 env per worker,
        # otherwise several episodes would be played at the same time
        "num_envs_per_worker":
        1,
        "batch_mode":
        "complete_episodes",
        "logger_config": {
            "wandb": {
                "project":
                "LTFT",
                "group":
                hp["exp_name"],
                "api_key_file":
                os.path.join(os.path.dirname(__file__),
                             "../../../api_key_wandb"),
                "log_config":
                True,
            },
        },
        # === Debug Settings ===
        "log_level":
        "INFO",
        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        "callbacks":
        callbacks.merge_callbacks(
            ltft.LTFTCallbacks,
            log.get_logging_callbacks_class(log_full_epi=True),
        ),
    })

    hp, rllib_config, env_config, stop = _modify_config_for_coin_game(
        hp, rllib_config, env_config, stop)

    nested_policies_config = rllib_config["nested_policies"]
    nested_spl_policy_config = nested_policies_config[3]["config_update"]
    nested_spl_policy_config["train_batch_size"] = (int(
        hp["n_steps_per_epi"] * hp["bs_epi_mul_spl"]), )
    rllib_config["nested_policies"] = nested_policies_config

    return rllib_config, env_config, stop