def get_rllib_config(hp: dict, lvl1_idx: list, lvl1_training: bool):
    assert lvl1_training

    tune_config, _, env_config = get_tune_config(hp=hp)
    tune_config["seed"] = 2020

    stop = {"episodes_total": hp["n_epi"]}

    after_init_fn = functools.partial(
        miscellaneous.sequence_of_fn_wt_same_args,
        function_list=[restore.after_init_load_policy_checkpoint, after_init],
    )

    def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
        return torch.optim.SGD(
            policy.q_func_vars,
            lr=policy.cur_lr,
            momentum=config["sgd_momentum"],
        )

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        stats_fn=log.augment_stats_fn_wt_additionnal_logs(build_q_stats),
        optimizer_fn=sgd_optimizer_dqn,
        after_init=after_init_fn,
    )

    if tune_config["env_class"] in (
            IteratedPrisonersDilemma,
            IteratedBoS,
            IteratedAsymChicken,
            IteratedAsymBoS,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
        })

    elif tune_config["env_class"] in (
            VectorizedCoinGame,
            AsymVectorizedCoinGame,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
            "batch_size": 1,
        })

    else:
        raise ValueError(f"Unsupported env_class: {tune_config['env_class']}")

    tune_config["TuneTrainerClass"] = hp["tune_class"]
    tune_config["TuneTrainerClass"] = hp["tune_class"]
    tune_config["env_config"] = env_config
    policies = {}
    for policy_idx, policy_id in enumerate(env_config["players_ids"]):
        if policy_idx not in lvl1_idx:
            policies[policy_id] = (
                policy.get_tune_policy_class(DQNTorchPolicy),
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {
                    "sgd_momentum": hp["sgd_momentum"],
                    "tune_config": tune_config,
                },
            )
        else:
            policies[policy_id] = (
                MyDQNTorchPolicy,
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {
                    "sgd_momentum": hp["sgd_momentum"]
                },
            )

    rllib_config = {
        "env": tune_config["env_class"],
        "env_config": env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]) // 4,
        # Whether to use dueling DQN.
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [64],
        # Whether to use double DQN.
        "double_q": True,
        # If True, a prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net.
            "fcnet_hiddens": [64],
            # Nonlinearity for fully connected net (tanh, relu).
            "fcnet_activation": "relu",
        },
        "gamma": hp["gamma"],
        "min_iter_time_s": 3.0,
        # Can't restore checkpoints when using a search
        # "seed": hp["seed"],
        "seed": tune.grid_search(
            hp["lvl1_seeds"] if lvl1_training else hp["lvl0_seeds"]),
        # "evaluation_num_episodes": 100,
        # "evaluation_interval": hparams["n_epi"],

        # === Optimization ===
        # Learning rate for the optimizer.
        "lr": hp["base_lr"],
        # Learning rate schedule.
        "lr_schedule": [
            (0, hp["base_lr"]),
            (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9),
        ],
        # Adam epsilon hyperparameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value.
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the
            # name (str) of any class present in the
            # `rllib.utils.exploration` package. You can also provide the
            # python class directly or the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": hp["temperature_schedule"]
            or PiecewiseSchedule(
                endpoints=[
                    (0, 10.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33), 1.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66), 0.1),
                ],
                outside_value=0.1,
                framework="torch",
            ),
        },

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LE supports only 1 worker, otherwise it would mix several
        # opponents' trajectories.
        "num_workers": 0,
        # LE supports only 1 env per worker, otherwise several episodes
        # would be played at the same time.
        "num_envs_per_worker": 1,
        # Callbacks that will be run during various phases of training. See
        # the `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py` for more usage
        # information.
        "callbacks": callbacks.merge_callbacks(
            log.get_logging_callbacks_class(),
            callbacks.PolicyCallbacks,
            # population.PopulationOfIdenticalAlgoCallBacks
        ),
        "log_level": "INFO",
    }

    if "CoinGame" in hp["env_name"]:
        rllib_config["model"] = {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
        }

    return stop, env_config, rllib_config
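A minimal launch sketch for the config above, assuming RLlib's stock DQNTrainer is the intended trainer and that the experiment name comes from a hypothetical hp["exp_name"] entry (neither is shown in the snippet):

# Hypothetical launcher; DQNTrainer and hp["exp_name"] are assumptions that
# are not part of the original snippet.
import ray
from ray import tune
from ray.rllib.agents.dqn import DQNTrainer


def train_lvl1_agents(hp: dict, lvl1_idx: list):
    stop, _env_config, rllib_config = get_rllib_config(
        hp, lvl1_idx=lvl1_idx, lvl1_training=True)
    ray.init()
    results = tune.run(
        DQNTrainer,
        config=rllib_config,
        stop=stop,
        checkpoint_at_end=True,
        name=hp.get("exp_name", "lvl1_dqn"),
    )
    ray.shutdown()
    return results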
Example #2
# Trainer config using Rainbow DQN with HER
HER_RAINBOW_DQN_CONFIG = DEFAULT_CONFIG.copy()
HER_RAINBOW_DQN_CONFIG.update({
    # Hindsight Experience Replay
    "batch_mode": "complete_episodes",  # postprocess with full trajectory
    "num_her_traj": 6,  # number of new trajectories sampled using HER
    # Rainbow DQN Config
    "n_step": 1,  # n_step TD
    "noisy": True,  # noisy network
    "num_atoms": 1,  # number of distributional buckets
    "v_min": -10.0,
    "v_max": 10.0
})

HERRainbowTrainer = build_trainer(name="HER_RainbowDQN",
                                  default_policy=DQNTorchPolicy.with_updates(
                                      postprocess_fn=postprocess_with_HER),
                                  default_config=HER_RAINBOW_DQN_CONFIG)

if __name__ == "__main__":
    ray.init()
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=1000000)
    args = parser.parse_args()
    tune.run(HERRainbowTrainer,
             config={
                 "env": "CartPole-v1",
                 "num_workers": 1,
                 "num_gpus": 1,
             },
             stop={
                 "timesteps_total": args.steps,
Example #3
def build_q_stats_wt_addtional_log(policy, batch) -> dict:
    # Stats function (standard RLlib stats_fn signature) passed to
    # log.augment_stats_fn_wt_additionnal_logs below.
    entropy_avg, entropy_single = log._compute_entropy_from_raw_q_values(
        policy, policy.last_q_t.clone()
    )

    return dict(
        {
            "entropy_avg": entropy_avg,
            "cur_lr": policy.cur_lr,
        },
        **policy.q_loss.stats,
    )


MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
    optimizer_fn=optimizers.sgd_optimizer_dqn,
    loss_fn=build_q_losses_wt_additional_logs,
    stats_fn=log.augment_stats_fn_wt_additionnal_logs(
        build_q_stats_wt_addtional_log
    ),
    before_init=policy.my_setup_early_mixins,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        policy.MyLearningRateSchedule,
    ],
)

MyAdamDQNTorchPolicy = MyDQNTorchPolicy.with_updates(
    optimizer_fn=optimizers.adam_optimizer_dqn,
)
Example #4
def get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],  # 4000 steps in 200 epi
    }

    env_config = {
        "players_ids": ["player_row", "player_col"],
        "max_steps": hp["n_steps_per_epi"],
    }

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        optimizer_fn=sgd_optimizer_dqn,
        stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))

    ltft_config = merge_dicts(
        LTFT_DEFAULT_CONFIG_UPDATE,
        {
            "sgd_momentum": 0.9,
            'nested_policies': [
                # Here the trainer needs to be a DQNTrainer, which provides the config for the 3 DQNTorchPolicy instances below
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": SPLTorchPolicy.with_updates(optimizer_fn=sgd_optimizer_spl), "config_update": {
                    "learn_action": True,
                    "learn_reward": False,
                    "sgd_momentum": 0.75,
                    "explore": False,
                    "timesteps_per_iteration": hp["n_steps_per_epi"],
                    # === Optimization ===
                    # Learning rate for adam optimizer
                    "lr": hp["base_lr"] * hp["spl_lr_mul"],
                    # Learning rate schedule
                    "lr_schedule": [(0, hp["base_lr"] * hp["spl_lr_mul"]),
                                    (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
                    "loss_fn": torch.nn.CrossEntropyLoss(
                        weight=None,
                        size_average=None,
                        ignore_index=-100,
                        reduce=None,
                        reduction='mean')
                }},
            ],
        }
    )

    MyUncertainIPD = add_RewardUncertaintyEnvClassWrapper(
        IteratedPrisonersDilemma,
        reward_uncertainty_std=0.1)

    rllib_config = {
        "env": MyUncertainIPD,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "player_row": (
                    # The default policy is DQNTorchPolicy, defined in DQNTrainer, but we override it to use the LTFT policy
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
                "player_col": (
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]),
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [4],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [4, 2],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        "gamma": 0.5,
        "min_iter_time_s": 0.33,
        "seed": tune.grid_search(hp["seeds"]),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [(0, hp["base_lr"]),
                        (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the name
            # (str) of any class present in the `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full location
            # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
            # EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": PiecewiseSchedule(
                endpoints=[
                    (0, 1.0), (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.75), 0.1)],
                outside_value=0.1,
                framework="torch")
        },

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFT supports only 1 worker, otherwise it would mix several opponents' trajectories
        "num_workers": 0,
        # LTFT supports only 1 env per worker, otherwise several episodes would be played at the same time
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",

        # # === Debug Settings ===
        # # Whether to write episode stats and videos to the agent log dir. This is
        # # typically located in ~/ray_results.
        # "monitor": True,
        # # Set the ray.rllib.* log level for the agent process and its workers.
        # # Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level will also
        # # periodically print out summaries of relevant internal dataflow (this is
        # # also printed out once at startup at the INFO level). When using the
        # # `rllib train` command, you can also use the `-v` and `-vv` flags as
        # # shorthand for INFO and DEBUG.
        # "log_level": "INFO",
        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        # "callbacks": DefaultCallbacks,
        "callbacks": miscellaneous.merge_callbacks(LTFTCallbacks,
                                                   log.get_logging_callbacks_class()),
        # # Whether to attempt to continue training if a worker crashes. The number
        # # of currently healthy workers is reported as the "num_healthy_workers"
        # # metric.
        # "ignore_worker_failures": False,
        # # Log system resource metrics to results. This requires `psutil` to be
        # # installed for sys stats, and `gputil` for GPU metrics.
        # "log_sys_usage": True,
        # # Use fake (infinite speed) sampler. For testing only.
        # "fake_sampler": False,
    }

    return rllib_config, env_config, stop
Example #5
def after_init(policy, obs_space, action_space, config):
    # ComputeTDErrorMixin.__init__(policy)
    RainbowComputeTDErrorMixin.__init__(policy)
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)
    # Move target net to device (this is done automatically for the
    # policy.model, but not for any other models the policy has).
    policy.target_q_model = policy.target_q_model.to(policy.device)


#######################################################################################################
#####################################   Policy   #####################################################
#######################################################################################################

# hack to avoid circular imports
import algorithms.baselines.rainbow.rainbow_trainer

BaselineRainbowTorchPolicy = DQNTorchPolicy.with_updates(
    name="BaselineRainbowTorchPolicy",
    loss_fn=build_rainbow_q_losses,
    make_model_and_action_dist=build_q_model_and_distribution,
    action_distribution_fn=get_distribution_inputs_and_class,
    # get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    get_default_config=lambda: algorithms.baselines.rainbow.rainbow_trainer.
    RAINBOW_CONFIG,
    after_init=after_init,
    mixins=[
        TargetNetworkMixin,
        RainbowComputeTDErrorMixin,
        LearningRateSchedule,
    ])
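For completeness, one plausible way to expose this policy through a trainer, mirroring the build_trainer calls used in the HER examples (the trainer name is an assumption; RAINBOW_CONFIG comes from the module imported above):

# Hypothetical glue code; mirrors the build_trainer pattern used elsewhere in
# these examples. The trainer name is an assumption.
from ray.rllib.agents.trainer_template import build_trainer

BaselineRainbowTrainer = build_trainer(
    name="BaselineRainbowDQN",
    default_policy=BaselineRainbowTorchPolicy,
    default_config=(
        algorithms.baselines.rainbow.rainbow_trainer.RAINBOW_CONFIG),
)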
Example #6
    # ============= Exploration =============
    "explore": True,
    "exploration_config": {
        # Exploration sub-class by name or full path to module+class
        # (e.g. "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
        "type": "EpsilonGreedy",
        # Parameters for the Exploration class' constructor:
        # "initial_epsilon": 1.0,
        # "final_epsilon": 0.1,
        # "epsilon_timesteps": 800_000,  # Timesteps over which to anneal epsilon.
    },
})

# step 3: build policy with HER postprocess function
HERRainbowPolicy = DQNTorchPolicy.with_updates(
    postprocess_fn=postprocess_with_HER,
    get_default_config=lambda: HER_RAINBOW_DQN_CONFIG)

# step 4: build off-policy HER trainer using off-policy execution plan
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.dqn.dqn import validate_config, execution_plan
HERRainbowTrainer = build_trainer(
    name=
    f"{'' if HER_RAINBOW_DQN_CONFIG['use_HER'] else 'NO'}HER_RainbowDQN_16x16",
    default_policy=HERRainbowPolicy,
    default_config=HER_RAINBOW_DQN_CONFIG,
    validate_config=validate_config,
    execution_plan=execution_plan)

if __name__ == "__main__":
    ray.init()
Example #7

def get_distribution_inputs_and_class(policy,
                                      model,
                                      obs_batch,
                                      *,
                                      explore=True,
                                      is_training=False,
                                      **kwargs):
    q_vals = compute_q_values(policy, model, obs_batch, explore, is_training)
    q_vals = q_vals[0] if isinstance(q_vals, tuple) else q_vals

    policy.q_values = q_vals
    return policy.q_values, TorchCategorical, []  # state-out


#######################################################################################################
#####################################   Policy   #####################################################
#######################################################################################################

# hack to avoid circular imports
import algorithms.baselines.dqn.dqn_trainer

BaselineDQNTorchPolicy = DQNTorchPolicy.with_updates(
    name="BaselineDQNTorchPolicy",
    make_model_and_action_dist=build_q_model_and_distribution,
    action_distribution_fn=get_distribution_inputs_and_class,
    # get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    get_default_config=lambda: algorithms.baselines.dqn.dqn_trainer.DQN_CONFIG
)
from ray.rllib.agents.dqn import ApexTrainer
from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy

import ray
from ray import tune
from ray.tune import register_env

from scripts.models import loss_callback, custom_eval_fn


def my_get_policy(*args, **kwargs):
    print(f'GET POLICY:\n{args=}\n{kwargs=}\n')
    return MyPolicy


MyPolicy = DQNTorchPolicy.with_updates(
    name='MyPolicy',
    loss_fn=loss_callback,
)

MyTrainer = ApexTrainer.with_updates(
    name='MyDQN',
    get_policy_class=my_get_policy,
    default_policy=MyPolicy,
)
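A minimal launch sketch for MyTrainer, assuming a standard Gym environment id and that custom_eval_fn is meant to be wired in through RLlib's custom_eval_function option (both assumptions; the original script does not show how these imports are used):

# Hypothetical launch code; the environment id, the stop criterion, and the
# use of custom_eval_fn are assumptions.
if __name__ == '__main__':
    ray.init()
    tune.run(
        MyTrainer,
        config={
            "env": "CartPole-v1",
            "framework": "torch",
            "num_workers": 2,
            "custom_eval_function": custom_eval_fn,
            "evaluation_interval": 10,
            "evaluation_num_episodes": 10,
        },
        stop={"timesteps_total": 100_000},
    )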