def get_rllib_config(hp: dict, lvl1_idx: list, lvl1_training: bool):
    assert lvl1_training

    tune_config, _, env_config = get_tune_config(hp=hp)
    tune_config["seed"] = 2020

    stop = {"episodes_total": hp["n_epi"]}

    after_init_fn = functools.partial(
        miscellaneous.sequence_of_fn_wt_same_args,
        function_list=[restore.after_init_load_policy_checkpoint, after_init],
    )

    def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
        return torch.optim.SGD(
            policy.q_func_vars,
            lr=policy.cur_lr,
            momentum=config["sgd_momentum"],
        )

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        stats_fn=log.augment_stats_fn_wt_additionnal_logs(build_q_stats),
        optimizer_fn=sgd_optimizer_dqn,
        after_init=after_init_fn,
    )

    if tune_config["env_class"] in (
        IteratedPrisonersDilemma,
        IteratedBoS,
        IteratedAsymChicken,
        IteratedAsymBoS,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
        })
    elif tune_config["env_class"] in (
        VectorizedCoinGame,
        AsymVectorizedCoinGame,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
            "batch_size": 1,
        })
    else:
        raise ValueError()

    tune_config["TuneTrainerClass"] = hp["tune_class"]
    tune_config["env_config"] = env_config

    policies = {}
    for policy_idx, policy_id in enumerate(env_config["players_ids"]):
        if policy_idx not in lvl1_idx:
            policies[policy_id] = (
                policy.get_tune_policy_class(DQNTorchPolicy),
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {
                    "sgd_momentum": hp["sgd_momentum"],
                    "tune_config": tune_config,
                },
            )
        else:
            policies[policy_id] = (
                MyDQNTorchPolicy,
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {"sgd_momentum": hp["sgd_momentum"]},
            )

    rllib_config = {
        "env": tune_config["env_class"],
        "env_config": env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],

        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]) // 4,
        # Whether to use dueling DQN.
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [64],
        # Whether to use double DQN.
        "double_q": True,
        # If True, a prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for the fully connected net.
            "fcnet_hiddens": [64],
            # Nonlinearity for the fully connected net (tanh, relu).
            "fcnet_activation": "relu",
        },
        "gamma": hp["gamma"],
        "min_iter_time_s": 3.0,
        # Can't restore a fixed seed when using a search/grid over seeds.
        # "seed": hp["seed"],
        "seed": tune.grid_search(
            hp["lvl1_seeds"] if lvl1_training else hp["lvl0_seeds"]),

        # "evaluation_num_episodes": 100,
        # "evaluation_interval": hparams["n_epi"],

        # === Optimization ===
        # Learning rate for the optimizer.
        "lr": hp["base_lr"],
        # Learning rate schedule.
        "lr_schedule": [
            (0, hp["base_lr"]),
            (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9),
        ],
        # Adam epsilon hyperparameter.
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value.
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from the replay buffer for training. Note
        # that if async_updates is set, then each worker returns gradients
        # for a batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the
            # name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full
            # location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": hp["temperature_schedule"]
            or PiecewiseSchedule(
                endpoints=[
                    (0, 10.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33), 1.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66), 0.1),
                ],
                outside_value=0.1,
                framework="torch",
            ),
        },

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var is set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LE supports only 1 worker, otherwise trajectories from several
        # opponents would be mixed together.
        "num_workers": 0,
        # LE supports only 1 env per worker, otherwise several episodes
        # would be played at the same time.
        "num_envs_per_worker": 1,

        # Callbacks that will be run during various phases of training. See
        # the `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py` for more usage
        # information.
        "callbacks": callbacks.merge_callbacks(
            log.get_logging_callbacks_class(),
            callbacks.PolicyCallbacks,
            # population.PopulationOfIdenticalAlgoCallBacks
        ),
        "log_level": "INFO",
    }

    if "CoinGame" in hp["env_name"]:
        rllib_config["model"] = {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
        }

    return stop, env_config, rllib_config
# Trainer config using Rainbow DQN with HER.
HER_RAINBOW_DQN_CONFIG = DEFAULT_CONFIG.copy()
HER_RAINBOW_DQN_CONFIG.update({
    # Hindsight Experience Replay
    "batch_mode": "complete_episodes",  # postprocess with the full trajectory
    "num_her_traj": 6,  # number of new trajectories sampled using HER

    # Rainbow DQN config
    "n_step": 1,  # n-step TD
    "noisy": True,  # noisy network
    "num_atoms": 1,  # number of distributional buckets
    "v_min": -10.0,
    "v_max": 10.0,
})

HERRainbowTrainer = build_trainer(
    name="HER_RainbowDQN",
    default_policy=DQNTorchPolicy.with_updates(
        postprocess_fn=postprocess_with_HER),
    default_config=HER_RAINBOW_DQN_CONFIG)

if __name__ == "__main__":
    ray.init()

    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=1000000)
    args = parser.parse_args()

    tune.run(
        HERRainbowTrainer,
        config={
            "env": "CartPole-v1",
            "num_workers": 1,
            "num_gpus": 1,
        },
        stop={
            "timesteps_total": args.steps,
    entropy_avg, entropy_single = log._compute_entropy_from_raw_q_values(
        policy, policy.last_q_t.clone()
    )

    return dict(
        {
            "entropy_avg": entropy_avg,
            "cur_lr": policy.cur_lr,
        },
        **policy.q_loss.stats,
    )


MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
    optimizer_fn=optimizers.sgd_optimizer_dqn,
    loss_fn=build_q_losses_wt_additional_logs,
    stats_fn=log.augment_stats_fn_wt_additionnal_logs(
        build_q_stats_wt_addtional_log
    ),
    before_init=policy.my_setup_early_mixins,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        policy.MyLearningRateSchedule,
    ],
)

MyAdamDQNTorchPolicy = MyDQNTorchPolicy.with_updates(
    optimizer_fn=optimizers.adam_optimizer_dqn,
)
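# For reference, a plausible sketch of the two optimizer factories assumed
# above. The real implementations live in the project's `optimizers` module;
# these versions simply mirror the inline SGD variant used elsewhere in the
# repo and wrap the policy's Q-network variables.
def sgd_optimizer_dqn_sketch(policy, config) -> "torch.optim.Optimizer":
    # SGD over the Q-network parameters, driven by the policy's scheduled LR.
    return torch.optim.SGD(
        policy.q_func_vars,
        lr=policy.cur_lr,
        momentum=config["sgd_momentum"],
    )


def adam_optimizer_dqn_sketch(policy, config) -> "torch.optim.Optimizer":
    # Adam counterpart; `adam_epsilon` is RLlib's standard DQN config key.
    return torch.optim.Adam(
        policy.q_func_vars,
        lr=policy.cur_lr,
        eps=config["adam_epsilon"],
    )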
def get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],  # 4000 steps in 200 epi
    }

    env_config = {
        "players_ids": ["player_row", "player_col"],
        "max_steps": hp["n_steps_per_epi"],
    }

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        optimizer_fn=sgd_optimizer_dqn,
        stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))

    ltft_config = merge_dicts(
        LTFT_DEFAULT_CONFIG_UPDATE,
        {
            "sgd_momentum": 0.9,
            'nested_policies': [
                # Here the trainer needs to be a DQNTrainer, to provide the
                # config for the three DQNTorchPolicy instances.
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": SPLTorchPolicy.with_updates(
                    optimizer_fn=sgd_optimizer_spl),
                 "config_update": {
                     "learn_action": True,
                     "learn_reward": False,
                     "sgd_momentum": 0.75,
                     "explore": False,
                     "timesteps_per_iteration": hp["n_steps_per_epi"],

                     # === Optimization ===
                     # Learning rate for the optimizer.
                     "lr": hp["base_lr"] * hp["spl_lr_mul"],
                     # Learning rate schedule.
                     "lr_schedule": [
                         (0, hp["base_lr"] * hp["spl_lr_mul"]),
                         (int(hp["n_steps_per_epi"] * hp["n_epi"]),
                          hp["base_lr"] / 1e9)],
                     "loss_fn": torch.nn.CrossEntropyLoss(
                         weight=None,
                         size_average=None,
                         ignore_index=-100,
                         reduce=None,
                         reduction='mean'),
                 }},
            ],
        }
    )

    MyUncertainIPD = add_RewardUncertaintyEnvClassWrapper(
        IteratedPrisonersDilemma,
        reward_uncertainty_std=0.1)

    rllib_config = {
        "env": MyUncertainIPD,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                # The default policy is the DQNTorchPolicy defined in
                # DQNTrainer, but we overwrite it to use the LTFT policy.
                "player_row": (
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
                "player_col": (
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],

        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]),
        # Whether to use dueling DQN.
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [4],
        # Whether to use double DQN.
        "double_q": True,
        # If True, a prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for the fully connected net.
            "fcnet_hiddens": [4, 2],
            # Nonlinearity for the fully connected net (tanh, relu).
            "fcnet_activation": "relu",
        },
        "gamma": 0.5,
        "min_iter_time_s": 0.33,
        "seed": tune.grid_search(hp["seeds"]),

        # === Optimization ===
        # Learning rate for the optimizer.
        "lr": hp["base_lr"],
        # Learning rate schedule.
        "lr_schedule": [
            (0, hp["base_lr"]),
            (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
        # Adam epsilon hyperparameter.
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value.
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from the replay buffer for training. Note
        # that if async_updates is set, then each worker returns gradients
        # for a batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the
            # name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full
            # location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": PiecewiseSchedule(
                endpoints=[
                    (0, 1.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.75), 0.1)],
                outside_value=0.1,
                framework="torch"),
        },

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var is set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFT supports only 1 worker, otherwise trajectories from several
        # opponents would be mixed together.
        "num_workers": 0,
        # LTFT supports only 1 env per worker, otherwise several episodes
        # would be played at the same time.
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",

        # # === Debug Settings ===
        # # Whether to write episode stats and videos to the agent log dir.
        # # This is typically located in ~/ray_results.
        # "monitor": True,
        # # Set the ray.rllib.* log level for the agent process and its
        # # workers. Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG
        # # level will also periodically print out summaries of relevant
        # # internal dataflow (this is also printed out once at startup at
        # # the INFO level). When using the `rllib train` command, you can
        # # also use the `-v` and `-vv` flags as shorthand for INFO and DEBUG.
        # "log_level": "INFO",

        # Callbacks that will be run during various phases of training. See
        # the `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py` for more usage
        # information.
        # "callbacks": DefaultCallbacks,
        "callbacks": miscellaneous.merge_callbacks(
            LTFTCallbacks,
            log.get_logging_callbacks_class()),

        # # Whether to attempt to continue training if a worker crashes. The
        # # number of currently healthy workers is reported as the
        # # "num_healthy_workers" metric.
        # "ignore_worker_failures": False,
        # # Log system resource metrics to results. This requires `psutil` to
        # # be installed for sys stats, and `gputil` for GPU metrics.
        # "log_sys_usage": True,
        # # Use fake (infinite speed) sampler. For testing only.
        # "fake_sampler": False,
    }

    return rllib_config, env_config, stop
def after_init(policy, obs_space, action_space, config):
    # ComputeTDErrorMixin.__init__(policy)
    RainbowComputeTDErrorMixin.__init__(policy)
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)
    # Move the target net to the device (this is done automatically for
    # policy.model, but not for any other models the policy has).
    policy.target_q_model = policy.target_q_model.to(policy.device)


#######################################################################################################
#####################################           Policy           #####################################
#######################################################################################################

# Hack to avoid cyclic imports.
import algorithms.baselines.rainbow.rainbow_trainer

BaselineRainbowTorchPolicy = DQNTorchPolicy.with_updates(
    name="BaselineRainbowTorchPolicy",
    loss_fn=build_rainbow_q_losses,
    make_model_and_action_dist=build_q_model_and_distribution,
    action_distribution_fn=get_distribution_inputs_and_class,
    # get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    get_default_config=lambda: algorithms.baselines.rainbow.rainbow_trainer.RAINBOW_CONFIG,
    after_init=after_init,
    mixins=[
        TargetNetworkMixin,
        RainbowComputeTDErrorMixin,
        LearningRateSchedule,
    ])
    # ============= Exploration =============
    "explore": True,
    "exploration_config": {
        # Exploration sub-class by name or full path to module+class
        # (e.g. "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
        "type": "EpsilonGreedy",
        # Parameters for the Exploration class' constructor:
        # "initial_epsilon": 1.0,
        # "final_epsilon": 0.1,
        # "epsilon_timesteps": 800_000,  # Timesteps over which to anneal epsilon.
    },
})

# Step 3: build the policy with the HER postprocess function.
HERRainbowPolicy = DQNTorchPolicy.with_updates(
    postprocess_fn=postprocess_with_HER,
    get_default_config=lambda: HER_RAINBOW_DQN_CONFIG)

# Step 4: build the off-policy HER trainer using the off-policy execution plan.
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.dqn.dqn import validate_config, execution_plan

HERRainbowTrainer = build_trainer(
    name=f"{'' if HER_RAINBOW_DQN_CONFIG['use_HER'] else 'NO'}HER_RainbowDQN_16x16",
    default_policy=HERRainbowPolicy,
    default_config=HER_RAINBOW_DQN_CONFIG,
    validate_config=validate_config,
    execution_plan=execution_plan)

if __name__ == "__main__":
    ray.init()
def get_distribution_inputs_and_class(policy,
                                      model,
                                      obs_batch,
                                      *,
                                      explore=True,
                                      is_training=False,
                                      **kwargs):
    q_vals = compute_q_values(policy, model, obs_batch, explore, is_training)
    q_vals = q_vals[0] if isinstance(q_vals, tuple) else q_vals

    policy.q_values = q_vals
    return policy.q_values, TorchCategorical, []  # state-out


#######################################################################################################
#####################################           Policy           #####################################
#######################################################################################################

# Hack to avoid cyclic imports.
import algorithms.baselines.dqn.dqn_trainer

BaselineDQNTorchPolicy = DQNTorchPolicy.with_updates(
    name="BaselineDQNTorchPolicy",
    make_model_and_action_dist=build_q_model_and_distribution,
    action_distribution_fn=get_distribution_inputs_and_class,
    # get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    get_default_config=lambda: algorithms.baselines.dqn.dqn_trainer.DQN_CONFIG
)
from ray.rllib.agents.dqn import ApexTrainer
from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
import ray
from ray import tune
from ray.tune import register_env

from scripts.models import loss_callback, custom_eval_fn


def my_get_policy(*args, **kwargs):
    print(f'GET POLICY:\n{args=}\n{kwargs=}\n')
    return MyPolicy


MyPolicy = DQNTorchPolicy.with_updates(
    name='MyPolicy',
    loss_fn=loss_callback,
)

MyTrainer = ApexTrainer.with_updates(
    name='MyDQN',
    get_policy_class=my_get_policy,
    default_policy=MyPolicy,
)
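# Minimal usage sketch showing how the pieces above could be wired together;
# the env name, env creator, and config values are placeholders, and
# `custom_eval_fn` is plugged in via RLlib's `custom_eval_function` key.
if __name__ == '__main__':
    from scripts.envs import make_my_env  # hypothetical env creator

    ray.init()
    register_env('my_env', make_my_env)
    tune.run(
        MyTrainer,
        config={
            'env': 'my_env',
            'framework': 'torch',
            'num_workers': 2,
            'custom_eval_function': custom_eval_fn,
            'evaluation_interval': 10,
        },
        stop={'timesteps_total': 1_000_000},
    )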