class HParams(FCNet.HParams, OutputHead.HParams):
    """ Hyper-parameters of the OutputHead used for classification. """

    # NOTE: These hparams were basically copied over from FCNet.HParams, just
    # so it's a bit more visible.
    available_activations: ClassVar[Dict[str, Type[nn.Module]]] = {
        "relu": nn.ReLU,
        "tanh": nn.Tanh,
        "elu": nn.ELU,
        # No idea what these do, but hey, they are available!
        "gelu": nn.GELU,
        "relu6": nn.ReLU6,
    }
    # Number of hidden layers in the output head.
    hidden_layers: int = uniform(0, 3, default=0)
    # Number of neurons in each hidden layer of the output head.
    # If a single value is given, then each of the `hidden_layers` layers
    # will have that number of neurons.
    # If `n > 1` values are given, then `hidden_layers` must either be 0 or
    # `n`, otherwise a RuntimeError will be raised.
    hidden_neurons: Union[int, List[int]] = uniform(16, 512, default=64)
    activation: Type[nn.Module] = categorical(available_activations, default=nn.Tanh)
    # Dropout probability. Dropout is applied after each layer.
    # Set to None or 0 for no dropout.
    # TODO: Not sure if this is how it's typically used. Need to check.
    dropout_prob: Optional[float] = uniform(0, 0.8, default=0.2)

class HParams(HyperParameters):
    """ Hyper-parameters of the Settings. """

    # Learning rate of the optimizer.
    learning_rate: float = log_uniform(1e-6, 1e-2, default=0.001)
    # Batch size.
    batch_size: int = categorical(16, 32, 64, 128, default=128)
    # Weight/importance of the task embedding in the gate function.
    s_hat: float = uniform(1.0, 100.0, default=50.0)
    # Maximum number of training epochs per task.
    max_epochs_per_task: int = uniform(1, 20, default=10, discrete=True)

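# --- Example usage (added for illustration; not from the original source) ---
# A minimal sketch, assuming the `uniform()` / `log_uniform()` / `categorical()`
# priors (from simple-parsing's hparams helpers) act as regular dataclass
# fields with defaults when the class is instantiated directly:
hparams = HParams()
assert hparams.learning_rate == 0.001   # default of the `log_uniform` prior
assert hparams.batch_size == 128        # default of the `categorical` prior
hparams = HParams(learning_rate=3e-4, s_hat=25.0)   # manual overrides
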
class HParams(SB3BaseHParams):
    """ Hyper-parameters of the A2C Model.

    TODO: Set actual 'good' priors for these hyper-parameters, as these were
    set somewhat randomly.
    """

    # Discount factor.
    gamma: float = uniform(0.9, 0.9999, default=0.99)
    # Factor for trade-off of bias vs variance for the Generalized Advantage
    # Estimator. Equivalent to the classic advantage when set to 1.
    gae_lambda: float = uniform(0.5, 1.0, default=1.0)
    # Entropy coefficient for the loss calculation.
    ent_coef: float = uniform(0.0, 1.0, default=0.0)
    # Value function coefficient for the loss calculation.
    vf_coef: float = uniform(0.01, 1.0, default=0.5)
    # The maximum value for the gradient clipping.
    max_grad_norm: float = uniform(0.1, 10, default=0.5)
    # RMSProp epsilon. Stabilizes the square root computation in the
    # denominator of the RMSProp update.
    rms_prop_eps: float = log_uniform(1e-7, 1e-3, default=1e-5)
    # Whether to use RMSprop (default) or Adam as the optimizer.
    use_rms_prop: bool = categorical(True, False, default=True)
    # Whether to use generalized State Dependent Exploration (gSDE) instead of
    # action noise exploration (default: False).
    use_sde: bool = categorical(True, False, default=False)
    # Sample a new noise matrix every n steps when using gSDE.
    # Default: -1 (only sample at the beginning of the rollout).
    sde_sample_freq: int = categorical(-1, 1, 5, 10, default=-1)
    # Whether to normalize the advantage.
    normalize_advantage: bool = categorical(True, False, default=False)
    # The log location for tensorboard (if None, no logging).
    tensorboard_log: Optional[str] = None
    # # Whether to create a second environment that will be used for evaluating
    # # the agent periodically. (Only available when passing a string for the
    # # environment.)
    # create_eval_env: bool = False
    # # Additional arguments to be passed to the policy on creation.
    # policy_kwargs: Optional[Dict[str, Any]] = None
    # The verbosity level: 0 no output, 1 info, 2 debug.
    verbose: int = 0
    # Seed for the pseudo random generators.
    seed: Optional[int] = None
    # Device (cpu, cuda, ...) on which the code should be run.
    # Setting it to "auto", the code will be run on the GPU if possible.
    device: Union[torch.device, str] = "auto"

class HParams(HyperParameters):
    """ Hyper-parameters of a fully-connected network. """

    available_activations: ClassVar[Dict[str, Type[nn.Module]]] = {
        "relu": nn.ReLU,
        "tanh": nn.Tanh,
        "elu": nn.ELU,
        # No idea what these do, but hey, they are available!
        "gelu": nn.GELU,
        "relu6": nn.ReLU6,
    }
    # Number of hidden layers in the network.
    hidden_layers: int = uniform(0, 10, default=3)
    # Number of neurons in each hidden layer.
    # If a single value is given, then each of the `hidden_layers` layers
    # will have that number of neurons.
    # If `n > 1` values are given, then `hidden_layers` must either be 0 or
    # `n`, otherwise a RuntimeError will be raised.
    hidden_neurons: Union[int, List[int]] = uniform(16, 512, default=64)
    activation: Type[nn.Module] = categorical(available_activations, default=nn.Tanh)
    # Dropout probability. Dropout is applied after each layer.
    # Set to None or 0 for no dropout.
    # TODO: Not sure if this is how it's typically used. Need to check.
    dropout_prob: Optional[float] = uniform(0, 0.8, default=0.2)

    def __post_init__(self):
        super().__post_init__()
        if isinstance(self.activation, str):
            self.activation = self.available_activations[self.activation.lower()]
        if isinstance(self.hidden_neurons, int):
            self.hidden_neurons = [self.hidden_neurons]

        # No value passed to --hidden_layers:
        if self.hidden_layers == 0:
            if len(self.hidden_neurons) == 1:
                # Default setting: no hidden layers.
                self.hidden_neurons = []
            elif len(self.hidden_neurons) > 1:
                # Set the number of hidden layers to the number of passed values.
                self.hidden_layers = len(self.hidden_neurons)
        elif self.hidden_layers > 0 and len(self.hidden_neurons) == 1:
            # Duplicate that value for each of the `hidden_layers` layers.
            self.hidden_neurons *= self.hidden_layers
        elif self.hidden_layers == 1 and not self.hidden_neurons:
            self.hidden_layers = 0
        if self.hidden_layers != len(self.hidden_neurons):
            raise RuntimeError(
                f"Invalid values: hidden_layers ({self.hidden_layers}) != "
                f"len(hidden_neurons) ({len(self.hidden_neurons)})."
            )

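# --- Example usage (added for illustration; not from the original source) ---
# A minimal sketch of the `__post_init__` normalization above, assuming the
# class is instantiated directly (as a dataclass) with keyword arguments:
hp = HParams(hidden_layers=3, hidden_neurons=64)
assert hp.hidden_neurons == [64, 64, 64]   # single value duplicated per layer

hp = HParams(hidden_layers=0, hidden_neurons=[32, 64])
assert hp.hidden_layers == 2               # inferred from the number of values

# Inconsistent values raise a RuntimeError:
# HParams(hidden_layers=2, hidden_neurons=[16, 32, 64])  # -> RuntimeError
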
class HParams(HyperParameters):
    """ Hyper-parameters of the Pnn method. """

    # Learning rate of the optimizer. Defaults to 0.0001 when in SL.
    learning_rate: float = log_uniform(1e-6, 1e-2, default=2e-4)
    num_steps: int = 200  # (only applicable in RL settings.)
    # Discount factor (only used in RL settings).
    gamma: float = uniform(0.9, 0.999, default=0.99)
    # Number of hidden units (only used in RL settings).
    hidden_size: int = categorical(64, 128, 256, default=256)
    # Batch size in SL, and number of parallel environments in RL.
    # Defaults to None in RL, and 32 when in SL.
    batch_size: Optional[int] = None
    # Maximum number of training epochs per task (only used in SL settings).
    max_epochs_per_task: int = uniform(1, 20, default=10)

class HParams(PolicyHead.HParams):
    """ Hyper-parameters of the episodic A2C output head. """

    # Whether to normalize the advantages for each episode.
    normalize_advantages: bool = categorical(True, False, default=False)
    actor_loss_coef: float = uniform(0.1, 1, default=0.5)
    critic_loss_coef: float = uniform(0.1, 1, default=0.5)
    entropy_loss_coef: float = uniform(0, 1, default=0.1)
    # Maximum norm of the policy gradient.
    max_policy_grad_norm: Optional[float] = None
    # The discount factor.
    gamma: float = uniform(0.9, 0.999, default=0.99)

class Options(AuxiliaryTask.Options):
    """ Options of the EWC auxiliary task. """

    # Coefficient of the EWC auxiliary task.
    # NOTE: It seems to be the case that, at least for EWC, the coefficient
    # can often be much greater than 1, hence we overwrite the prior on that
    # hyper-parameter here.
    coefficient: float = uniform(0.0, 100.0, default=1.0)
    # Batch size to be used when computing the FIM (unused atm).
    batch_size_fim: int = 32
    # Number of observations to use for the FIM calculation.
    sample_size_fim: int = categorical(2, 4, 8, 16, 32, 64, 128, 256, 512, default=8)
    # Fisher information representation type (diagonal or block diagonal).
    fim_representation: Type[PMatAbstract] = choice(
        {"diagonal": PMatDiag, "block_diagonal": PMatKFAC},
        default=PMatDiag,
    )

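# --- Example usage (added for illustration; not from the original source) ---
# A minimal sketch of parsing these options from the command line with
# simple-parsing (assuming `Options` is importable); the dict given to
# `choice` maps each CLI string to the corresponding nngeometry class:
from simple_parsing import ArgumentParser

parser = ArgumentParser()
parser.add_arguments(Options, dest="ewc")
args = parser.parse_args("--coefficient 5.0 --fim_representation block_diagonal".split())
assert args.ewc.coefficient == 5.0
assert args.ewc.fim_representation is PMatKFAC
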
class HParams(MultiHeadClassifier.HParams):
    """ Hyperparameters of this improved method.

    Adds the hyper-parameters related to the 'ewc-like' regularization to
    those of the ExampleMethod.

    NOTE: The `uniform()` / `log_uniform()` priors and the `HyperParameters`
    base class are just there to make it easier to run HPO sweeps for your
    Method, which isn't required for the competition.
    """

    # Coefficient of the ewc-like loss.
    reg_coefficient: float = uniform(0.0, 10.0, default=1.0)
    # Distance norm used in the regularization loss.
    reg_p_norm: int = 2

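# --- Example usage (added for illustration; not from the original source) ---
# A rough sketch of what the NOTE above refers to: the priors make it possible
# to sample candidate configurations during an HPO sweep. This assumes the
# `HyperParameters` base class from simple-parsing exposes a `sample()`
# classmethod (an assumption, not verified here):
hparams = HParams.sample()   # random draw from the declared priors
assert 0.0 <= hparams.reg_coefficient <= 10.0
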
class HParams(SB3BaseHParams):
    """ Hyper-parameters of the PPO Model. """

    # # The policy model to use (MlpPolicy, CnnPolicy, ...)
    # policy: Union[str, Type[ActorCriticPolicy]]
    # # The environment to learn from (if registered in Gym, can be str)
    # env: Union[GymEnv, str]

    # The learning rate. It can be a function of the current progress
    # remaining (from 1 to 0).
    learning_rate: float = log_uniform(1e-6, 1e-2, default=3e-4)
    # The number of steps to run for each environment per update (i.e. the
    # batch size is n_steps * n_env where n_env is the number of environment
    # copies running in parallel).
    # TODO: Limit this, as is done in A2C, based on the value of
    # `setting.max_steps`.
    n_steps: int = categorical(32, 128, 256, 1024, 2048, 4096, 8192, default=2048)
    # Minibatch size.
    batch_size: Optional[int] = categorical(16, 32, 64, 128, default=64)
    # Number of epochs when optimizing the surrogate loss.
    n_epochs: int = 10
    # Discount factor.
    gamma: float = uniform(0.9, 0.9999, default=0.99)
    # Factor for trade-off of bias vs variance for the Generalized Advantage
    # Estimator.
    gae_lambda: float = uniform(0.8, 1.0, default=0.95)
    # Clipping parameter. It can be a function of the current progress
    # remaining (from 1 to 0).
    clip_range: float = uniform(0.05, 0.4, default=0.2)
    # Clipping parameter for the value function. It can be a function of the
    # current progress remaining (from 1 to 0). This is a parameter specific
    # to the OpenAI implementation. If None is passed (default), no clipping
    # will be done on the value function.
    # IMPORTANT: this clipping depends on the reward scaling.
    clip_range_vf: Optional[float] = None
    # Entropy coefficient for the loss calculation.
    ent_coef: float = uniform(0., 1., default=0.0)
    # Value function coefficient for the loss calculation.
    vf_coef: float = uniform(0.01, 1.0, default=0.5)
    # The maximum value for the gradient clipping.
    max_grad_norm: float = uniform(0.1, 10, default=0.5)
    # Whether to use generalized State Dependent Exploration (gSDE) instead of
    # action noise exploration (default: False).
    use_sde: bool = categorical(True, False, default=False)
    # Sample a new noise matrix every n steps when using gSDE.
    # Default: -1 (only sample at the beginning of the rollout).
    sde_sample_freq: int = categorical(-1, 1, 5, 10, default=-1)
    # Limit the KL divergence between updates, because the clipping is not
    # enough to prevent large updates; see issue #213
    # (cf https://github.com/hill-a/stable-baselines/issues/213).
    # By default, there is no limit on the KL divergence.
    target_kl: Optional[float] = None
    # The log location for tensorboard (if None, no logging).
    tensorboard_log: Optional[str] = None
    # # Whether to create a second environment that will be used for evaluating
    # # the agent periodically. (Only available when passing a string for the
    # # environment.)
    # create_eval_env: bool = False
    # # Additional arguments to be passed to the policy on creation.
    # policy_kwargs: Optional[Dict[str, Any]] = None
    # The verbosity level: 0 no output, 1 info, 2 debug.
    verbose: int = 1
    # Seed for the pseudo random generators.
    seed: Optional[int] = None
    # Device (cpu, cuda, ...) on which the code should be run. Setting it to
    # "auto", the code will be run on the GPU if possible.
    device: Union[torch.device, str] = "auto"

class HParams(SB3BaseHParams):
    """ Hyper-parameters common to all on-policy algos from SB3. """

    # Learning rate for the optimizer. It can be a function of the current
    # progress remaining (from 1 to 0).
    learning_rate: Union[float, Callable] = log_uniform(1e-7, 1e-2, default=1e-3)
    # The number of steps to run for each environment per update (i.e. the
    # batch size is n_steps * n_env where n_env is the number of environment
    # copies running in parallel).
    # NOTE: The default value here is much lower than in PPO, which might
    # indicate that A2C is more "on-policy" (i.e. that it requires the data
    # to be super "fresh").
    n_steps: int = uniform(3, 64, default=5, discrete=True)
    # Discount factor.
    gamma: float = 0.99
    # gamma: float = uniform(0.9, 0.9999, default=0.99)
    # Factor for trade-off of bias vs variance for the Generalized Advantage
    # Estimator. Equivalent to the classic advantage when set to 1.
    gae_lambda: float = 1.0
    # gae_lambda: float = uniform(0.5, 1.0, default=1.0)
    # Entropy coefficient for the loss calculation.
    ent_coef: float = 0.0
    # ent_coef: float = uniform(0.0, 1.0, default=0.0)
    # Value function coefficient for the loss calculation.
    vf_coef: float = 0.5
    # vf_coef: float = uniform(0.01, 1.0, default=0.5)
    # The maximum value for the gradient clipping.
    max_grad_norm: float = 0.5
    # max_grad_norm: float = uniform(0.1, 10, default=0.5)
    # Whether to use generalized State Dependent Exploration (gSDE) instead of
    # action noise exploration (default: False).
    use_sde: bool = False
    # use_sde: bool = categorical(True, False, default=False)
    # Sample a new noise matrix every n steps when using gSDE.
    # Default: -1 (only sample at the beginning of the rollout).
    sde_sample_freq: int = -1
    # sde_sample_freq: int = categorical(-1, 1, 5, 10, default=-1)
    # The log location for tensorboard (if None, no logging).
    tensorboard_log: Optional[str] = None
    # # Whether to create a second environment that will be used for evaluating
    # # the agent periodically. (Only available when passing a string for the
    # # environment.)
    # create_eval_env: bool = False
    # # Additional arguments to be passed to the policy on creation.
    # policy_kwargs: Optional[Dict[str, Any]] = None
    # The verbosity level: 0 no output, 1 info, 2 debug.
    verbose: int = 1
    # Seed for the pseudo random generators.
    seed: Optional[int] = None
    # Device (cpu, cuda, ...) on which the code should be run.
    # Setting it to "auto", the code will be run on the GPU if possible.
    device: Union[torch.device, str] = "auto"

class HParams(SB3BaseHParams):
    """ Hyper-parameters of the DQN model from `stable_baselines3`.

    The command-line arguments for these are created with simple-parsing.
    """

    # The learning rate. It can be a function of the current progress (from
    # 1 to 0).
    learning_rate: Union[float, Callable] = log_uniform(1e-6, 1e-2, default=1e-4)
    # Size of the replay buffer.
    buffer_size: int = uniform(100, 10_000_000, default=1_000_000)
    # How many steps of the model to collect transitions for before learning
    # starts.
    learning_starts: int = uniform(1_000, 100_000, default=50_000)
    # Minibatch size for each gradient update.
    batch_size: Optional[int] = categorical(1, 2, 4, 8, 16, 32, 128, default=32)
    # The soft update coefficient ("Polyak update", between 0 and 1).
    # Defaults to 1 for a hard update.
    tau: float = uniform(0., 1., default=1.0)
    # The discount factor.
    gamma: float = uniform(0.9, 0.9999, default=0.99)
    # Update the model every ``train_freq`` steps. Set to `-1` to disable.
    train_freq: int = uniform(1, 100, default=4)
    # How many gradient steps to do after each rollout (see ``train_freq``
    # and ``n_episodes_rollout``). Setting this to ``-1`` means to do as many
    # gradient steps as steps done in the environment during the rollout.
    gradient_steps: int = categorical(1, -1, default=1)
    # Enable a memory-efficient variant of the replay buffer at the cost of
    # more complexity.
    # See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
    optimize_memory_usage: bool = False
    # Update the target network every ``target_update_interval`` environment
    # steps.
    target_update_interval: int = categorical(1_000, 5_000, 10_000, 50_000,
                                              default=10_000)
    # Fraction of the entire training period over which the exploration rate
    # is reduced.
    exploration_fraction: float = uniform(0.05, 0.3, default=0.1)
    # Initial value of the random action probability.
    exploration_initial_eps: float = uniform(0.5, 1.0, default=1.0)
    # Final value of the random action probability.
    exploration_final_eps: float = uniform(0, 0.1, default=0.05)
    # The maximum value for the gradient clipping.
    max_grad_norm: float = uniform(1, 100, default=10)
    # Whether to create a second environment that will be used for evaluating
    # the agent periodically. (Only available when passing a string for the
    # environment.)
    create_eval_env: bool = False
    # Whether or not to build the network at the creation of the instance.
    _init_setup_model: bool = True

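# --- Example usage (added for illustration; not from the original source) ---
# A minimal sketch of forwarding these fields to the real `stable_baselines3`
# DQN constructor. It assumes SB3 v1.x (whose `DQN.__init__` accepts these
# exact keyword arguments) and that `HParams` only defines the fields shown:
from dataclasses import asdict

import gym
from stable_baselines3 import DQN

hparams = HParams()
model = DQN("MlpPolicy", gym.make("CartPole-v1"), **asdict(hparams))
model.learn(total_timesteps=10_000)
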
class TrainerConfig(HyperParameters):
    """ Configuration options for the pytorch-lightning Trainer.

    TODO: Pytorch Lightning already has a mechanism for adding argparse
    arguments for the Trainer. Would there be a better way of merging the
    simple-parsing and pytorch-lightning approaches?
    """

    gpus: int = torch.cuda.device_count()
    overfit_batches: float = 0.0
    fast_dev_run: bool = field(default=False, nargs=0, action="store_true")
    # Maximum number of epochs to train for.
    max_epochs: int = uniform(1, 100, default=10)
    # Number of nodes to use.
    num_nodes: int = 1
    distributed_backend: Optional[str] = "dp" if gpus != 0 else None
    log_gpu_memory: bool = False
    val_check_interval: Union[int, float] = 1.0
    auto_scale_batch_size: Optional[str] = None
    auto_lr_find: bool = False
    # Floating point precision to use in the model. (See pl.Trainer)
    precision: int = choice(16, 32, default=32)
    default_root_dir: Path = Path(os.getcwd()) / "results"
    # How much of the training dataset to check (float = percent, int = num_batches).
    limit_train_batches: Union[int, float] = 1.0
    # How much of the validation dataset to check (float = percent, int = num_batches).
    limit_val_batches: Union[int, float] = 1.0
    # How much of the test dataset to check (float = percent, int = num_batches).
    limit_test_batches: Union[int, float] = 1.0

    def make_trainer(
        self,
        config: Config,
        callbacks: Optional[List[Callback]] = None,
        loggers: Optional[Iterable[LightningLoggerBase]] = None,
    ) -> Trainer:
        """ Create a Trainer object from the command-line args.

        Adds the given loggers and callbacks as well.
        """
        return Trainer(
            logger=loggers,
            callbacks=callbacks,
            gpus=self.gpus,
            num_nodes=self.num_nodes,
            max_epochs=self.max_epochs,
            distributed_backend=self.distributed_backend,
            log_gpu_memory=self.log_gpu_memory,
            overfit_batches=self.overfit_batches,
            fast_dev_run=self.fast_dev_run,
            auto_scale_batch_size=self.auto_scale_batch_size,
            auto_lr_find=self.auto_lr_find,
            # TODO: Either move the log-dir-related stuff from Config to this
            # class, or figure out a way to pass the value from Config to this
            # function.
            default_root_dir=self.default_root_dir,
            limit_train_batches=self.limit_train_batches,
            limit_val_batches=self.limit_val_batches,
            # NOTE: Fixed a bug here: this previously passed
            # `self.limit_train_batches` for `limit_test_batches`.
            limit_test_batches=self.limit_test_batches,
        )

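# --- Example usage (added for illustration; not from the original source) ---
# A minimal sketch of building a `Trainer` from this config. `Config` is the
# project's experiment configuration class; note that `make_trainer` accepts
# it but (per the TODO above) does not use it yet:
config = Config()  # hypothetical: however the project constructs its Config
trainer_config = TrainerConfig(max_epochs=5, fast_dev_run=True)
trainer = trainer_config.make_trainer(config=config)
trainer.fit(model)  # `model` would be a LightningModule defined elsewhere
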
class Options(HyperParameters):
    """ Settings for this Auxiliary Task. """

    # Coefficient used to scale the task loss before adding it to the total.
    coefficient: float = uniform(0., 1., default=1.)