def dummy_config(model_arch):
    """Creates a dummy model config that can be used by all tests."""
    config = base_config.get_config()
    config.model_arch = model_arch.name
    config.d_emb = 2
    config.d_model = 2
    config.d_ff = 2
    config.max_seq_length = 4
    config.num_heads = 1
    config.num_layers = 2
    config.vocab_size = 16
    config.pad_id = 0
    config.train_batch_size = 3
    config.eval_batch_size = 2
    config.use_fft = True
    config.num_experts = 2
    config.num_moe_layers = 0
    config.num_attention_layers = 0
    config.max_group_size = 2
    config.auxiliary_loss_factor = 0.01
    config.router_z_loss_factor = 0.01
    config.dispatch_algorithm = DispatchAlgorithm.MASK_TOKENS_CHOOSE
    config.dtype = jnp.float32

    return config
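
# Usage sketch (assumption, not from the original tests): assumes the
# ModelArchitecture enum referenced by the configs below and that
# ml_collections is available, as in frozen_config further down.
def _example_dummy_config_usage():
    config = dummy_config(ModelArchitecture.LINEAR)
    # The tiny dimensions keep models built from this config cheap to run.
    assert config.d_model == 2 and config.num_layers == 2
    # Tests that require immutability can freeze the config.
    return ml_collections.FrozenConfigDict(config)
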
def test_validate_incorrect_configs(self):
    config = default_config.get_config()
    config.train_batch_size = 6
    config.gradient_accum_steps = 4
    with self.assertRaisesRegex(
            ValueError,
            "training batch size must be divisible by gradient_accum_steps"):
        train_utils.validate_config(config)
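
# Sketch (assumption) of the divisibility check exercised by the test above;
# inferred only from the expected error message, not from the actual
# train_utils.validate_config implementation.
def _check_gradient_accum(config):
    if config.train_batch_size % config.gradient_accum_steps != 0:
        raise ValueError(
            "The training batch size must be divisible by gradient_accum_steps.")
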
def get_config():
    """Config for pre-training."""
    config = base_config.get_config()

    # Determines which model to use.
    config.model_arch: str = ModelArchitecture.LINEAR.name

    config.mode: TrainingMode = TrainingMode.PRETRAINING

    # Total batch size for training.
    config.train_batch_size: int = 64
    # Total batch size for eval.
    config.eval_batch_size: int = 64

    # The base learning rate for Adam.
    config.learning_rate: float = 1e-4

    # Number of training steps.
    config.num_train_steps: int = int(1e6)
    # Number of warm-up steps. We generally find that larger models need more
    # warm-up steps.
    config.num_warmup_steps: int = int(1e4)

    # How often to save the model checkpoint.
    config.save_checkpoints_steps: int = 20000
    # Frequency of eval during training, e.g. every 2000 steps.
    config.eval_frequency: int = 2000

    # Maximum number of eval steps.
    config.max_num_eval_steps: int = 1000

    # Initial checkpoint directory; leave empty to not start from a
    # pre-trained checkpoint.
    config.init_checkpoint_dir: str = ""

    # Maximum number of masked LM predictions per sequence.
    config.max_predictions_per_seq: int = 80
    # Proportion of tokens for masked LM predictions. Total number of selected
    # tokens will be at most config.max_predictions_per_seq.
    config.masking_rate: float = 0.15
    # Proportion of masked tokens to replace with the [MASK] token.
    config.mask_token_proportion: float = 0.8
    # Proportion of masked tokens to replace with a random token.
    config.random_token_proportion: float = 0.1
    # The remaining (1 - config.mask_token_proportion -
    # config.random_token_proportion) fraction of selected tokens is left
    # unchanged.

    # Measure the step speed.
    config.measure_step_speed: bool = False

    # Dummy attribute for repeated runs.
    config.trial: int = 0

    return config
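
# Illustration (assumption, not library code) of how the masking settings
# above combine for a hypothetical 512-token sequence: at most
# min(max_predictions_per_seq, masking_rate * seq_length) tokens are selected,
# then split into [MASK] / random / unchanged according to the proportions.
def _masking_budget(config, seq_length=512):
    num_selected = min(config.max_predictions_per_seq,
                       int(config.masking_rate * seq_length))
    num_masked = int(config.mask_token_proportion * num_selected)
    num_random = int(config.random_token_proportion * num_selected)
    num_unchanged = num_selected - num_masked - num_random
    return num_masked, num_random, num_unchanged
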
def get_config():
    """Config for fine-tuning (classification)."""
    config = base_config.get_config()

    # Determines which model to use.
    config.model_arch: str = ModelArchitecture.LINEAR.name

    config.mode: TrainingMode = TrainingMode.CLASSIFICATION

    # Available fine-tuning tasks are "glue/DS_g", "super_glue/DS_sg",
    # where "DS_g" is one of the following:
    # [cola, sst2, mrpc, qqp, stsb, mnli, qnli, rte],
    # and "DS_sg" is one of the following:
    # [boolq, cb, copa, multirc, record, rte, wic].
    config.dataset_name: str = "glue/rte"

    # How often to save the model checkpoint.
    config.save_checkpoints_steps: int = 1000
    # Training metrics will be computed (1 / eval_proportion) times during
    # training at regularly spaced intervals, regardless of dataset size.
    config.eval_proportion: float = 0.1

    # Total batch size for training.
    config.train_batch_size: int = 64
    # Total batch size for eval (and predictions).
    config.eval_batch_size: int = 64

    # The base learning rate for Adam.
    config.learning_rate: float = 1e-5

    # Total number of training epochs to perform.
    config.num_train_epochs: int = 5
    # Proportion of training to perform linear learning rate warmup for.
    # E.g., 0.1 = 10% of training steps.
    config.warmup_proportion: float = 0.1

    # Maximum number of eval steps on the validation split. The actual number
    # of steps may be fewer for small eval datasets.
    config.max_num_eval_steps: int = int(1e5)

    # For fine-tuning Mixture of Experts models, it is often beneficial to have a
    # larger dropout rate for the individual experts.
    config.expert_dropout_rate: float = 0.2

    # Initial checkpoint directory or filepath (usually from a pre-trained model).
    config.init_checkpoint_dir: str = ""

    # Dummy attribute for repeated runs.
    config.trial: int = 0

    return config
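
# Illustration (assumption, not library code) of how the proportion-based
# settings above translate into step counts for a hypothetical training-split
# size: eval_proportion yields the interval between metric computations, and
# warmup_proportion the number of linear warm-up steps.
def _derived_step_counts(config, num_train_examples):
    steps_per_epoch = num_train_examples // config.train_batch_size
    num_train_steps = steps_per_epoch * config.num_train_epochs
    eval_frequency = max(1, int(num_train_steps * config.eval_proportion))
    num_warmup_steps = int(num_train_steps * config.warmup_proportion)
    return num_train_steps, eval_frequency, num_warmup_steps
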
def frozen_config(sharded_params=False):
    """Creates a dummy model config that can be used by all tests."""
    config = default_config.get_config()
    config.model_arch = default_config.ModelArchitecture.LINEAR.name
    config.num_attention_layers = 0
    config.d_emb = 4
    config.d_model = 4
    config.d_ff = 4
    config.max_seq_length = 8
    config.num_layers = 1
    config.vocab_size = 16
    config.train_batch_size = 4
    config.dtype = jnp.float32
    config.pad_id = 3
    # MoE layers contain sharded parameters.
    config.num_moe_layers = 1 if sharded_params else 0
    config.num_experts = 1 if sharded_params else 0
    config.auxiliary_loss_factor = 0.01
    config.router_z_loss_factor = 0.01
    return ml_collections.FrozenConfigDict(config)
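
# Usage sketch (assumption): FrozenConfigDict rejects attribute assignment, so
# tests that need to override a field can make a mutable ConfigDict copy and
# re-freeze it.
def _frozen_config_with_overrides(**overrides):
    config = ml_collections.ConfigDict(frozen_config())  # mutable copy
    for key, value in overrides.items():
        setattr(config, key, value)
    return ml_collections.FrozenConfigDict(config)
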
def test_validate_correct_config(self):
    config = default_config.get_config()
    train_utils.validate_config(config)