Example #1
class Config(BaseModel.Config):
    model: str = "mmbt"
    # classification or pretraining
    training_head_type: str = "pretraining"
    bert_model_name: str = "bert-base-uncased"
    direct_features_input: bool = False
    freeze_text: bool = False
    freeze_modal: bool = False
    freeze_complete_base: bool = False
    finetune_lr_multiplier: float = 1
    # Dimension of the embedding finally returned by the modal encoder
    modal_hidden_size: int = 2048
    text_hidden_size: int = 768
    num_labels: int = 2
    # This actually is Union[ImageEncoderConfig, ImageFeatureEncoderConfig]
    modal_encoder: EncoderFactory.Config = ImageEncoderFactory.Config(
        type=ImageEncoderTypes.resnet152,
        params=ResNet152ImageEncoder.Config())
    text_encoder: EncoderFactory.Config = TextEncoderFactory.Config(
        type=TextEncoderTypes.transformer,
        params=TransformerEncoder.Config(
            bert_model_name=II("bert_model_name")),
    )
    use_modal_start_token: bool = True
    use_modal_end_token: bool = True
    fused_feature_only: bool = False
    output_dim: int = 768
Example #2
class Interpolation:
    x: int = 100
    y: int = 200
    # The resolved type of z1 is int: II() returns an interpolation string,
    # but the int annotation helps static type checkers see this truth
    z1: int = II("x")
    z2: str = SI("${x}_${y}")
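The dataclass above only declares the interpolations; they resolve once the class is wrapped in an OmegaConf config. A minimal, self-contained sketch of that resolution, assuming the usual omegaconf imports and the @dataclass decorator that the excerpt omits:

from dataclasses import dataclass
from omegaconf import II, SI, OmegaConf

@dataclass
class Interpolation:
    x: int = 100
    y: int = 200
    z1: int = II("x")          # stored as "${x}", resolves to 100 (an int)
    z2: str = SI("${x}_${y}")  # resolves to "100_200"

cfg = OmegaConf.structured(Interpolation)
assert cfg.z1 == 100
assert cfg.z2 == "100_200"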
Example #3
class FairseqBMUFConfig(FairseqDataclass):
    block_lr: float = field(default=1,
                            metadata={"help": "block learning rate for bmuf"})
    block_momentum: float = field(default=0.875,
                                  metadata={"help": "block momentum for bmuf"})
    global_sync_iter: int = field(
        default=50, metadata={"help": "Iteration for syncing global model"})
    warmup_iterations: int = field(
        default=500,
        metadata={"help": "warmup iterations for model to broadcast"})
    use_nbm: bool = field(
        default=False,
        metadata={
            "help":
            "Specify whether you want to use classical BM / Nesterov BM"
        },
    )
    average_sync: bool = field(
        default=False,
        metadata={
            "help":
            "Specify whether you want to average the local momentum after each sync"
        },
    )
    distributed_world_size: int = II(
        "distributed_training.distributed_world_size")
Example #4
class Data2VecTextConfig(FairseqDataclass):
    max_positions: int = II("task.tokens_per_sample")

    head_layers: int = 1

    transformer: TransformerConfig = TransformerConfig()

    load_checkpoint_heads: bool = field(
        default=False,
        metadata={
            "help": "(re-)register and load heads when loading checkpoints"
        },
    )

    loss_beta: float = field(
        default=0,
        metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"})
    loss_scale: Optional[float] = field(
        default=None,
        metadata={
            "help":
            "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"
        },
    )
    average_top_k_layers: int = field(
        default=8, metadata={"help": "how many layers to average"})

    layer_norm_target_layer: bool = False
    instance_norm_target_layer: bool = False
    batch_norm_target_layer: bool = False
    instance_norm_targets: bool = False
    layer_norm_targets: bool = False

    ema_decay: float = field(default=0.999,
                             metadata={"help": "initial ema decay rate"})
    ema_end_decay: float = field(default=0.9999,
                                 metadata={"help": "final ema decay rate"})

    # when to finish annealing ema decay rate
    ema_anneal_end_step: int = II("optimization.max_update")

    ema_transformer_layers_only: bool = field(
        default=True,
        metadata={
            "help": "whether to momentum update only the transformer layers"
        },
    )
Example #5
class FairseqAdamConfig(FairseqDataclass):
    adam_betas: Any = field(default=(0.9, 0.999),
                            metadata={"help": "betas for Adam optimizer"})
    adam_eps: float = field(default=1e-8,
                            metadata={"help": "epsilon for Adam optimizer"})
    weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
    use_old_adam: bool = field(
        default=False, metadata={"help": "Use fairseq.optim.adam.Adam"})
    use_fused_adam: bool = field(
        default=False,
        metadata={
            "help": "Use habana_frameworks.torch.hpex.optimizers.FusedAdamW"
        })
    # TODO common vars below in parent
    tpu: bool = II("common.tpu")
    lr: List[float] = II("optimization.lr")
    use_habana: bool = II("common.use_habana")
Example #6
class SentencePredictionConfig(FairseqDataclass):
    data: str = field(default=MISSING,
                      metadata={"help": "path to data directory"})
    num_classes: int = field(
        default=-1,
        metadata={"help": "number of classes or regression targets"},
    )
    init_token: Optional[int] = field(
        default=None,
        metadata={"help": "add token at the beginning of each batch item"},
    )
    separator_token: Optional[int] = field(
        default=None,
        metadata={"help": "add separator token between inputs"},
    )
    no_shuffle: bool = field(default=False)
    shorten_method: SHORTEN_METHOD_CHOICES = field(
        default="none",
        metadata={
            "help":
            "if not none, shorten sequences that exceed tokens_per_sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help":
            "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )
    add_prev_output_tokens: bool = field(
        default=False,
        metadata={
            "help":
            "add prev_output_tokens to sample, used for encoder-decoder arch"
        },
    )
    max_positions: int = field(
        default=512,
        metadata={"help": "max tokens per example"},
    )

    regression_target: bool = II("criterion.regression_target")
    classification_head_name: str = II("criterion.classification_head_name")
    seed: int = II("common.seed")
Example #7
class TruncatedBPTTLMConfig(FairseqDataclass):
    data: str = field(default="???",
                      metadata={"help": "path to data directory"})
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sequence"},
    )
    batch_size: int = II("dataset.batch_size")
    # Some models use *max_target_positions* to know how many positional
    # embeddings to learn. We use II(...) to make it default to
    # *tokens_per_sample*, but in principle there could be more positional
    # embeddings than tokens in a single batch. This may also be irrelevant for
    # custom model implementations.
    max_target_positions: int = II("task.tokens_per_sample")
    # these will be populated automatically if not provided
    data_parallel_rank: Optional[int] = None
    data_parallel_size: Optional[int] = None
Example #8
class ReduceLROnPlateauV2Config(FairseqDataclass):
    lr_shrink: float = field(
        default=0.1,
        metadata={
            "help": "shrink factor for annealing, lr_new = (lr * lr_shrink)"
        },
    )
    lr_threshold: float = field(
        default=1e-4,
        metadata={
            "help":
            "threshold for measuring the new optimum, to only focus on significant changes"
        },
    )
    lr_patience: int = field(
        default=0,
        metadata={
            "help":
            "number of epochs with no improvement after which learning rate will be reduced"
        },
    )
    warmup_updates: int = field(
        default=0,
        metadata={
            "help": "warmup the learning rate linearly for the first N updates"
        },
    )
    warmup_init_lr: float = field(
        default=-1,
        metadata={
            "help":
            "initial learning rate during warmup phase; default is cfg.lr"
        },
    )
    final_lr_scale: float = field(
        default=0.01,
        metadata={"help": "final learning rate scale; default to 0.01"},
    )
    start_reduce_lr_epoch: int = field(
        default=0,
        metadata={"help": "start to reduce lr from the specified epoch"},
    )
    # TODO common vars at parent class
    lr: List[float] = II("optimization.lr")
    maximize_best_checkpoint_metric: bool = II(
        "checkpoint.maximize_best_checkpoint_metric")
Example #9
class NumLanguageModelingConfig(LanguageModelingConfig):
    numlm_data_send_log_value: bool = field(
        default=False,
        metadata={
            "help":
            "prepare log value of the number tokens for number rotation embedding."
        })
    fp16: bool = II("common.fp16")
Example #10
class TuneConfig:
    metric: str = "loss"
    mode: str = "min"
    num_samples: int = 1
    name: Optional[str] = II("trainer.name")
    checkpoint_freq: int = 100
    checkpoint_at_end: bool = True
    additional_config: Dict[str, Any] = field(default_factory=lambda: {})
Example #11
class AudioPretrainingConfig(FairseqDataclass):
    data: str = field(default=MISSING, metadata={"help": "path to data directory"})
    labels: Optional[str] = field(
        default=None,
        metadata={"help": "extension of the label file to load, used for fine-tuning"},
    )
    binarized_dataset: bool = field(
        default=False,
        metadata={
            "help": "if true, loads binarized dataset (useful for very large datasets). "
            "See examples/wav2vec/scripts/binarize_manifest.sh"
        },
    )
    sample_rate: int = field(
        default=16_000,
        metadata={
            "help": "target sample rate. audio files will be up/down sampled to this rate"
        },
    )
    normalize: bool = field(
        default=False,
        metadata={"help": "if set, normalizes input to have 0 mean and unit variance"},
    )
    enable_padding: bool = field(
        default=False, metadata={"help": "pad shorter samples instead of cropping"}
    )
    max_sample_size: Optional[int] = field(
        default=None, metadata={"help": "max sample size to crop to for batching"}
    )
    min_sample_size: Optional[int] = field(
        default=None, metadata={"help": "min sample size to skip small examples"}
    )
    num_batch_buckets: int = field(
        default=0,
        metadata={"help": "number of buckets"},
    )
    precompute_mask_indices: bool = field(
        default=False,
        metadata={
            "help": "flag to compute mask indices in data preparation.",
        },
    )

    inferred_w2v_config: Optional[InferredW2vConfig] = field(
        default=None,
        metadata={
            "help": "wav2vec 2.0 masking arguments used to pre-compute masks (required for TPU)",
        },
    )

    tpu: bool = II("common.tpu")
    text_compression_level: ChoiceEnum([x.name for x in TextCompressionLevel]) = field(
        default="none",
        metadata={
            "help": "compression level for texts (e.g. audio filenames, "
            "target texts): none/low/high (default: none). "
        },
    )
Example #12
class SpeechToTextModTaskConfig(SpeechToTextTaskConfig):
    sample_ratios: str = field(
        default="1",
        metadata={"help": "sample ratios of the train subsets"}
    )

    da_p_augm: float = field(
        default=1.0,
        metadata={"help": "The probability that data augmentation is applied to an example."}
    )

    da_tempo: str = field(
        default="1,1",
        metadata={"help": "The range from which to sample the tempo factor during data augmentation"}
    )

    da_pitch: str = field(
        default="0,0",
        metadata={"help": "The range from which to sample the pitch value during data augmentation. "
                          "Measured in cents (i.e. 100ths of a semitone)"}
    )

    da_echo_delay: str = field(
        default="0,0",
        metadata={"help": "The range from which to sample the echo delay value during data augmentation. "
                          "Measured in milliseconds"}
    )

    da_echo_decay: str = field(
        default="0,0",
        metadata={"help": "The range from which to sample the echo decay factor during data augmentation."}
    )

    normalize: bool = field(
        default=True,
        metadata={"help": "Whether to normalize the audiowave to zero mean and unit variance."}
    )

    interactive_tgt_lang: Optional[str] = field(
        default=None,
        metadata={"help": "Target language to be used with Fairseq's interactive mode."}
    )

    seed: int = II("common.seed")
    max_tokens: int = II("dataset.max_tokens")
Example #13
class SpeechRecognitionEspressoConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None, metadata={"help": "path to data directory"}
    )
    dict: Optional[str] = field(default=None, metadata={"help": "path to the dictionary"})
    non_lang_syms: Optional[str] = field(
        default=None,
        metadata={
            "help": "path to a file listing non-linguistic symbols, e.g., <NOISE> "
            "etc. One entry per line. To be filtered out when calculating WER/CER"
        },
    )
    word_dict: Optional[str] = field(
        default=None,
        metadata={"help": "path to the word dictionary. Only relevant for decoding"},
    )
    wer_output_filter: Optional[str] = field(
        default=None,
        metadata={"help": "path to wer_output_filter file for WER evaluation"},
    )
    max_source_positions: Optional[int] = field(
        default=1024, metadata={"help": "max number of tokens in the source sequence"}
    )
    max_target_positions: Optional[int] = field(
        default=1024, metadata={"help": "max number of tokens in the target sequence"}
    )
    upsample_primary: int = field(
        default=1, metadata={"help": "amount to upsample primary dataset"},
    )
    num_batch_buckets: Optional[int] = field(
        default=0,
        metadata={
            "help": "if >0, then bucket source and target lengths into N "
            "buckets and pad accordingly; this is useful on TPUs "
            "to minimize the number of compilations"
        },
    )
    feat_in_channels: int = field(default=1, metadata={"help": "feature input channels"})
    specaugment_config: Optional[str] = field(
        default=None,
        metadata={
            "help": "SpecAugment config string. If not None and not empty, "
            "then apply SpecAugment. Should be an evaluatable expression of "
            "a python dict. See speech_tools.specaug_interpolate.specaug() for "
            "all allowed arguments. Argments not appearing in this string "
            "will take on their default values"
        },
    )
    global_cmvn_stats_path: Optional[str] = field(
        default=None,
        metadata={"help": "If not None, apply global cmvn using this global cmvn stats file (.npz)."},
    )
    # TODO common vars below add to parent
    seed: int = II("common.seed")
    data_buffer_size: int = II("dataset.data_buffer_size")
    tpu: bool = II("common.tpu")
    train_subset: str = II("dataset.train_subset")
    valid_subset: str = II("dataset.valid_subset")
    gen_subset: str = II("dataset.gen_subset")
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")
Example #14
class TransducerLossCriterionConfig(FairseqDataclass):
    sentence_avg: bool = II("optimization.sentence_avg")
    print_training_sample_interval: int = field(
        default=500,
        metadata={
            "help":
            "print a training sample (reference + prediction) every this number of updates"
        },
    )
Example #15
class PolynomialDecayScheduleConfig(FairseqDataclass):
    warmup_updates: int = field(
        default=0,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    force_anneal: Optional[int] = field(
        default=None,
        metadata={"help": "force annealing at specified epoch"},
    )
    end_learning_rate: float = field(
        default=0.0,
        metadata={"help": "learning rate to decay to"},
    )
    power: float = field(
        default=1.0,
        metadata={"help": "decay exponent"},
    )
    total_num_update: float = II("optimization.max_update")
    lr: List[float] = II("optimization.lr")
Example #16
class HubertSeq2SeqConfig(HubertAsrConfig):
    decoder_embed_dim: int = field(
        default=768, metadata={"help": "decoder embedding dimension"})
    decoder_ffn_embed_dim: int = field(
        default=3072, metadata={"help": "decoder embedding dimension for FFN"})
    decoder_layers: int = field(default=6,
                                metadata={"help": "num of decoder layers"})
    decoder_layerdrop: float = field(
        default=0.0, metadata={"help": "decoder layerdrop chance"})
    decoder_attention_heads: int = field(
        default=4, metadata={"help": "num decoder attention heads"})
    decoder_learned_pos: bool = field(
        default=False,
        metadata={"help": "use learned positional embeddings in the decoder"},
    )
    decoder_normalize_before: bool = field(
        default=False,
        metadata={"help": "apply layernorm before each decoder block"})
    no_token_positional_embeddings: bool = field(
        default=False,
        metadata={
            "help":
            "if set, disables positional embeddings (outside self attention)"
        },
    )
    decoder_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability in the decoder"})
    decoder_attention_dropout: float = field(
        default=0.0,
        metadata={
            "help":
            "dropout probability for attention weights inside the decoder"
        },
    )
    decoder_activation_dropout: float = field(
        default=0.0,
        metadata={
            "help":
            "dropout probability after activation in FFN inside the decoder"
        },
    )
    max_target_positions: int = field(
        default=2048, metadata={"help": "max target positions"})
    share_decoder_input_output_embed: bool = field(
        default=False,
        metadata={"help": "share decoder input and output embeddings"})
    autoregressive: bool = II("task.autoregressive")
    seq2seq_path: str = field(
        default="",
        metadata={"help": "path to a pretrained seq2seq checkpoint to load"},
    )
    reset_dict: bool = field(
        default=False,
        metadata={"help": "reset_dict"},
    )
Example #17
class CtcCriterionConfig(FairseqDataclass):
    zero_infinity: bool = field(
        default=True,
        metadata={
            "help":
            "zero inf loss when source length <= target length. Should be set for CTC NAT since we have no idea if this condition holds."
        },
    )
    sentence_avg: bool = II("optimization.sentence_avg")

    cutoff: bool = field(
        default=False,
        metadata={"help": "Apply cutoff data augmentation."},
    )
    cutoff_regularization_loss: float = field(
        default=5.0,
        metadata={"help": "Cutoff regularization coefficient."},
    )

    post_process: str = field(
        default="letter",
        metadata={
            "help":
            "how to post process predictions into words. can be letter, "
            "wordpiece, BPE symbols, etc. "
            "See fairseq.data.data_utils.post_process() for full list of options"
        },
    )
    wer_kenlm_model: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "if this is provided, use kenlm to compute wer (along with other wer_* args)"
        },
    )
    wer_lexicon: Optional[str] = field(
        default=None,
        metadata={"help": "lexicon to use with wer_kenlm_model"},
    )
    wer_lm_weight: float = field(
        default=2.0,
        metadata={"help": "lm weight to use with wer_kenlm_model"},
    )
    wer_word_score: float = field(
        default=-1.0,
        metadata={"help": "lm word score to use with wer_kenlm_model"},
    )

    wer_args: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "DEPRECATED: tuple of (wer_kenlm_model, wer_lexicon, wer_lm_weight, wer_word_score)"
        },
    )
Example #18
class VoxelCADatasetConfig(BaseDatasetConfig):
    _target_: str = "artefact_nca.dataset.voxel_dataset.VoxelDataset"
    entity_name: Optional[str] = None
    target_voxel: Optional[Any] = None
    target_color_dict: Optional[Dict[Any, Any]] = dataclasses.field(
        default_factory=lambda: None)
    target_unique_val_dict: Optional[Dict[Any, Any]] = dataclasses.field(
        default_factory=lambda: None)
    nbt_path: Optional[str] = None
    load_coord: List[int] = dataclasses.field(
        default_factory=lambda: [50, 10, 10])
    load_entity_config: Dict[Any, Any] = dataclasses.field(
        default_factory=lambda: {})
    spawn_at_bottom: bool = False
    use_random_seed_block: bool = False
    input_shape: Optional[List[int]] = None
    num_hidden_channels: Any = II("trainer.num_hidden_channels")
    half_precision: Any = II("trainer.half_precision")
    pool_size: int = 32
    padding_by_power: Optional[int] = None
Example #19
class LatticeFreeMMICriterionConfig(FairseqDataclass):
    sentence_avg: bool = II("params.optimization.sentence_avg")
    ddp_backend: DDP_BACKEND_CHOICES = II(
        "params.distributed_training.ddp_backend")
    denominator_fst_path: Optional[str] = field(
        default=None, metadata={"help": "path to the denominator fst file"})
    leaky_hmm_coefficient: float = field(
        default=1.0e-05,
        metadata={"help": "leaky-hmm coefficient for the denominator"},
    )
    xent_regularization_coefficient: float = field(
        default=0.0,
        metadata={"help": "cross-entropy regularization coefficient"},
    )
    output_l2_regularization_coefficient: float = field(
        default=0.0,
        metadata={
            "help": "L2 regularization coefficient for the network's output"
        },
    )
Example #20
class FairseqCPUAdamConfig(FairseqDataclass):
    adam_betas: str = field(default="(0.9, 0.999)",
                            metadata={"help": "betas for Adam optimizer"})
    adam_eps: float = field(default=1e-8,
                            metadata={"help": "epsilon for Adam optimizer"})
    weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
    fp16_adam_stats: bool = field(
        default=False,
        metadata={"help": "use FP16 stats (with automatic scaling)"})
    # TODO common vars below in parent
    lr: List[float] = II("optimization.lr")
Example #21
class BoolConfig:
    # with default value
    with_default: bool = True

    # default is None
    null_default: Optional[bool] = None

    # explicit no default
    mandatory_missing: bool = MISSING

    # interpolation, will inherit the type and value of `with_default`
    interpolation: bool = II("with_default")
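Assuming BoolConfig above is decorated with @dataclass, a minimal sketch of how the four kinds of defaults behave at runtime; the typed variants in the examples that follow behave the same way:

from omegaconf import OmegaConf
from omegaconf.errors import MissingMandatoryValue

cfg = OmegaConf.structured(BoolConfig)
assert cfg.with_default is True
assert cfg.null_default is None
assert cfg.interpolation is True  # II("with_default") resolves to the sibling's value
assert OmegaConf.is_missing(cfg, "mandatory_missing")
try:
    _ = cfg.mandatory_missing  # accessing a MISSING field raises
except MissingMandatoryValue:
    pass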
Example #22
class EnumConfig:
    # with default value
    with_default: Color = Color.BLUE

    # default is None
    null_default: Optional[Color] = None

    # explicit no default
    mandatory_missing: Color = MISSING

    # interpolation, will inherit the type and value of `with_default`
    interpolation: Color = II("with_default")
Example #23
class FloatConfig:
    # with default value
    with_default: float = 0.10

    # default is None
    null_default: Optional[float] = None

    # explicit no default
    mandatory_missing: float = MISSING

    # interpolation, will inherit the type and value of `with_default`
    interpolation: float = II("with_default")
Example #24
class StringConfig:
    # with default value
    with_default: str = "foo"

    # default is None
    null_default: Optional[str] = None

    # explicit no default
    mandatory_missing: str = MISSING

    # interpolation, will inherit the type and value of `with_default`
    interpolation: str = II("with_default")
Example #25
class IntegersConfig:
    # with default value
    with_default: int = 10

    # default is None
    null_default: Optional[int] = None

    # explicit no default
    mandatory_missing: int = MISSING

    # interpolation, will inherit the type and value of `with_default`
    interpolation: int = II("with_default")
Example #26
class BytesConfig:
    # with default value
    with_default: bytes = b"binary"

    # default is None
    null_default: Optional[bytes] = None

    # explicit no default
    mandatory_missing: bytes = MISSING

    # interpolation, will inherit the type and value of `with_default`
    interpolation: bytes = II("with_default")
Example #27
class PathConfig:
    # with default value
    with_default: Path = Path("hello.txt")

    # default is None
    null_default: Optional[Path] = None

    # explicit no default
    mandatory_missing: Path = MISSING

    # interpolation, will inherit the type and value of `with_default`
    interpolation: Path = II("with_default")
Example #28
class LSTMLanguageModelEspressoConfig(FairseqDataclass):
    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
    decoder_embed_dim: int = field(
        default=48, metadata={"help": "decoder embedding dimension"}
    )
    decoder_embed_path: Optional[str] = field(
        default=None, metadata={"help": "path to pre-trained decoder embedding"}
    )
    decoder_freeze_embed: bool = field(
        default=False, metadata={"help": "freeze decoder embeddings"}
    )
    decoder_hidden_size: int = field(
        default=650, metadata={"help": "decoder hidden size"}
    )
    decoder_layers: int = field(
        default=2, metadata={"help": "number of decoder layers"}
    )
    decoder_out_embed_dim: int = field(
        default=650, metadata={"help": "decoder output embedding dimension"}
    )
    decoder_rnn_residual: bool = field(
        default=False,
        metadata={
            "help": "create residual connections for rnn decoder layers "
            "(starting from the 2nd layer), i.e., the actual output of such "
            "layer is the sum of its input and output"
        },
    )
    adaptive_softmax_cutoff: Optional[str] = field(
        default=None,
        metadata={
            "help": "comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion"
        },
    )
    share_embed: bool = field(
        default=False, metadata={"help": "share input and output embeddings"}
    )
    is_wordlm: bool = field(
        default=False,
        metadata={
            "help": "whether it is word LM or subword LM. Only relevant for ASR decoding "
            "with LM, and it determines how the underlying decoder instance gets the "
            "dictionary from the task instance when calling cls.build_model()"
        },
    )
    # Granular dropout settings (if not specified these default to --dropout)
    decoder_dropout_in: Optional[float] = field(
        default=II("model.dropout"),
        metadata={"help": "dropout probability for decoder input embedding"},
    )
    decoder_dropout_out: Optional[float] = field(
        default=II("model.dropout"),
        metadata={"help": "dropout probability for decoder output"},
    )
    # options from other parts of the config
    tokens_per_sample: int = II("task.tokens_per_sample")
    max_target_positions: Optional[int] = II("task.max_target_positions")
    tpu: bool = II("common.tpu")
    criterion_name: Optional[str] = II("criterion._name")
Example #29
class InverseSquareRootScheduleConfig(FairseqDataclass):
    warmup_updates: int = field(
        default=4000,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    warmup_init_lr: float = field(
        default=-1,
        metadata={
            "help": "initial learning rate during warmup phase; default is args.lr"
        },
    )
    # TODO common vars at parent class
    lr: List[float] = II("params.optimization.lr")
Example #30
class Nested:
    # with default value
    with_default: int = 10

    # default is None
    null_default: Optional[int] = None

    # explicit no default
    mandatory_missing: int = MISSING

    # Note that since relative interpolations are not yet supported,
    # nested configs and interpolations do not play well together
    interpolation: int = II("value_at_root")
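A minimal sketch with a hypothetical RootWithNested container, assuming Nested above is a @dataclass: the absolute interpolation II("value_at_root") resolves against the root of the config tree, not against the Nested node itself.

from dataclasses import dataclass, field
from omegaconf import OmegaConf

@dataclass
class RootWithNested:  # hypothetical container providing value_at_root
    value_at_root: int = 1000
    nested: Nested = field(default_factory=Nested)

cfg = OmegaConf.structured(RootWithNested)
assert cfg.nested.with_default == 10
assert cfg.nested.interpolation == 1000  # resolved from the root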