def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use')
    parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability')
    parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights')
    parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.')
    parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension')
    parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN')
    parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers')
    parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads')
    parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block')
    parser.add_argument('--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder')
    parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers')
    parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads')
    parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder')
    parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block')
    parser.add_argument('--decoder-output-dim', type=int, metavar='N', help='decoder output dimension (extra linear layer if different from decoder embed dim)')
    parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings')
    parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings (requires shared dictionary and embed dim)')
    parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', help='if set, disables positional embeddings (outside self attention)')
    parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion')
    parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections')
    parser.add_argument('--layernorm-embedding', action='store_true', help='add layernorm to embedding')
    parser.add_argument('--no-scale-embedding', action='store_true', help='if True, dont scale embeddings')
    parser.add_argument('--checkpoint-activations', action='store_true', help='checkpoint activations at each layer, which saves GPU memory usage at the cost of some additional compute')
    parser.add_argument('--offload-activations', action='store_true', help='checkpoint activations at each layer, then save to cpu. Sets --checkpoint-activations.')
    # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
    parser.add_argument('--no-cross-attention', default=False, action='store_true', help='do not perform cross-attention')
    parser.add_argument('--cross-self-attention', default=False, action='store_true', help='perform cross+self-attention')
    # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
    parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for encoder')
    parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for decoder')
    parser.add_argument('--encoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list')
    parser.add_argument('--decoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list')
    # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
    parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0, help='iterative PQ quantization noise at training time')
    parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8, help='block size of quantization noise at training time')
    parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0, help='scalar quantization noise and scalar quantization at training time')
    # args for Fully Sharded Data Parallel (FSDP) training
    parser.add_argument(
        '--min-params-to-wrap', type=int, metavar='D', default=DEFAULT_MIN_PARAMS_TO_WRAP,
        help=(
            'minimum number of params for a layer to be wrapped with FSDP() when '
            'training with --ddp-backend=fully_sharded. Smaller values will '
            'improve memory efficiency, but may make torch.distributed '
            'communication less efficient due to smaller input sizes. This option '
            'is set to 0 (i.e., always wrap) when --checkpoint-activations or '
            '--offload-activations are passed.'
        )
    )
    # fmt: on
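The add_args hooks collected in this listing all feed a plain argparse parser; values the user leaves unset are then typically filled in by an architecture function before the model is built. The following stand-alone sketch shows that flow with a trimmed-down flag set. add_args and base_architecture here are illustrative stand-ins written for this note, not the fairseq implementations.

# Minimal sketch of the add_args / default-filling pattern (illustrative only).
import argparse


def add_args(parser):
    """Register a small subset of the model flags shown above."""
    parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension')
    parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers')


def base_architecture(args):
    """Hypothetical helper: fill in defaults for any flag the user left unset."""
    args.dropout = 0.1 if args.dropout is None else args.dropout
    args.encoder_embed_dim = 512 if args.encoder_embed_dim is None else args.encoder_embed_dim
    args.encoder_layers = 6 if args.encoder_layers is None else args.encoder_layers


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    add_args(parser)
    args = parser.parse_args(['--encoder-layers', '12'])
    base_architecture(args)
    print(args.encoder_embed_dim, args.encoder_layers)  # 512 12 (default + user override)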
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument( '--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument( '--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument( '--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument( '--no-token-positional-embeddings', default=False, action='store_true', help= 'if set, disables positional embeddings (outside self attention)') parser.add_argument( '--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. ' 'Must be used with adaptive_loss criterion'), parser.add_argument( '--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections')
def add_args(parser): """Add model-specific arguments to the parser.""" # input parser.add_argument( "--conv-kernel-sizes", type=str, metavar="N", help="kernel sizes of Conv1d subsampling layers", ) parser.add_argument( "--conv-channels", type=int, metavar="N", help="# of channels in Conv1d subsampling layers", ) # Transformer parser.add_argument( "--activation-fn", type=str, default="relu", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability" ) parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--activation-dropout", "--relu-dropout", type=float, metavar="D", help="dropout probability after activation in FFN.", ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="N", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="N", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-layers", type=int, metavar="N", help="num encoder layers" ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="N", help="num encoder attention heads", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) parser.add_argument( "--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension", ) parser.add_argument( "--decoder-ffn-embed-dim", type=int, metavar="N", help="decoder embedding dimension for FFN", ) parser.add_argument( "--decoder-layers", type=int, metavar="N", help="num decoder layers" ) parser.add_argument( "--decoder-attention-heads", type=int, metavar="N", help="num decoder attention heads", ) parser.add_argument( "--decoder-normalize-before", action="store_true", help="apply layernorm before each decoder block", ) parser.add_argument( "--share-decoder-input-output-embed", action="store_true", help="share decoder input and output embeddings", ) parser.add_argument( "--layernorm-embedding", action="store_true", help="add layernorm to embedding", ) parser.add_argument( "--no-scale-embedding", action="store_true", help="if True, dont scale embeddings", ) parser.add_argument( "--load-pretrained-encoder-from", type=str, metavar="STR", help="model to take encoder weights from (for initialization)", )
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument( "--num-segments", type=int, metavar="N", help="num segments", ) parser.add_argument( "--encoder-layers", type=int, metavar="L", help="num encoder layers", ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="H", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="F", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="A", help="num encoder attention heads", ) parser.add_argument( "--activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--pooler-activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use for pooler layer", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability", ) parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--activation-dropout", type=float, metavar="D", help="dropout probability after activation in FFN", ) parser.add_argument( "--pooler-dropout", type=float, metavar="D", help="dropout probability in the masked_lm pooler layers", ) parser.add_argument( "--max-positions", type=int, help="number of positional embeddings to learn", ) parser.add_argument( "--load-checkpoint-heads", action="store_true", help="(re-)register and load heads when loading checkpoints", ) # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument( "--encoder-layerdrop", type=float, metavar="D", default=0, help="LayerDrop probability for encoder", ) parser.add_argument( "--encoder-layers-to-keep", default=None, help= "which layers to *keep* when pruning as a comma-separated list", )
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use')
    parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability')
    parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights')
    parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.')
    parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension')
    parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN')
    parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers')
    parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads')
    parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block')
    parser.add_argument('--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder')
    parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers')
    parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads')
    parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder')
    parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block')
    parser.add_argument('--decoder-output-dim', type=int, metavar='N', help='decoder output dimension (extra linear layer if different from decoder embed dim)')
    parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings')
    parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings (requires shared dictionary and embed dim)')
    parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', help='if set, disables positional embeddings (outside self attention)')
    parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion')
    parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections')
    parser.add_argument('--layernorm-embedding', action='store_true', help='add layernorm to embedding')
    parser.add_argument('--no-scale-embedding', action='store_true', help='if True, dont scale embeddings')
    parser.add_argument('--checkpoint-activations', action='store_true', help='checkpoint activations at each layer, which saves GPU memory usage at the cost of some additional compute')
    # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
    parser.add_argument('--no-cross-attention', default=False, action='store_true', help='do not perform cross-attention')
    parser.add_argument('--cross-self-attention', default=False, action='store_true', help='perform cross+self-attention')
    # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
    parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for encoder')
    parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for decoder')
    parser.add_argument('--encoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list')
    parser.add_argument('--decoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list')
    # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
    parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0, help='iterative PQ quantization noise at training time')
    parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8, help='block size of quantization noise at training time')
    parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0, help='scalar quantization noise and scalar quantization at training time')
    # fmt: on
    parser.add_argument(
        "--pretrained-roberta-checkpoint-folder", type=str, metavar="STR",
        help="roberta model to use for initializing transformer encoder",
    )
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use')
    parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability')
    parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights')
    parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension')
    parser.add_argument('--decoder-output-dim', type=int, metavar='N', help='decoder output dimension')
    parser.add_argument('--decoder-input-dim', type=int, metavar='N', help='decoder input dimension')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers')
    parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads')
    parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block')
    parser.add_argument('--no-decoder-final-norm', action='store_true', help='don\'t add an extra layernorm after the last decoder block')
    parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion')
    parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections')
    parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N', help='adaptive softmax factor')
    parser.add_argument('--no-token-positional-embeddings', action='store_true', help='if set, disables positional embeddings (outside self attention)')
    parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings')
    parser.add_argument('--character-embeddings', action='store_true', help='if set, uses character embedding convolutions to produce token embeddings')
    parser.add_argument('--character-filters', type=str, metavar='LIST', default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]', help='size of character embeddings')
    parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N', help='size of character embeddings')
    parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N', help='number of highway layers for character token embedder')
    parser.add_argument('--adaptive-input', action='store_true', help='if set, uses adaptive input')
    parser.add_argument('--adaptive-input-factor', type=float, metavar='N', help='adaptive input factor')
    parser.add_argument('--adaptive-input-cutoff', metavar='EXPR', help='comma separated list of adaptive input cutoff points.')
    parser.add_argument('--tie-adaptive-weights', action='store_true', help='if set, ties the weights of adaptive softmax and adaptive input')
    parser.add_argument('--tie-adaptive-proj', action='store_true', help='if set, ties the projection weights of adaptive softmax and adaptive input')
    parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder')
    parser.add_argument('--layernorm-embedding', action='store_true', help='add layernorm to embedding')
    parser.add_argument('--no-scale-embedding', action='store_true', help='if True, dont scale embeddings')
    # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
    parser.add_argument('--decoder-layerdrop', type=float, metavar='D', help='LayerDrop probability for decoder')
    parser.add_argument('--decoder-layers-to-keep', help='which layers to *keep* when pruning as a comma-separated list')
    # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
    parser.add_argument('--quant-noise-pq', type=float, metavar='D', help='iterative PQ quantization noise at training time')
    parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', help='block size of quantization noise at training time')
    parser.add_argument('--quant-noise-scalar', type=float, metavar='D', help='scalar quantization noise and scalar quantization at training time')
    # fmt: on
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument('--encoder-layers', type=int, metavar='L', help='num encoder layers') parser.add_argument('--encoder-embed-dim', type=int, metavar='H', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-attention-heads', type=int, metavar='A', help='num encoder attention heads') parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--pooler-activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use for pooler layer') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--activation-dropout', type=float, metavar='D', help='dropout probability after activation in FFN') parser.add_argument( '--pooler-dropout', type=float, metavar='D', help='dropout probability in the masked_lm pooler layers') parser.add_argument('--max-positions', type=int, help='number of positional embeddings to learn') parser.add_argument( '--load-checkpoint-heads', action='store_true', help='(re-)register and load heads when loading checkpoints') # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for encoder') parser.add_argument( '--encoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list' )
def add_args(parser): """Add model-specific arguments to the parser.""" # wav2vec encoder Wav2VecEncoderWithAdaptor.add_args(parser) # add_decoder_args(parser) # mbart Transformer parser.add_argument( "--activation-fn", type=str, default="relu", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument("--mbart-dropout", type=float, metavar="D", help="dropout probability") parser.add_argument( "--mbart-attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--mbart-activation-dropout", type=float, metavar="D", help="dropout probability after activation in FFN.", ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="N", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="N", help="encoder embedding dimension for FFN", ) parser.add_argument("--encoder-layers", type=int, metavar="N", help="num encoder layers") parser.add_argument( "--encoder-attention-heads", type=int, metavar="N", help="num encoder attention heads", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) parser.add_argument( "--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension", ) parser.add_argument( "--decoder-ffn-embed-dim", type=int, metavar="N", help="decoder embedding dimension for FFN", ) parser.add_argument("--decoder-layers", type=int, metavar="N", help="num decoder layers") parser.add_argument( "--decoder-attention-heads", type=int, metavar="N", help="num decoder attention heads", ) parser.add_argument( "--decoder-normalize-before", action="store_true", help="apply layernorm before each decoder block", ) parser.add_argument( "--layernorm-embedding", action="store_true", help="add layernorm to embedding", ) parser.add_argument( "--no-scale-embedding", action="store_true", help="if True, dont scale embeddings", ) parser.add_argument( "--load-pretrained-mbart-from", type=str, metavar="STR", help= "model to take text encoder decoder weights from (for initialization)", ) # parser.add_argument("--finetune-w2v-params", type=str, metavar="STR", # help="comma-separated param strings to finetune.") parser.add_argument( "--finetune-mbart-decoder-params", type=str, metavar="STR", help="comma-separated param strings to finetune.", ) parser.add_argument( "--finetune-mbart-encoder-params", type=str, metavar="STR", help="comma-separated param strings to finetune.", ) parser.add_argument( "--skip-encoder-projection", action="store_true", help="skip the projection layer in encoder", ) parser.add_argument( "--enc-grad-mult", type=float, metavar="V", default=1.0, help="multiply enc1 and enc2 gradient by V", ) parser.add_argument( "--enc2-along-grad-mult", type=float, metavar="V", default=1.0, help="multiply enc2 gradient by V if only enc2 is used", ) parser.add_argument( "--text-input-cost-ratio", type=float, default=1.0, metavar="V", help="text input cost ratio relative to speech input cost", ) parser.add_argument( "--stack-w2v-mbart-encoder", action="store_true", help="stack w2v and mbart encoder", ) parser.add_argument( "--stack-w2v-mbart-nonorm-encoder", action="store_true", help="stack w2v and mbart encoder", ) parser.add_argument("--no-final-norm-decoder", action="store_true", help="no layer norm") parser.add_argument( "--drop-w2v-layers", type=int, default=0, metavar="N", help="drop w2v encoder layers", ) parser.add_argument( "--share-w2v-text-encoder", 
action="store_true", help="share w2v encoder layers with text encoder", ) parser.add_argument( "--shared-w2v-layers", type=int, default=0, metavar="N", help="shared encoder layers from w2v encoder", )
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument( "--input-feat-per-channel", type=int, metavar="N", help="encoder input dimension per input channel", ) parser.add_argument( "--activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability" ) parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--activation-dropout", "--relu-dropout", type=float, metavar="D", help="dropout probability after activation in FFN.", ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="N", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="N", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-layers", type=int, metavar="N", help="num encoder layers" ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="N", help="num encoder attention heads", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) parser.add_argument( "--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension", ) parser.add_argument( "--decoder-ffn-embed-dim", type=int, metavar="N", help="decoder embedding dimension for FFN", ) parser.add_argument( "--decoder-layers", type=int, metavar="N", help="num decoder layers" ) parser.add_argument( "--decoder-attention-heads", type=int, metavar="N", help="num decoder attention heads", ) parser.add_argument( "--decoder-normalize-before", action="store_true", help="apply layernorm before each decoder block", ) parser.add_argument( "--decoder-output-dim", type=int, metavar="N", help="decoder output dimension (extra linear layer if different from decoder embed dim)", ) parser.add_argument( "--share-decoder-input-output-embed", action="store_true", help="share decoder input and output embeddings", ) parser.add_argument( "--layernorm-embedding", action="store_true", help="add layernorm to embedding", ) parser.add_argument( "--no-scale-embedding", action="store_true", help="if True, dont scale embeddings", ) parser.add_argument( "--load-pretrained-encoder-from", type=str, metavar="STR", help="model to take encoder weights from (for initialization)", ) parser.add_argument( "--load-pretrained-decoder-from", type=str, metavar="STR", help="model to take decoder weights from (for initialization)", ) parser.add_argument( "--conv-out-channels", type=int, metavar="INT", help="the number of output channels of conv layer", )
def parse_args(parser): parser.add_argument("--data_dir", type=str, help="local_rank for distributed training on gpus") parser.add_argument("--save_dir", type=str, help="local_rank for distributed training on gpus") parser.add_argument("--data_file", type=str, help="local_rank for distributed training on gpus") parser.add_argument("--test_data_file", type=str, help="local_rank for distributed training on gpus") parser.add_argument("--feature_file", type=str, help="local_rank for distributed training on gpus") parser.add_argument("--test_feature_file", type=str, help="local_rank for distributed training on gpus") parser.add_argument("--world_size", type=int, default=1, help="local_rank for distributed training on gpus") parser.add_argument("--gpu_size", type=int, default=1, help="local_rank for distributed training on gpus") parser.add_argument("--valid_size", type=int, default=1, help="local_rank for distributed training on gpus") parser.add_argument("--batch_size", type=int, default=1, help="local_rank for distributed training on gpus") parser.add_argument("--log_file", type=str, help="local_rank for distributed training on gpus") parser.add_argument("--field", type=str, help="local_rank for distributed training on gpus") parser.add_argument("--model_file", type=str, help="local_rank for distributed training on gpus") parser.add_argument("--batch_t", type=int, default=1, help="local_rank for distributed training on gpus") parser.add_argument("--iteration", type=int, default=1, help="local_rank for distributed training on gpus") parser.add_argument("--epoch", type=int, default=1, help="local_rank for distributed training on gpus") parser.add_argument("--batch_one_epoch", type=int, help="local_rank for distributed training on gpus") parser.add_argument('--use_start_pos', action='store_true', help='apply layernorm before each encoder block') parser.add_argument('--from_epoch', action='store_true', help='apply layernorm before each encoder block') parser.add_argument("--all_batch_loss", type=float, help="local_rank for distributed training on gpus") # return parser.parse_args() # def parse_args_model(parser): parser.add_argument('--activation-fn', choices=fairseq_utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument('--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', 
type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--decoder-output-dim', type=int, metavar='N', help='decoder output dimension (extra linear layer ' 'if different from decoder embed dim') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', help='if set, disables positional embeddings (outside self attention)') parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. ' 'Must be used with adaptive_loss criterion'), parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') parser.add_argument('--layernorm-embedding', action='store_true', help='add layernorm to embedding') parser.add_argument('--no-scale-embedding', action='store_true', help='if True, dont scale embeddings') # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) parser.add_argument('--no-cross-attention', default=False, action='store_true', help='do not perform cross-attention') parser.add_argument('--cross-self-attention', default=False, action='store_true', help='perform cross+self-attention') # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for encoder') parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for decoder') parser.add_argument('--encoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list') parser.add_argument('--decoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list') # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0, help='iterative PQ quantization noise at training time') parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8, help='block size of quantization noise at training time') parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0, help='scalar quantization noise and scalar quantization at training time') return parser.parse_args()
class TransformerConfig(FairseqDataclass): activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( default="relu", metadata={"help": "activation function to use"}, ) dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) attention_dropout: float = field( default=0.0, metadata={"help": "dropout probability for attention weights"}) activation_dropout: float = field( default=0.0, metadata={ "help": "dropout probability after activation in FFN.", "alias": "--relu-dropout", }, ) adaptive_input: bool = False encoder: EncDecBaseConfig = EncDecBaseConfig() # TODO should really be in the encoder config max_source_positions: int = field( default=DEFAULT_MAX_SOURCE_POSITIONS, metadata={"help": "Maximum input length supported by the encoder"}, ) decoder: DecoderConfig = DecoderConfig() # TODO should really be in the decoder config max_target_positions: int = field( default=DEFAULT_MAX_TARGET_POSITIONS, metadata={"help": "Maximum output length supported by the decoder"}, ) share_decoder_input_output_embed: bool = field( default=False, metadata={"help": "share decoder input and output embeddings"}) share_all_embeddings: bool = field( default=False, metadata={ "help": "share encoder, decoder and output embeddings (requires shared dictionary and embed dim)" }, ) merge_src_tgt_embed: bool = field( default=False, metadata={ "help": "if true then the source and target embedding table is " "merged into one table. This is going to make the model smaller but " "it might hurt performance." }) no_token_positional_embeddings: bool = field( default=False, metadata={ "help": "if True, disables positional embeddings (outside self attention)" }, ) adaptive_softmax_cutoff: Optional[List[int]] = field( default=None, metadata={ "help": "list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion" }, ) adaptive_softmax_dropout: float = field( default=0.0, metadata={ "help": "sets adaptive softmax dropout for the tail projections" }, ) adaptive_softmax_factor: float = field( default=4, metadata={"help": "adaptive input factor"}) layernorm_embedding: bool = field( default=False, metadata={"help": "add layernorm to embedding"}) tie_adaptive_weights: bool = field( default=False, metadata={ "help": "if set, ties the weights of adaptive softmax and adaptive input" }, ) tie_adaptive_proj: bool = field( default=False, metadata={ "help": "if set, ties the projection weights of adaptive softmax and adaptive input" }, ) no_scale_embedding: bool = field( default=False, metadata={"help": "if True, dont scale embeddings"}) checkpoint_activations: bool = field( default=False, metadata={ "help": "checkpoint activations at each layer, which saves GPU memory usage at the cost of some additional compute" }, ) offload_activations: bool = field( default=False, metadata={ "help": "checkpoint activations at each layer, then save to gpu. Sets --checkpoint-activations." 
}, ) # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) no_cross_attention: bool = field( default=False, metadata={"help": "do not perform cross-attention"}) cross_self_attention: bool = field( default=False, metadata={"help": "perform cross+self-attention"}) # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) quant_noise: QuantNoiseConfig = field(default=QuantNoiseConfig()) min_params_to_wrap: int = field( default=DEFAULT_MIN_PARAMS_TO_WRAP, metadata={ "help": "minimum number of params for a layer to be wrapped with FSDP() when " "training with --ddp-backend=fully_sharded. Smaller values will " "improve memory efficiency, but may make torch.distributed " "communication less efficient due to smaller input sizes. This option " "is set to 0 (i.e., always wrap) when --checkpoint-activations or " "--offload-activations are passed." }, ) # DEPRECATED field, but some old checkpoints might have it char_inputs: bool = field( default=False, metadata={"help": "if set, model takes character ids as input"}) relu_dropout: float = 0.0 # config for "BASE Layers: Simplifying Training of Large, Sparse Models" base_layers: Optional[int] = field( default=0, metadata={"help": "number of BASE layers in total"}) base_sublayers: Optional[int] = field( default=1, metadata={"help": "number of sublayers in each BASE layer"}) base_shuffle: Optional[int] = field( default=1, metadata={ "help": "shuffle tokens between workers before computing assignment" }, ) export: bool = field( default=False, metadata={"help": "make the layernorm exportable with torchscript."}, ) # copied from transformer_lm but expected in transformer_decoder: no_decoder_final_norm: bool = field( default=False, metadata={ "help": "don't add an extra layernorm after the last decoder block" }, ) # We need to make this hierarchical dataclass like the flat namespace # __getattr__ and __setattr__ here allow backward compatibility # for subclasses of Transformer(Legacy) that depend on read/write on # the flat namespace. def __getattr__(self, name): match = re.match(_NAME_PARSER, name) if match: sub = safe_getattr(self, match[1]) return safe_getattr(sub, match[2]) raise AttributeError(f"invalid argument {name}.") def __setattr__(self, name, value): match = re.match(_NAME_PARSER, name) if match: sub = safe_getattr(self, match[1]) setattr(sub, match[2], value) else: super().__setattr__(name, value) @staticmethod def _copy_keys(args, cls, prefix, seen): """ copy the prefixed keys (decoder_embed_dim) to the DC fields: decoder.embed_dim """ cfg = cls() for fld in fields(cls): # for all the fields in the DC, find the fields (e.g. embed_dim) # in the namespace with the prefix (e.g. decoder) # and set it on the dc. args_key = f"{prefix}_{fld.name}" if safe_hasattr(args, args_key): seen.add(args_key) setattr(cfg, fld.name, safe_getattr(args, args_key)) if safe_hasattr(args, fld.name): seen.add(fld.name) setattr(cfg, fld.name, safe_getattr(args, fld.name)) return cfg @classmethod def from_namespace(cls, args): if args is None: return None if not isinstance(args, cls): seen = set() config = cls() # currently, we can go generically from DC fields to args hierarchically # but we can't easily deconstruct a flat namespace to a hierarchical # DC. Mostly because we could have a sub-dc called `decoder-foo` that should not # go to the sub struct called `decoder`. There are ways to go around this, but let's keep it simple # for now. 
for fld in fields(cls): # concretelly, the transformer_config know what sub-dc it has, so we go through all the dc fields # and if it's one that has a sub-dc, we build that sub-dc with `copy_keys()` if fld.name == "decoder": if safe_hasattr(args, "decoder"): # in some cases, the args we receive is already structured (as DictConfigs), so let's just build the correct DC seen.add("decoder") config.decoder = DecoderConfig(**args.decoder) else: config.decoder = cls._copy_keys( args, DecoderConfig, "decoder", seen) elif fld.name == "encoder": # same but for encoder if safe_hasattr(args, "encoder"): seen.add("encoder") config.encoder = EncDecBaseConfig(**args.encoder) else: config.encoder = cls._copy_keys( args, EncDecBaseConfig, "encoder", seen) elif fld.name == "quant_noise": # same but for quant_noise if safe_hasattr(args, "quant_noise"): seen.add("quant_noise") config.quant_noise = QuantNoiseConfig( **args.quant_noise) else: config.quant_noise = cls._copy_keys( args, QuantNoiseConfig, "quant_noise", seen) elif safe_hasattr(args, fld.name): # if it's not a structure field, it's just a normal field, copy it over seen.add(fld.name) setattr(config, fld.name, safe_getattr(args, fld.name)) # we got all the fields defined in the dataclass, but # the argparse namespace might have extra args for two reasons: # - we are in a legacy class so all the args are not declared in the dataclass. Ideally once everyone has defined a dataclass for their model, we won't need this # - some places expect args to be there but never define them args_dict = (args._asdict() if safe_hasattr(args, "_asdict") else vars(args) if safe_hasattr(args, "__dict__") else {} ) # namedtupled doesn't have __dict__ :-/ for key, value in args_dict.items(): if key not in seen: setattr(config, key, value) return config else: return args
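The __getattr__/__setattr__ bridge and the _copy_keys/from_namespace logic above exist so that legacy code reading flat names such as decoder_embed_dim keeps working against the hierarchical config. The toy classes below sketch that mapping in isolation, assuming simplified stand-in dataclasses (MiniConfig, EncDecBase) and a reduced _NAME_PARSER; they are illustrative, not the fairseq definitions.

# Illustrative sketch: routing a flat key like "decoder_embed_dim" to decoder.embed_dim.
import re
from argparse import Namespace
from dataclasses import dataclass, field, fields

_NAME_PARSER = r"(decoder|encoder)_(.*)"


@dataclass
class EncDecBase:
    embed_dim: int = 512
    layers: int = 6


@dataclass
class MiniConfig:
    dropout: float = 0.1
    encoder: EncDecBase = field(default_factory=EncDecBase)
    decoder: EncDecBase = field(default_factory=EncDecBase)

    def __getattr__(self, name):
        # Only reached when normal lookup fails, i.e. for flat legacy names.
        m = re.match(_NAME_PARSER, name)
        if m:
            return getattr(getattr(self, m.group(1)), m.group(2))
        raise AttributeError(name)

    @classmethod
    def from_namespace(cls, args):
        # Copy prefixed flat keys (encoder_layers, decoder_embed_dim, ...) into the sub-configs.
        cfg = cls()
        for fld in fields(EncDecBase):
            for prefix in ("encoder", "decoder"):
                key = f"{prefix}_{fld.name}"
                if hasattr(args, key):
                    setattr(getattr(cfg, prefix), fld.name, getattr(args, key))
        if hasattr(args, "dropout"):
            cfg.dropout = args.dropout
        return cfg


args = Namespace(decoder_embed_dim=1024, encoder_layers=12, dropout=0.3)
cfg = MiniConfig.from_namespace(args)
print(cfg.decoder.embed_dim)   # 1024, hierarchical access
print(cfg.decoder_embed_dim)   # 1024, flat legacy access via __getattr__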
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use')
    parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability')
    parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights')
    parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.')
    parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension')
    parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN')
    parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers')
    parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads')
    parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block')
    parser.add_argument('--decoder-final-norm', default=False, action='store_true', help='add an extra layernorm after the last decoder block')
    parser.add_argument('--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder')
    parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers')
    parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads')
    parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder')
    parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block')
    parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings')
    parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings (requires shared dictionary and embed dim)')
    parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', help='if set, disables positional embeddings (outside self attention)')
    parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion')
    parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections')
    parser.add_argument('--use_att', type=str, nargs='+', default=['es', 'ds', 'dc'], help='')
    parser.add_argument('--combine', type=int, default=0, help='0 as usual, 1 combine residual')
    parser.add_argument('--kernel_size', type=int, default=0, help='do not set static kernel')
    parser.add_argument('--attn_dynamic_type', type=int, default=0, help='0: no use, 1: use static kernel (k>0) or depth kernel (k==0), 2: use dynamic kernel')
    parser.add_argument('--attn_cat_relu', type=int, default=0)
    parser.add_argument('--attn_wide_kernels', type=lambda x: options.eval_str_list(x, int), help='list of kernel sizes (default: "[3,15]") for wide and gate')
    parser.add_argument('--weight-dropout', type=float, metavar='D', help='dropout probability for conv weights')
    parser.add_argument('--dynamic_gate', type=int, default=1, help='0,1')
    parser.add_argument('--dynamic_depth_kernels', type=lambda x: options.eval_str_list(x, int), help='list of kernel sizes (default: "[3,3,3,7,7,7,7,7,7,15,15,15]") for ffn or attn')
    parser.add_argument('--dynamic_padding', type=int, default=0, help='padding before dynamic conv')
    parser.add_argument('--attn_dynamic_cat', type=int, default=1)
    parser.add_argument('--bm', type=int, default=0, help='whether to use transformer_bm')
    parser.add_argument('--bm_in_a', type=float, default=3, help='sqrt(6/(1+a)), -1 for xavier')
    parser.add_argument('--bm_out_a', type=float, default=0, help='sqrt(6/(1+a)), -1 for xavier')
    parser.add_argument('--bm_fc3', type=float, default=1, help='')
    parser.add_argument('--bm_fc4', type=float, default=1, help='')
    parser.add_argument('--input_dropout', type=float, default=0, help='')
    parser.add_argument('--init_method', type=str, default='km', help='xavier, km, xi, fixup')
    parser.add_argument('--lnv', type=str, default='origin', help='origin, layernorm, adanorm')
    # fmt: on
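Several of the list-valued flags above (--attn_wide_kernels, --dynamic_depth_kernels) rely on an eval_str_list-style type callable to turn a string such as "[3,15]" into a list of ints. If fairseq's options module is not at hand, a rough stand-in looks like the sketch below; this helper is an assumption written for illustration, not fairseq's implementation.

# Illustrative stand-in for an eval_str_list-style argparse type callable.
import argparse
import ast


def eval_str_list(x, item_type=int):
    """Parse '[3,15]' or '3,15' into [3, 15]; a bare scalar becomes a one-element list."""
    if x is None:
        return None
    if isinstance(x, str):
        x = ast.literal_eval(x) if x.strip().startswith('[') else [v for v in x.split(',') if v]
    try:
        return [item_type(v) for v in x]
    except TypeError:
        return [item_type(x)]


parser = argparse.ArgumentParser()
parser.add_argument('--attn_wide_kernels', type=lambda x: eval_str_list(x, int), default=[3, 15],
                    help='list of kernel sizes for wide and gate branches')
print(parser.parse_args(['--attn_wide_kernels', '[7,31]']).attn_wide_kernels)  # [7, 31]
print(parser.parse_args([]).attn_wide_kernels)  # [3, 15] (default passes through unchanged)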
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument("--activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use") parser.add_argument("--dropout", type=float, metavar="D", help="dropout probability") parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR", help="list of encoder convolution's out channels") parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR", help="list of encoder convolution's kernel sizes") parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR", help="list of encoder convolution's strides") parser.add_argument("--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights") parser.add_argument( "--activation-dropout", "--relu-dropout", type=float, metavar="D", help="dropout probability after activation in FFN.") parser.add_argument("--encoder-ffn-embed-dim", type=int, metavar="N", help="encoder embedding dimension for FFN") parser.add_argument("--encoder-layers", type=int, metavar="N", help="num encoder layers") parser.add_argument("--encoder-attention-heads", type=int, metavar="N", help="num encoder attention heads") parser.add_argument("--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block") parser.add_argument( "--encoder-transformer-context", type=str, metavar="EXPR", help="left/right context for time-restricted self-attention; " "can be None or a tuple of two non-negative integers/None") # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument("--encoder-layerdrop", type=float, metavar="D", default=0, help="LayerDrop probability for encoder") parser.add_argument( "--encoder-layers-to-keep", default=None, help="which layers to *keep* when pruning as a comma-separated list" ) # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) parser.add_argument( "--quant-noise-pq", type=float, metavar="D", default=0, help="iterative PQ quantization noise at training time") parser.add_argument( "--quant-noise-pq-block-size", type=int, metavar="D", default=8, help="block size of quantization noise at training time") parser.add_argument( "--quant-noise-scalar", type=float, metavar="D", default=0, help= "scalar quantization noise and scalar quantization at training time" )
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument( '--activation-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument('--load-from-pretrained-model', type=str, default=None, help='Load from pretrained model')
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', default=0.1, type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', default=0., type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument( '--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-output-dim', type=int, metavar='N', help='decoder output dimension') parser.add_argument('--decoder-input-dim', type=int, metavar='N', help='decoder input dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument('--decoder-normalize-before', default=False, action='store_true', help='apply layernorm before each decoder block') parser.add_argument( '--no-decoder-final-norm', default=False, action='store_true', help='don\'t add an extra layernorm after the last decoder block') parser.add_argument( '--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. ' 'Must be used with adaptive_loss criterion') parser.add_argument( '--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N', help='adaptive input factor') parser.add_argument( '--no-token-positional-embeddings', default=False, action='store_true', help= 'if set, disables positional embeddings (outside self attention)') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument( '--character-embeddings', default=False, action='store_true', help= 'if set, uses character embedding convolutions to produce token embeddings' ) parser.add_argument( '--character-filters', type=str, metavar='LIST', default= '[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]', help='size of character embeddings') parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N', help='size of character embeddings') parser.add_argument( '--char-embedder-highway-layers', default=2, type=int, metavar='N', help='number of highway layers for character token embeddder') parser.add_argument('--adaptive-input', action='store_true', help='if set, uses adaptive input') parser.add_argument('--adaptive-input-factor', type=float, metavar='N', help='adaptive input factor') parser.add_argument( '--adaptive-input-cutoff', metavar='EXPR', help='comma separated list of adaptive input cutoff points.') parser.add_argument( '--tie-adaptive-weights', action='store_true', help= 'if set, ties the weights of adaptive softmax and adaptive input') parser.add_argument( '--tie-adaptive-proj', action='store_true', help= 'if set, ties the projection weights of adaptive softmax and adaptive input' ) parser.add_argument( '--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder')
class TransformerLanguageModelConfig(FairseqDataclass): activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( default="relu", metadata={"help": "activation function to use"}) dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) attention_dropout: float = field( default=0.0, metadata={"help": "dropout probability for attention weights"}) activation_dropout: float = field( default=0.0, metadata={"help": "dropout probability after activation in FFN."}) relu_dropout: float = field( default=0.0, metadata={"help": "dropout probability after activation in FFN."}) decoder_embed_dim: int = field( default=512, metadata={"help": "decoder embedding dimension"}) decoder_output_dim: int = field( default=512, metadata={"help": "decoder output dimension"}) decoder_input_dim: int = field( default=512, metadata={"help": "decoder input dimension"}) decoder_ffn_embed_dim: int = field( default=2048, metadata={"help": "decoder embedding dimension for FFN"}) decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"}) decoder_attention_heads: int = field( default=8, metadata={"help": "num decoder attention heads"}) decoder_normalize_before: bool = field( default=False, metadata={"help": "apply layernorm before each decoder block"}) no_decoder_final_norm: bool = field( default=False, metadata={ "help": "don't add an extra layernorm after the last decoder block" }, ) adaptive_softmax_cutoff: Optional[str] = field( default=None, metadata={ "help": "comma separated list of adaptive softmax cutoff points. " "Must be used with adaptive_loss criterion" }, ) adaptive_softmax_dropout: float = field( default=0, metadata={ "help": "sets adaptive softmax dropout for the tail projections" }, ) adaptive_softmax_factor: float = field( default=4, metadata={"help": "adaptive input factor"}) no_token_positional_embeddings: bool = field( default=False, metadata={ "help": "if set, disables positional embeddings (outside self attention)" }, ) share_decoder_input_output_embed: bool = field( default=False, metadata={"help": "share decoder input and output embeddings"}) character_embeddings: bool = field( default=False, metadata={ "help": "if set, uses character embedding convolutions to produce token embeddings" }, ) character_filters: str = field( default= "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]", metadata={"help": "size of character embeddings"}, ) character_embedding_dim: int = field( default=4, metadata={"help": "size of character embeddings"}) char_embedder_highway_layers: int = field( default=2, metadata={ "help": "number of highway layers for character token embeddder" }, ) adaptive_input: bool = field( default=False, metadata={"help": "if set, uses adaptive input"}) adaptive_input_factor: float = field( default=4, metadata={"help": "adaptive input factor"}) adaptive_input_cutoff: Optional[str] = field( default=None, metadata={ "help": "comma separated list of adaptive input cutoff points." 
}, ) tie_adaptive_weights: bool = field( default=False, metadata={ "help": "if set, ties the weights of adaptive softmax and adaptive input" }, ) tie_adaptive_proj: bool = field( default=False, metadata={ "help": "if set, ties the projection weights of adaptive softmax and adaptive input" }, ) decoder_learned_pos: bool = field( default=False, metadata={"help": "use learned positional embeddings in the decoder"}, ) layernorm_embedding: bool = field( default=False, metadata={"help": "add layernorm to embedding"}) no_scale_embedding: bool = field( default=False, metadata={"help": "if True, dont scale embeddings"}) checkpoint_activations: bool = field( default=False, metadata={"help": "checkpoint activations at each layer"}) offload_activations: bool = field( default=False, metadata={ "help": "move checkpointed activations to CPU after they are used." }, ) # config for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) decoder_layerdrop: float = field( default=0.0, metadata={"help": "LayerDrop probability for decoder"}) decoder_layers_to_keep: Optional[str] = field( default=None, metadata={ "help": "which layers to *keep* when pruning as a comma-separated list" }, ) # config for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) quant_noise_pq: float = field( default=0.0, metadata={"help": "iterative PQ quantization noise at training time"}, ) quant_noise_pq_block_size: int = field( default=8, metadata={"help": "block size of quantization noise at training time"}, ) quant_noise_scalar: float = field( default=0.0, metadata={ "help": "scalar quantization noise and scalar quantization at training time" }, ) # config for Fully Sharded Data Parallel (FSDP) training min_params_to_wrap: int = field( default=DEFAULT_MIN_PARAMS_TO_WRAP, metadata={ "help": ("minimum number of params for a layer to be wrapped with FSDP() when " "training with --ddp-backend=fully_sharded. Smaller values will " "improve memory efficiency, but may make torch.distributed " "communication less efficient due to smaller input sizes. 
This option " "is set to 0 (i.e., always wrap) when --checkpoint-activations or " "--offload-activations are passed.") }, ) # config for "BASE Layers: Simplifying Training of Large, Sparse Models" base_layers: Optional[int] = field( default=0, metadata={"help": "number of BASE layers in total"}) base_sublayers: Optional[int] = field( default=1, metadata={"help": "number of sublayers in each BASE layer"}) base_shuffle: Optional[int] = field( default=1, metadata={ "help": "shuffle tokens between workers before computing assignment" }, ) # NormFormer scale_fc: Optional[bool] = field( default=False, metadata={"help": "Insert LayerNorm between fully connected layers"}, ) scale_attn: Optional[bool] = field( default=False, metadata={"help": "Insert LayerNorm after attention"}) scale_heads: Optional[bool] = field( default=False, metadata={"help": "Learn a scale coefficient for each attention head"}, ) scale_resids: Optional[bool] = field( default=False, metadata={ "help": "Learn a scale coefficient for each residual connection" }, ) # xFormers arguments decoder_xformers_att_config: Optional[str] = field( default=None, metadata={ "help": "config for xFormers library attention, defined in xformers.components.attention.AttentionConfig", }, ) # options from other parts of the config add_bos_token: bool = II("task.add_bos_token") tokens_per_sample: int = II("task.tokens_per_sample") max_target_positions: Optional[int] = II("task.max_target_positions") tpu: bool = II("common.tpu")
def add_args(parser): """Add model-specific arguments to the parser.""" # encoder 1: S2TTransformerEncoder for speech parser.add_argument( "--conv-kernel-sizes", type=str, metavar="N", help="kernel sizes of Conv1d subsampling layers", ) parser.add_argument( "--conv-channels", type=int, metavar="N", help="# of channels in Conv1d subsampling layers", ) parser.add_argument( "--enc-output-dim", type=int, metavar="N", help=""" encoder output dimension, can be None. If specified, projecting the transformer output to the specified dimension""", ) # standard Transformer parser.add_argument( "--activation-fn", type=str, default="relu", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument("--dropout", type=float, metavar="D", help="dropout probability") parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--activation-dropout", "--relu-dropout", type=float, metavar="D", help="dropout probability after activation in FFN.", ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="N", help="encoder embedding dimension", ) parser.add_argument( "--encoder-text-embed-dim", type=int, metavar="N", help="encoder text embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="N", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="N", help="num encoder attention heads", ) parser.add_argument( "--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension", ) parser.add_argument( "--decoder-ffn-embed-dim", type=int, metavar="N", help="decoder embedding dimension for FFN", ) parser.add_argument("--decoder-layers", type=int, metavar="N", help="num decoder layers") parser.add_argument( "--decoder-attention-heads", type=int, metavar="N", help="num decoder attention heads", ) parser.add_argument( "--layernorm-embedding", action="store_true", help="add layernorm to embedding", ) parser.add_argument( "--no-scale-embedding", action="store_true", help="if True, dont scale embeddings", ) # non-standard transformer parameters parser.add_argument( "--speech-encoder-layers", type=int, metavar="N", help="num speech encoder layers", ) parser.add_argument( "--text-encoder-layers", type=int, metavar="N", help="num text encoder layers", ) parser.add_argument( "--encoder-shared-layers", type=int, metavar="N", help="num shared encoder layers", ) parser.add_argument( "--encoder-shared-layer-level", type=int, metavar="N", default=0, choices=[0, 1, 2], help= "share layer level 0: all share 1: all share with separate model 2: share weight but not bias and layernorm", ) parser.add_argument( "--decoder-shared-layer-level", default=0, choices=[0, 1, 2], type=int, metavar="N", help= "0: share everything; 1: share everything with different model 2: no share layer_norm and bias", ) ### parser.add_argument( "--text-input-cost-ratio", type=float, default=1.0, metavar="V", help="text input cost ratio relative to speech input cost", ) parser.add_argument( "--init-scale", type=float, default=1.0, metavar="V", help="scale the initial weight by given factor", ) parser.add_argument( "--enc-grad-mult", type=float, metavar="V", default=1.0, help="multiply enc1 and enc2 gradient by V", ) parser.add_argument( "--enc2-along-grad-mult", type=float, metavar="V", default=1.0, help="multiply enc2 gradient by V if only enc2 is used", ) parser.add_argument( "--load-pretrain-encoder", type=str, 
default="", metavar="EXPR", help=""" path to the pretrained encoder """, ) parser.add_argument( "--load-pretrain-speech-encoder", type=str, default="", metavar="EXPR", help=""" path to the pretrained speech encoder """, ) parser.add_argument( "--load-pretrain-text-encoder", type=str, default="", metavar="EXPR", help=""" path to the pretrained text encoder """, ) parser.add_argument( "--load-pretrain-text-encoder-last", type=str, default="", metavar="EXPR", help=""" path to the pretrained text encoder """, ) parser.add_argument( "--load-pretrain-decoder", type=str, metavar="EXPR", default="", help=""" path to the pretrained encoder """, ) parser.add_argument( "--add-speech-eos", action="store_true", help="add eos token at the end of input feature", ) parser.add_argument( "--speech-encoder-adapter-type", type=str, metavar="EXPR", default="None", choices=["None", "Linear", "MLP"], help="add speech encoder adapter", )
def add_args(parser): """Add model-specific arguments to the parser.""" # Arguments related to dropout parser.add_argument("--dropout", type=float, metavar="D", help="dropout probability") parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for" " attention weights", ) parser.add_argument( "--act-dropout", type=float, metavar="D", help="dropout probability after" " activation in FFN", ) # Arguments related to hidden states and self-attention parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="N", help="encoder embedding dimension for FFN", ) parser.add_argument("--encoder-layers", type=int, metavar="N", help="num encoder layers") parser.add_argument( "--encoder-attention-heads", type=int, metavar="N", help="num encoder attention heads", ) parser.add_argument("--bias-kv", action="store_true", help="if set, adding a learnable bias kv") parser.add_argument("--zero-attn", action="store_true", help="if set, pads attn with zero") # Arguments related to input and output embeddings parser.add_argument( "--encoder-embed-dim", type=int, metavar="N", help="encoder embedding dimension", ) parser.add_argument( "--share-encoder-input-output-embed", action="store_true", help="share encoder input" " and output embeddings", ) parser.add_argument( "--encoder-learned-pos", action="store_true", help="use learned positional embeddings in the encoder", ) parser.add_argument( "--no-token-positional-embeddings", action="store_true", help="if set, disables positional embeddings" " (outside self attention)", ) parser.add_argument("--num-segment", type=int, metavar="N", help="num segment in the input") # Arguments related to sentence level prediction parser.add_argument( "--sentence-class-num", type=int, metavar="N", help="number of classes for sentence task", ) parser.add_argument( "--sent-loss", action="store_true", help="if set," " calculate sentence level predictions", ) # Arguments related to parameter initialization parser.add_argument( "--apply-bert-init", action="store_true", help="use custom param initialization for BERT", ) # misc params parser.add_argument( "--activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--pooler-activation-fn", choices=utils.get_available_activation_fns(), help="Which activation function to use for pooler layer.", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", )
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off # TODO parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument( '--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument( '--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument( '--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument( '--decoder-output-dim', type=int, metavar='N', help='decoder output dimension (extra linear layer ' 'if different from decoder embed dim') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument( '--no-token-positional-embeddings', default=False, action='store_true', help= 'if set, disables positional embeddings (outside self attention)') parser.add_argument( '--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. 
' 'Must be used with adaptive_loss criterion'), parser.add_argument( '--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') parser.add_argument('--layernorm-embedding', action='store_true', help='add layernorm to embedding') parser.add_argument('--no-scale-embedding', action='store_true', help='if True, dont scale embeddings') parser.add_argument( '--checkpoint-activations', action='store_true', help='checkpoint activations at each layer, which saves GPU ' 'memory usage at the cost of some additional compute') # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) parser.add_argument('--no-cross-attention', default=False, action='store_true', help='do not perform cross-attention') parser.add_argument('--cross-self-attention', default=False, action='store_true', help='perform cross+self-attention') # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for encoder') parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for decoder') parser.add_argument( '--encoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list' ) parser.add_argument( '--decoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list' ) # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) parser.add_argument( '--quant-noise-pq', type=float, metavar='D', default=0, help='iterative PQ quantization noise at training time') parser.add_argument( '--quant-noise-pq-block-size', type=int, metavar='D', default=8, help='block size of quantization noise at training time') parser.add_argument( '--quant-noise-scalar', type=float, metavar='D', default=0, help= 'scalar quantization noise and scalar quantization at training time' ) # for prime parser.add_argument('--use_att', type=str, nargs='+', default=[ 'es', 'ds', 'dc', ], help='') parser.add_argument('--kernel_size', type=int, default=0, help='do not set static kernel') parser.add_argument( '--attn_dynamic_type', type=int, default=0, help= '0: no use,1 use static kernel(k>0) or depth kernel(k==0) 2. use dynamic kernel ' ) parser.add_argument('--attn_cat_relu', type=int, default=0) parser.add_argument( '--attn_wide_kernels', type=lambda x: options.eval_str_list(x, int), help='list of kernel size (default: "[3,15]") for wide and gate') parser.add_argument('--weight-dropout', type=float, metavar='D', help='dropout probability for conv weights') parser.add_argument('--dynamic_gate', type=int, default=1, help='0,1') parser.add_argument( '--dynamic_depth_kernels', type=lambda x: options.eval_str_list(x, int), help= 'list of kernel size (default: "[3,3,3,7,7,7,7,7,7,15,15,15]"),for ffn or attn' ) parser.add_argument('--dynamic_padding', type=int, default=0, help='padding before dynamic conv') parser.add_argument('--attn_dynamic_cat', type=int, default=1) parser.add_argument('--input_dropout', type=float, default=0, help='') parser.add_argument('--init_method', type=str, default='km', help='xavier,km,xi,fixup') parser.add_argument('--lnv', type=str, default='origin', help='layernorm,adanorm')
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument('--encoder-layers', type=int, metavar='L', help='num encoder layers') parser.add_argument('--encoder-embed-dim', type=int, metavar='H', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-attention-heads', type=int, metavar='A', help='num encoder attention heads') parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--pooler-activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use for pooler layer') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--activation-dropout', type=float, metavar='D', help='dropout probability after activation in FFN') parser.add_argument( '--pooler-dropout', type=float, metavar='D', help='dropout probability in the masked_lm pooler layers') parser.add_argument('--max-positions', type=int, help='number of positional embeddings to learn') parser.add_argument( '--load-checkpoint-heads', action='store_true', help='(re-)register and load heads when loading checkpoints') # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for encoder') parser.add_argument( '--encoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list' ) # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) parser.add_argument( '--quant-noise-pq', type=float, metavar='D', default=0, help='iterative PQ quantization noise at training time') parser.add_argument( '--quant-noise-pq-block-size', type=int, metavar='D', default=8, help='block size of quantization noise at training time') parser.add_argument( '--quant-noise-scalar', type=float, metavar='D', default=0, help= 'scalar quantization noise and scalar quantization at training time' ) parser.add_argument( '--untie-weights-roberta', action='store_true', help='Untie weights between embeddings and classifiers in RoBERTa')
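The --quant-noise-pq flags come from "Training with Quantization Noise for Extreme Model Compression": during training, random contiguous blocks of each weight matrix are dropped so the model becomes robust to the blockwise (product) quantization applied afterwards. A simplified, hedged sketch of the idea, not fairseq's quant_noise wrapper:

import torch

def apply_pq_noise(weight: torch.Tensor, p: float, block_size: int) -> torch.Tensor:
    # zero out whole blocks of `block_size` consecutive weights with probability p,
    # rescaling the survivors so the expected magnitude is unchanged
    out_features, in_features = weight.shape
    assert in_features % block_size == 0
    n_blocks = (out_features * in_features) // block_size
    drop = torch.rand(n_blocks, device=weight.device) < p
    mask = drop.repeat_interleave(block_size).view_as(weight)
    return weight.masked_fill(mask, 0.0) / (1.0 - p)

noisy = apply_pq_noise(torch.randn(8, 16), p=0.25, block_size=8)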
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument( "--extractor-mode", choices=["default", "layer_norm"], help= "mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with --normalize)", ) parser.add_argument( "--encoder-layers", type=int, metavar="L", help="num encoder layers in the transformer", ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="H", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="F", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="A", help="num encoder attention heads", ) parser.add_argument( "--activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability for the transformer", ) parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--activation-dropout", type=float, metavar="D", help="dropout probability after activation in FFN", ) parser.add_argument( "--final-dim", type=int, metavar="D", help= "project final representations and targets to this many dimensions", ) parser.add_argument( "--layer-norm-first", action="store_true", help="apply layernorm first in the transformer", ) parser.add_argument( "--encoder-layerdrop", type=float, help="probability of dropping a tarnsformer layer", ) parser.add_argument( "--conv-feature-layers", type=str, metavar="EXPR", help= "convolutional feature extraction layers [(dim, kernel_size, stride), ...]", ) parser.add_argument("--logit-temp", type=float, help="temperature to divide logits by") parser.add_argument("--quantize-targets", action="store_true", help="use quantized targets") parser.add_argument("--quantize-input", action="store_true", help="use quantized inputs") parser.add_argument( "--same-quantizer", action="store_true", help="use same quantizer for inputs and targets", ) parser.add_argument( "--feature-grad-mult", type=float, help="multiply feature extractor var grads by this", ) parser.add_argument( "--latent-vars", type=int, metavar="N", help="number of latent variables V in each group of the codebook", ) parser.add_argument( "--latent-groups", type=int, metavar="N", help="number of groups G of latent variables in the codebook", ) parser.add_argument( "--latent-dim", type=int, metavar="N", help= "if set, uses this dimensionality for latent variables. 
otherwise uses final_dim / latent_groups", ) parser.add_argument("--mask-length", type=int, help="mask length") parser.add_argument("--mask-prob", type=float, help="probability of replacing a token with mask") parser.add_argument( "--mask-selection", type=str, choices=["static", "uniform", "normal", "poisson"], help="how to choose masks", ) parser.add_argument( "--mask-other", type=float, help= "secondary mask argument (used for more complex distributions), see help in compute_mask_indices", ) parser.add_argument( "--no-mask-overlap", action="store_true", help="whether to allow masks to overlap", ) parser.add_argument( "--mask-min-space", type=int, help="min space between spans (if no overlap is enabled)", ) parser.add_argument( "--mask-channel-length", type=int, help="repeat the mask indices multiple times", ) parser.add_argument( "--mask-channel-prob", type=float, help="probability of replacing a token with mask", ) parser.add_argument( "--mask-channel-selection", type=str, choices=["static", "uniform", "normal", "poisson"], help="how to choose masks", ) parser.add_argument( "--mask-channel-other", type=float, help= "secondary mask argument (used for more complex distributions), see help in compute_mask_indices", ) parser.add_argument( "--no-mask-channel-overlap", action="store_true", help="whether to allow masks to overlap", ) parser.add_argument( "--mask-channel-min-space", type=int, help="min space between spans (if no overlap is enabled)", ) parser.add_argument( "--dropout-input", type=float, metavar="D", help="dropout to apply to the input (after feat extr)", ) parser.add_argument( "--dropout-features", type=float, metavar="D", help="dropout to apply to the features (after feat extr)", ) parser.add_argument("--num-negatives", type=int, metavar="N", help="number of negative examples") parser.add_argument( "--negatives-from-everywhere", action="store_true", help="sample negatives from everywhere, not just masked states", ) parser.add_argument( "--cross-sample-negatives", type=int, metavar="N", help="num of cross sampled negatives", ) parser.add_argument( "--codebook-negatives", type=int, metavar="N", help="num of codebook sampled negatives", ) parser.add_argument( "--conv-pos", type=int, metavar="N", help="number of filters for convolutional positional embeddings", ) parser.add_argument( "--conv-pos-groups", type=int, metavar="N", help="number of groups for convolutional positional embedding", ) parser.add_argument( "--latent-temp", type=str, metavar="D", help= "temperature for latent variable sampling. can be tuple of 3 values (start, end, decay)", ) parser.add_argument("--target-glu", action="store_true", help="adds projection + glu to targets") parser.add_argument("--conv-bias", action="store_true", help="include bias in conv encoder")
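--conv-feature-layers is passed as a python-literal string (note the EXPR metavar); the model evaluates it into a list of (dim, kernel_size, stride) tuples that define the convolutional feature extractor. A hedged sketch of that parsing plus a plain Conv1d stack built from it (the helper name is illustrative; eval of the trusted config string mirrors how the option is documented):

import torch.nn as nn

def build_feature_extractor(conv_feature_layers: str, in_channels: int = 1) -> nn.Sequential:
    layers = eval(conv_feature_layers)  # e.g. [(512, 10, 5), (512, 3, 2), ...]
    blocks = []
    for dim, kernel_size, stride in layers:
        blocks.append(nn.Conv1d(in_channels, dim, kernel_size, stride=stride))
        blocks.append(nn.GELU())
        in_channels = dim
    return nn.Sequential(*blocks)

extractor = build_feature_extractor("[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2")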
from fairseq import utils from fairseq.dataclass import ChoiceEnum, FairseqDataclass from fairseq.models import ( BaseFairseqModel, register_model, ) from fairseq.models.roberta.model import RobertaClassificationHead from fairseq.modules import ( LayerNorm, TransformerSentenceEncoder, TransformerSentenceEncoderLayer, ) ACTIVATION_FN_CHOICES = ChoiceEnum(utils.get_available_activation_fns()) JOINT_CLASSIFICATION_CHOICES = ChoiceEnum(["none", "sent"]) SENTENCE_REP_CHOICES = ChoiceEnum(["head", "meanpool", "maxpool"]) def update_init_roberta_model_state(state): """ update the state_dict of a Roberta model for initializing weights of the BertRanker """ for k in list(state.keys()): if ".lm_head." in k or "version" in k: del state[k] continue # remove 'encoder/decoder.sentence_encoder.' from the key assert k.startswith("encoder.sentence_encoder.") or k.startswith(
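The function above (shown truncated) rewrites a RoBERTa checkpoint's keys so the weights can initialise the BertRanker's own sentence encoder. A toy, self-contained example of that remapping: drop lm_head/version entries and strip the sentence-encoder prefix:

state = {
    "encoder.sentence_encoder.layers.0.fc1.weight": "w0",
    "encoder.lm_head.dense.weight": "lm",
    "encoder.sentence_encoder.version": "v",
}
for k in list(state.keys()):
    if ".lm_head." in k or "version" in k:
        del state[k]
        continue
    state[k.replace("encoder.sentence_encoder.", "", 1)] = state.pop(k)

assert state == {"layers.0.fc1.weight": "w0"}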
class TransformerLanguageModelConfig(FairseqDataclass): activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( default="relu", metadata={"help": "activation function to use"} ) dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) attention_dropout: float = field( default=0.0, metadata={"help": "dropout probability for attention weights"} ) activation_dropout: float = field( default=0.0, metadata={"help": "dropout probability after activation in FFN."} ) relu_dropout: float = field( default=0.0, metadata={"help": "dropout probability after activation in FFN."} ) decoder_embed_dim: int = field( default=512, metadata={"help": "decoder embedding dimension"} ) decoder_output_dim: int = field( default=512, metadata={"help": "decoder output dimension"} ) decoder_input_dim: int = field( default=512, metadata={"help": "decoder input dimension"} ) decoder_ffn_embed_dim: int = field( default=2048, metadata={"help": "decoder embedding dimension for FFN"} ) decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"}) decoder_attention_heads: int = field( default=8, metadata={"help": "num decoder attention heads"} ) decoder_normalize_before: bool = field( default=False, metadata={"help": "apply layernorm before each decoder block"} ) no_decoder_final_norm: bool = field( default=False, metadata={"help": "don't add an extra layernorm after the last decoder block"}, ) adaptive_softmax_cutoff: Optional[str] = field( default=None, metadata={ "help": "comma separated list of adaptive softmax cutoff points. " "Must be used with adaptive_loss criterion" }, ) adaptive_softmax_dropout: float = field( default=0, metadata={"help": "sets adaptive softmax dropout for the tail projections"}, ) adaptive_softmax_factor: float = field( default=4, metadata={"help": "adaptive input factor"} ) no_token_positional_embeddings: bool = field( default=False, metadata={ "help": "if set, disables positional embeddings (outside self attention)" }, ) share_decoder_input_output_embed: bool = field( default=False, metadata={"help": "share decoder input and output embeddings"} ) character_embeddings: bool = field( default=False, metadata={ "help": "if set, uses character embedding convolutions to produce token embeddings" }, ) character_filters: str = field( default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]", metadata={"help": "size of character embeddings"}, ) character_embedding_dim: int = field( default=4, metadata={"help": "size of character embeddings"} ) char_embedder_highway_layers: int = field( default=2, metadata={"help": "number of highway layers for character token embeddder"}, ) adaptive_input: bool = field( default=False, metadata={"help": "if set, uses adaptive input"} ) adaptive_input_factor: float = field( default=4, metadata={"help": "adaptive input factor"} ) adaptive_input_cutoff: Optional[str] = field( default=None, metadata={"help": "comma separated list of adaptive input cutoff points."}, ) tie_adaptive_weights: bool = field( default=False, metadata={ "help": "if set, ties the weights of adaptive softmax and adaptive input" }, ) tie_adaptive_proj: bool = field( default=False, metadata={ "help": "if set, ties the projection weights of adaptive softmax and adaptive input" }, ) decoder_learned_pos: bool = field( default=False, metadata={"help": "use learned positional embeddings in the decoder"}, ) decoder_layerdrop: float = field( default=0.0, metadata={"help": "LayerDrop probability for decoder"} ) decoder_layers_to_keep: 
Optional[str] = field( default=None, metadata={ "help": "which layers to *keep* when pruning as a comma-separated list" }, ) layernorm_embedding: bool = field( default=False, metadata={"help": "add layernorm to embedding"} ) no_scale_embedding: bool = field( default=False, metadata={"help": "if True, dont scale embeddings"} ) checkpoint_activations: bool = field( default=False, metadata={"help": "checkpoint activations at each layer"} ) quant_noise_pq: float = field( default=0.0, metadata={"help": "iterative PQ quantization noise at training time"}, ) quant_noise_pq_block_size: int = field( default=8, metadata={"help": "block size of quantization noise at training time"}, ) # TODO common var add to parent quant_noise_scalar: float = field( default=0.0, metadata={ "help": "scalar quantization noise and scalar quantization at training time" }, ) add_bos_token: bool = II("task.add_bos_token") tokens_per_sample: int = II("task.tokens_per_sample") max_target_positions: Optional[int] = II("task.max_target_positions") tpu: bool = II("common.tpu")
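decoder_layers_to_keep is a comma-separated list of layer indices; pruning keeps only those layers (which also implies overriding decoder_layers). A hedged sketch of how such a list is typically parsed, with the exact index convention treated as an assumption:

def parse_layers_to_keep(layers_to_keep: str, num_layers: int):
    keep = sorted(int(i) for i in layers_to_keep.split(","))
    assert all(0 <= i < num_layers for i in keep), "indices must address existing layers"
    return keep

assert parse_layers_to_keep("0,2,5", num_layers=6) == [0, 2, 5]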
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument("--encoder-layers", type=int, metavar="L", help="num encoder layers") parser.add_argument( "--encoder-embed-dim", type=int, metavar="H", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="F", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="A", help="num encoder attention heads", ) parser.add_argument( "--activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--pooler-activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use for pooler layer", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) parser.add_argument("--dropout", type=float, metavar="D", help="dropout probability") parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--activation-dropout", type=float, metavar="D", help="dropout probability after activation in FFN", ) parser.add_argument( "--pooler-dropout", type=float, metavar="D", help="dropout probability in the masked_lm pooler layers", ) parser.add_argument("--max-positions", type=int, help="number of positional embeddings to learn") parser.add_argument( "--load-checkpoint-heads", action="store_true", help="(re-)register and load heads when loading checkpoints", ) # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument( "--encoder-layerdrop", type=float, metavar="D", default=0, help="LayerDrop probability for encoder", ) parser.add_argument( "--encoder-layers-to-keep", default=None, help= "which layers to *keep* when pruning as a comma-separated list", ) # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) parser.add_argument( "--quant-noise-pq", type=float, metavar="D", default=0, help="iterative PQ quantization noise at training time", ) parser.add_argument( "--quant-noise-pq-block-size", type=int, metavar="D", default=8, help="block size of quantization noise at training time", ) parser.add_argument( "--quant-noise-scalar", type=float, metavar="D", default=0, help= "scalar quantization noise and scalar quantization at training time", ) parser.add_argument( "--untie-weights-roberta", action="store_true", help="Untie weights between embeddings and classifiers in RoBERTa", ) parser.add_argument( "--spectral-norm-classification-head", action="store_true", default=False, help="Apply spectral normalization on the classification head", )
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument( "--encoder-layers", type=int, metavar="L", help="num encoder layers" ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="H", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="F", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="A", help="num encoder attention heads", ) parser.add_argument( "--activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--pooler-activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use for pooler layer", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) parser.add_argument( "--layernorm-embedding", action="store_true", help="add layernorm to embedding", ) parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability" ) parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--activation-dropout", type=float, metavar="D", help="dropout probability after activation in FFN", ) parser.add_argument( "--pooler-dropout", type=float, metavar="D", help="dropout probability in the masked_lm pooler layers", ) parser.add_argument( "--max-positions", type=int, help="number of positional embeddings to learn" ) parser.add_argument( "--load-checkpoint-heads", action="store_true", help="(re-)register and load heads when loading checkpoints", ) parser.add_argument( "--untie-weights-roberta", action="store_true", help="Untie weights between embeddings and classifiers in RoBERTa", ) # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument( "--encoder-layerdrop", type=float, metavar="D", default=0, help="LayerDrop probability for encoder", ) parser.add_argument( "--encoder-layers-to-keep", default=None, help="which layers to *keep* when pruning as a comma-separated list", ) # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) parser.add_argument( "--quant-noise-pq", type=float, metavar="D", default=0, help="iterative PQ quantization noise at training time", ) parser.add_argument( "--quant-noise-pq-block-size", type=int, metavar="D", default=8, help="block size of quantization noise at training time", ) parser.add_argument( "--quant-noise-scalar", type=float, metavar="D", default=0, help="scalar quantization noise and scalar quantization at training time", ) # args for "Better Fine-Tuning by Reducing Representational Collapse" (Aghajanyan et al. 2020) parser.add_argument( "--spectral-norm-classification-head", action="store_true", default=False, help="Apply spectral normalization on the classification head", ) # args for Fully Sharded Data Parallel (FSDP) training parser.add_argument( "--min-params-to-wrap", type=int, metavar="D", default=DEFAULT_MIN_PARAMS_TO_WRAP, help=( "minimum number of params for a layer to be wrapped with FSDP() when " "training with --ddp-backend=fully_sharded. Smaller values will " "improve memory efficiency, but may make torch.distributed " "communication less efficient due to smaller input sizes. This option " "is set to 0 (i.e., always wrap) when --checkpoint-activations or " "--offload-activations are passed." 
), ) # args for AdaPruning # In short, it adds regularization for the multihead attention module and feed forward neural nets # For more details, please refer to the paper https://openreview.net/forum?id=_CMSV7FTzGI parser.add_argument( "--mha-reg-scale-factor", type=float, metavar="D", default=0.0, help="scaling factor for regularization term in adaptive pruning, recommendation is 0.000375", ) parser.add_argument( "--ffn-reg-scale-factor", type=float, metavar="D", default=0.0, help="scaling factor for regularization term in adaptive pruning, recommendation is 0.000375", )
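A hedged illustration of what --mha-reg-scale-factor buys: a small sparsity penalty over per-head attention projection weights is added to the training loss so that unimportant heads shrink and can be pruned. This is a generic L1-style stand-in, not the exact regulariser from the paper or fairseq's implementation:

import torch

def mha_reg_loss(q_proj_weight: torch.Tensor, num_heads: int, scale: float) -> torch.Tensor:
    # group the projection rows by head and penalise their absolute magnitude
    head_dim = q_proj_weight.size(0) // num_heads
    per_head = q_proj_weight.view(num_heads, head_dim, -1)
    return scale * per_head.abs().sum(dim=(1, 2)).sum()

loss = mha_reg_loss(torch.randn(512, 512), num_heads=8, scale=0.000375)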
class Wav2Vec2Config(FairseqDataclass): extractor_mode: EXTRACTOR_MODE_CHOICES = field( default="default", metadata={ "help": "mode for feature extractor. default has a single group norm with d " "groups in the first conv block, whereas layer_norm has layer norms in " "every block (meant to use with normalize=True)" }, ) encoder_layers: int = field( default=12, metadata={"help": "num encoder layers in the transformer"}) encoder_embed_dim: int = field( default=768, metadata={"help": "encoder embedding dimension"}) encoder_ffn_embed_dim: int = field( default=3072, metadata={"help": "encoder embedding dimension for FFN"}) encoder_attention_heads: int = field( default=12, metadata={"help": "num encoder attention heads"}) activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( default="gelu", metadata={"help": "activation function to use"}) # dropouts dropout: float = field( default=0.1, metadata={"help": "dropout probability for the transformer"}) attention_dropout: float = field( default=0.1, metadata={"help": "dropout probability for attention weights"}) activation_dropout: float = field( default=0.0, metadata={"help": "dropout probability after activation in FFN"}) encoder_layerdrop: float = field( default=0.0, metadata={"help": "probability of dropping a tarnsformer layer"}) dropout_input: float = field( default=0.0, metadata={"help": "dropout to apply to the input (after feat extr)"}, ) dropout_features: float = field( default=0.0, metadata={ "help": "dropout to apply to the features (after feat extr)" }, ) final_dim: int = field( default=0, metadata={ "help": "project final representations and targets to this many dimensions." "set to encoder_embed_dim is <= 0" }, ) layer_norm_first: bool = field( default=False, metadata={"help": "apply layernorm first in the transformer"}) conv_feature_layers: str = field( default= "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]", metadata={ "help": "string describing convolutional feature extraction layers in form of a python list that contains " "[(dim, kernel_size, stride), ...]" }, ) conv_bias: bool = field(default=False, metadata={"help": "include bias in conv encoder"}) logit_temp: float = field( default=0.1, metadata={"help": "temperature to divide logits by"}) quantize_targets: bool = field(default=False, metadata={"help": "use quantized targets"}) quantize_input: bool = field(default=False, metadata={"help": "use quantized inputs"}) same_quantizer: bool = field( default=False, metadata={"help": "use same quantizer for inputs and targets"}) target_glu: bool = field( default=False, metadata={"help": "adds projection + glu to targets"}) feature_grad_mult: float = field( default=1.0, metadata={"help": "multiply feature extractor var grads by this"}) quantizer_depth: int = field( default=1, metadata={"help": "number of quantizer layers"}, ) quantizer_factor: int = field( default=3, metadata={ "help": "dimensionality increase for inner quantizer layers (if depth > 1)" }, ) latent_vars: int = field( default=320, metadata={ "help": "number of latent variables V in each group of the codebook" }, ) latent_groups: int = field( default=2, metadata={ "help": "number of groups G of latent variables in the codebook" }, ) latent_dim: int = field( default=0, metadata={ "help": "if > 0, uses this dimensionality for latent variables. 
" "otherwise uses final_dim / latent_groups" }, ) # masking mask_length: int = field(default=10, metadata={"help": "mask length"}) mask_prob: float = field( default=0.65, metadata={"help": "probability of replacing a token with mask"}) mask_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", metadata={"help": "how to choose mask length"}) mask_other: float = field( default=0, metadata={ "help": "secondary mask argument (used for more complex distributions), " "see help in compute_mask_indices" }, ) no_mask_overlap: bool = field( default=False, metadata={"help": "whether to allow masks to overlap"}) mask_min_space: int = field( default=1, metadata={ "help": "min space between spans (if no overlap is enabled)" }, ) # channel masking mask_channel_length: int = field( default=10, metadata={"help": "length of the mask for features (channels)"}) mask_channel_prob: float = field( default=0.0, metadata={"help": "probability of replacing a feature with 0"}) mask_channel_before: bool = False mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", metadata={"help": "how to choose mask length for channel masking"}, ) mask_channel_other: float = field( default=0, metadata={ "help": "secondary mask argument (used for more complex distributions), " "see help in compute_mask_indicesh" }, ) no_mask_channel_overlap: bool = field( default=False, metadata={"help": "whether to allow channel masks to overlap"}) mask_channel_min_space: int = field( default=1, metadata={ "help": "min space between spans (if no overlap is enabled)" }, ) # negative selection num_negatives: int = field( default=100, metadata={"help": "number of negative examples from the same sample"}, ) negatives_from_everywhere: bool = field( default=False, metadata={ "help": "sample negatives from everywhere, not just masked states" }, ) cross_sample_negatives: int = field( default=0, metadata={"help": "number of negative examples from the any sample"}) codebook_negatives: int = field( default=0, metadata={"help": "number of negative examples codebook"}) # positional embeddings conv_pos: int = field( default=128, metadata={ "help": "number of filters for convolutional positional embeddings" }, ) conv_pos_groups: int = field( default=16, metadata={ "help": "number of groups for convolutional positional embedding" }, ) latent_temp: Tuple[float, float, float] = field( default=(2, 0.5, 0.999995), metadata={ "help": "temperature for latent variable sampling. " "can be tuple of 3 values (start, end, decay)" }, ) checkpoint_activations: bool = field( default=False, metadata={ "help": "recompute activations and save memory for extra compute" }, )
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument( '--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument( '--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument( '--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument( '--no-token-positional-embeddings', default=False, action='store_true', help= 'if set, disables positional embeddings (outside self attention)') parser.add_argument( '--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. 
' 'Must be used with adaptive_loss criterion'), parser.add_argument( '--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) parser.add_argument('--no-cross-attention', default=False, action='store_true', help='do not perform cross-attention') parser.add_argument('--cross-self-attention', default=False, action='store_true', help='perform cross+self-attention') parser.add_argument( '--layer-wise-attention', default=False, action='store_true', help= 'perform layer-wise attention (cross-attention or cross+self-attention)' ) # adanorm parser.add_argument('--lnv', type=str, default='origin', help='origin, no_norm, topk, adanorm,nowb') parser.add_argument( '--sigma', type=float, default=0.005, ) parser.add_argument('--adanorm_scale', type=float, default=2.0, help='') parser.add_argument('--nowb_scale', type=float, default=1.0, help='') parser.add_argument('--mean_detach', type=int, default=0, help='') parser.add_argument('--std_detach', type=int, default=0, help='') parser.add_argument('--init_method', type=str, default='xavier', help='xavier,km,xi') parser.add_argument('--init_topk_rho', type=float, default=0) parser.add_argument('--big_km', type=int, default=0) parser.add_argument( '--big_km_list', type=str, nargs='+', default=['in', 'out', 'fc1', 'fc2', 'qkv', 'attn_out'], help='')
def add_args(parser): """Add model-specific arguments to the parser.""" # Arguments related to dropout parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for' ' attention weights') parser.add_argument('--act-dropout', type=float, metavar='D', help='dropout probability after' ' activation in FFN') # Arguments related to hidden states and self-attention parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--bias-kv', action='store_true', help='if set, adding a learnable bias kv') parser.add_argument('--zero-attn', action='store_true', help='if set, pads attn with zero') # Arguments related to input and output embeddings parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--share-encoder-input-output-embed', action='store_true', help='share encoder input' ' and output embeddings') parser.add_argument( '--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--no-token-positional-embeddings', action='store_true', help='if set, disables positional embeddings' ' (outside self attention)') parser.add_argument('--num-segment', type=int, metavar='N', help='num segment in the input') # Arguments related to sentence level prediction parser.add_argument('--sentence-class-num', type=int, metavar='N', help='number of classes for sentence task') parser.add_argument('--sent-loss', action='store_true', help='if set,' ' calculate sentence level predictions') # Arguments related to parameter initialization parser.add_argument('--apply-bert-init', action='store_true', help='use custom param initialization for BERT') # misc params parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument( '--pooler-activation-fn', choices=utils.get_available_activation_fns(), help='Which activation function to use for pooler layer.') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block')
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument('--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', help='if set, disables positional embeddings (outside self attention)') parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. 
' 'Must be used with adaptive_loss criterion'), parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) parser.add_argument('--no-cross-attention', default=False, action='store_true', help='do not perform cross-attention') parser.add_argument('--cross-self-attention', default=False, action='store_true', help='perform cross+self-attention') parser.add_argument('--layer-wise-attention', default=False, action='store_true', help='perform layer-wise attention (cross-attention or cross+self-attention)') # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for encoder') parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for decoder') parser.add_argument('--encoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list') parser.add_argument('--decoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list') parser.add_argument('--layernorm-embedding', action='store_true', help='add layernorm to embedding') parser.add_argument('--no-scale-embedding', action='store_true', help='if True, dont scale embeddings')
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument("--activation-fn",
                        choices=utils.get_available_activation_fns(),
                        help="activation function to use")
    parser.add_argument("--dropout", type=float, metavar="D",
                        help="dropout probability")
    parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR",
                        help="list of encoder convolution's out channels")
    parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR",
                        help="list of encoder convolution's kernel sizes")
    parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR",
                        help="list of encoder convolution's strides")
    parser.add_argument("--attention-dropout", type=float, metavar="D",
                        help="dropout probability for attention weights")
    parser.add_argument("--activation-dropout", "--relu-dropout", type=float, metavar="D",
                        help="dropout probability after activation in FFN.")
    parser.add_argument("--encoder-ffn-embed-dim", type=int, metavar="N",
                        help="encoder embedding dimension for FFN")
    parser.add_argument("--encoder-layers", type=int, metavar="N",
                        help="num encoder layers")
    parser.add_argument("--encoder-attention-heads", type=int, metavar="N",
                        help="num encoder attention heads")
    parser.add_argument("--encoder-normalize-before", action="store_true",
                        help="apply layernorm before each encoder block")
    parser.add_argument("--encoder-transformer-context", type=str, metavar="EXPR",
                        help="left/right context for time-restricted self-attention; "
                             "can be None or a tuple of two non-negative integers/None")
    parser.add_argument("--no-token-positional-embeddings", action="store_true",
                        help="if set, disables positional embeddings (outside self attention)")
    parser.add_argument("--layernorm-embedding", action="store_true",
                        help="add layernorm to embedding")
    parser.add_argument("--checkpoint-activations", action="store_true",
                        help="checkpoint activations at each layer, which saves GPU "
                             "memory usage at the cost of some additional compute")
    parser.add_argument("--offload-activations", action="store_true",
                        help="checkpoint activations at each layer, then save to gpu. "
                             "Sets --checkpoint-activations.")
    # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
    parser.add_argument("--encoder-layerdrop", type=float, metavar="D", default=0,
                        help="LayerDrop probability for encoder")
    parser.add_argument("--encoder-layers-to-keep", default=None,
                        help="which layers to *keep* when pruning as a comma-separated list")
    # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
    parser.add_argument("--quant-noise-pq", type=float, metavar="D", default=0,
                        help="iterative PQ quantization noise at training time")
    parser.add_argument("--quant-noise-pq-block-size", type=int, metavar="D", default=8,
                        help="block size of quantization noise at training time")
    parser.add_argument("--quant-noise-scalar", type=float, metavar="D", default=0,
                        help="scalar quantization noise and scalar quantization at training time")
    # args for Fully Sharded Data Parallel (FSDP) training
    parser.add_argument("--min-params-to-wrap", type=int, metavar="D",
                        default=DEFAULT_MIN_PARAMS_TO_WRAP,
                        help="minimum number of params for a layer to be wrapped with FSDP() when "
                             "training with --ddp-backend=fully_sharded. Smaller values will "
                             "improve memory efficiency, but may make torch.distributed "
                             "communication less efficient due to smaller input sizes. This option "
                             "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
                             "--offload-activations are passed.")
    # fmt: on
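# --- Parsing the EXPR-style conv options (illustrative sketch) ---
# The three convolution options above are declared as plain strings, so the
# model builder has to turn them into Python lists itself. One safe way to do
# that with the standard library is shown below; the helper name
# parse_conv_config and the example values are assumptions, not taken from
# the source.
import ast
import argparse

def parse_conv_config(expr: str):
    """Parse an EXPR string such as '[64, 64, 128]' into a list.

    ast.literal_eval only accepts Python literals, so arbitrary code in the
    option value is rejected (unlike a bare eval()).
    """
    value = ast.literal_eval(expr)
    if not isinstance(value, (list, tuple)):
        raise argparse.ArgumentTypeError(f"expected a list/tuple, got {expr!r}")
    return list(value)

# Illustrative values for the convolutional front-end of the encoder above.
channels = parse_conv_config("[64, 64, 128, 128]")
kernel_sizes = parse_conv_config("[(3, 3), (3, 3), (3, 3), (3, 3)]")
strides = parse_conv_config("[(1, 1), (2, 2), (1, 1), (2, 2)]")
assert len(channels) == len(kernel_sizes) == len(strides)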
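# --- What --min-params-to-wrap controls (illustrative sketch) ---
# The FSDP option above sets a size threshold: only layers with at least that
# many parameters are worth wrapping individually, since very small wrapped
# modules make torch.distributed communication less efficient. A minimal
# illustration of that policy (not fairseq's actual wrapping logic; the
# constant below is a placeholder, not the real DEFAULT_MIN_PARAMS_TO_WRAP):
import torch.nn as nn

ASSUMED_MIN_PARAMS_TO_WRAP = 10**8  # placeholder threshold for illustration

def should_wrap_with_fsdp(module: nn.Module,
                          min_params: int = ASSUMED_MIN_PARAMS_TO_WRAP) -> bool:
    """Wrap a layer with FSDP only if it is 'big enough' per the threshold."""
    return sum(p.numel() for p in module.parameters()) >= min_params

# Example: a small projection would not be wrapped under the default threshold.
print(should_wrap_with_fsdp(nn.Linear(512, 512)))  # False for 10**8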