Example #1
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout', type=float, metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout', type=float, metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                         help='dropout probability after activation in FFN.')
     parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-layers', type=int, metavar='N',
                         help='num encoder layers')
     parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                         help='num encoder attention heads')
     parser.add_argument('--encoder-normalize-before', action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument('--encoder-learned-pos', action='store_true',
                         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers', type=int, metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                         help='num decoder attention heads')
     parser.add_argument('--decoder-learned-pos', action='store_true',
                         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before', action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                         help='decoder output dimension (extra linear layer '
                               'if different from decoder embed dim)')
     parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings', action='store_true',
                         help='share encoder, decoder and output embeddings'
                              ' (requires shared dictionary and embed dim)')
     parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                         help='if set, disables positional embeddings (outside self attention)')
     parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                         help='comma separated list of adaptive softmax cutoff points. '
                               'Must be used with adaptive_loss criterion')
     parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                         help='sets adaptive softmax dropout for the tail projections')
     parser.add_argument('--layernorm-embedding', action='store_true',
                         help='add layernorm to embedding')
     parser.add_argument('--no-scale-embedding', action='store_true',
                          help='if True, don\'t scale embeddings')
     parser.add_argument('--checkpoint-activations', action='store_true',
                         help='checkpoint activations at each layer, which saves GPU '
                              'memory usage at the cost of some additional compute')
     parser.add_argument('--offload-activations', action='store_true',
                          help='checkpoint activations at each layer, then save to cpu. Sets --checkpoint-activations.')
     # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
     parser.add_argument('--no-cross-attention', default=False, action='store_true',
                         help='do not perform cross-attention')
     parser.add_argument('--cross-self-attention', default=False, action='store_true',
                         help='perform cross+self-attention')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                         help='LayerDrop probability for encoder')
     parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                         help='LayerDrop probability for decoder')
     parser.add_argument('--encoder-layers-to-keep', default=None,
                         help='which layers to *keep* when pruning as a comma-separated list')
     parser.add_argument('--decoder-layers-to-keep', default=None,
                         help='which layers to *keep* when pruning as a comma-separated list')
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                         help='iterative PQ quantization noise at training time')
     parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                         help='block size of quantization noise at training time')
     parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                         help='scalar quantization noise and scalar quantization at training time')
     # args for Fully Sharded Data Parallel (FSDP) training
     parser.add_argument(
         '--min-params-to-wrap', type=int, metavar='D', default=DEFAULT_MIN_PARAMS_TO_WRAP,
         help=(
             'minimum number of params for a layer to be wrapped with FSDP() when '
             'training with --ddp-backend=fully_sharded. Smaller values will '
             'improve memory efficiency, but may make torch.distributed '
             'communication less efficient due to smaller input sizes. This option '
             'is set to 0 (i.e., always wrap) when --checkpoint-activations or '
             '--offload-activations are passed.'
         )
     )
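
For orientation, the following is a minimal, self-contained sketch of how an add_args hook like the one in Example #1 is typically exercised: the flags are registered on a plain argparse.ArgumentParser, and the parsed namespace then carries the model hyperparameters. The get_available_activation_fns stub and the default values below are assumptions made for this illustration only; in fairseq the choices come from utils.get_available_activation_fns() and missing defaults are filled in later by the model's architecture function.

import argparse

def get_available_activation_fns():
    # stub standing in for fairseq's utils.get_available_activation_fns()
    return ["relu", "gelu", "tanh"]

def add_args(parser):
    """Register a small subset of the flags shown in Example #1."""
    parser.add_argument('--activation-fn',
                        choices=get_available_activation_fns(),
                        default='relu',
                        help='activation function to use')
    parser.add_argument('--dropout', type=float, metavar='D', default=0.1,
                        help='dropout probability')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', default=512,
                        help='encoder embedding dimension')

parser = argparse.ArgumentParser()
add_args(parser)
args = parser.parse_args(['--activation-fn', 'gelu', '--encoder-embed-dim', '1024'])
print(args.activation_fn, args.dropout, args.encoder_embed_dim)  # prints: gelu 0.1 1024
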
Example #2
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument(
         '--activation-dropout',
         '--relu-dropout',
         type=float,
         metavar='D',
         help='dropout probability after activation in FFN.')
     parser.add_argument('--encoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-layers',
                         type=int,
                         metavar='N',
                         help='num encoder layers')
     parser.add_argument('--encoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num encoder attention heads')
     parser.add_argument('--encoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument(
         '--encoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num decoder attention heads')
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings',
                         action='store_true',
                         help='share encoder, decoder and output embeddings'
                         ' (requires shared dictionary and embed dim)')
     parser.add_argument(
         '--no-token-positional-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, disables positional embeddings (outside self attention)')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
          'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
Example #3
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # input
     parser.add_argument(
         "--conv-kernel-sizes",
         type=str,
         metavar="N",
         help="kernel sizes of Conv1d subsampling layers",
     )
     parser.add_argument(
         "--conv-channels",
         type=int,
         metavar="N",
         help="# of channels in Conv1d subsampling layers",
     )
     # Transformer
     parser.add_argument(
         "--activation-fn",
         type=str,
         default="relu",
         choices=utils.get_available_activation_fns(),
         help="activation function to use",
     )
     parser.add_argument(
         "--dropout", type=float, metavar="D", help="dropout probability"
     )
     parser.add_argument(
         "--attention-dropout",
         type=float,
         metavar="D",
         help="dropout probability for attention weights",
     )
     parser.add_argument(
         "--activation-dropout",
         "--relu-dropout",
         type=float,
         metavar="D",
         help="dropout probability after activation in FFN.",
     )
     parser.add_argument(
         "--encoder-embed-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension",
     )
     parser.add_argument(
         "--encoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension for FFN",
     )
     parser.add_argument(
         "--encoder-layers", type=int, metavar="N", help="num encoder layers"
     )
     parser.add_argument(
         "--encoder-attention-heads",
         type=int,
         metavar="N",
         help="num encoder attention heads",
     )
     parser.add_argument(
         "--encoder-normalize-before",
         action="store_true",
         help="apply layernorm before each encoder block",
     )
     parser.add_argument(
         "--decoder-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension",
     )
     parser.add_argument(
         "--decoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension for FFN",
     )
     parser.add_argument(
         "--decoder-layers", type=int, metavar="N", help="num decoder layers"
     )
     parser.add_argument(
         "--decoder-attention-heads",
         type=int,
         metavar="N",
         help="num decoder attention heads",
     )
     parser.add_argument(
         "--decoder-normalize-before",
         action="store_true",
         help="apply layernorm before each decoder block",
     )
     parser.add_argument(
         "--share-decoder-input-output-embed",
         action="store_true",
         help="share decoder input and output embeddings",
     )
     parser.add_argument(
         "--layernorm-embedding",
         action="store_true",
         help="add layernorm to embedding",
     )
     parser.add_argument(
         "--no-scale-embedding",
         action="store_true",
         help="if True, dont scale embeddings",
     )
     parser.add_argument(
         "--load-pretrained-encoder-from",
         type=str,
         metavar="STR",
         help="model to take encoder weights from (for initialization)",
     )
Example #4
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument(
         "--num-segments",
         type=int,
         metavar="N",
         help="num segments",
     )
     parser.add_argument(
         "--encoder-layers",
         type=int,
         metavar="L",
         help="num encoder layers",
     )
     parser.add_argument(
         "--encoder-embed-dim",
         type=int,
         metavar="H",
         help="encoder embedding dimension",
     )
     parser.add_argument(
         "--encoder-ffn-embed-dim",
         type=int,
         metavar="F",
         help="encoder embedding dimension for FFN",
     )
     parser.add_argument(
         "--encoder-attention-heads",
         type=int,
         metavar="A",
         help="num encoder attention heads",
     )
     parser.add_argument(
         "--activation-fn",
         choices=utils.get_available_activation_fns(),
         help="activation function to use",
     )
     parser.add_argument(
         "--pooler-activation-fn",
         choices=utils.get_available_activation_fns(),
         help="activation function to use for pooler layer",
     )
     parser.add_argument(
         "--encoder-normalize-before",
         action="store_true",
         help="apply layernorm before each encoder block",
     )
     parser.add_argument(
         "--dropout",
         type=float,
         metavar="D",
         help="dropout probability",
     )
     parser.add_argument(
         "--attention-dropout",
         type=float,
         metavar="D",
         help="dropout probability for attention weights",
     )
     parser.add_argument(
         "--activation-dropout",
         type=float,
         metavar="D",
         help="dropout probability after activation in FFN",
     )
     parser.add_argument(
         "--pooler-dropout",
         type=float,
         metavar="D",
         help="dropout probability in the masked_lm pooler layers",
     )
     parser.add_argument(
         "--max-positions",
         type=int,
         help="number of positional embeddings to learn",
     )
     parser.add_argument(
         "--load-checkpoint-heads",
         action="store_true",
         help="(re-)register and load heads when loading checkpoints",
     )
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument(
         "--encoder-layerdrop",
         type=float,
         metavar="D",
         default=0,
         help="LayerDrop probability for encoder",
     )
     parser.add_argument(
         "--encoder-layers-to-keep",
         default=None,
         help=
         "which layers to *keep* when pruning as a comma-separated list",
     )
Example #5
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout', type=float, metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout', type=float, metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                         help='dropout probability after activation in FFN.')
     parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-layers', type=int, metavar='N',
                         help='num encoder layers')
     parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                         help='num encoder attention heads')
     parser.add_argument('--encoder-normalize-before', action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument('--encoder-learned-pos', action='store_true',
                         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers', type=int, metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                         help='num decoder attention heads')
     parser.add_argument('--decoder-learned-pos', action='store_true',
                         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before', action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                         help='decoder output dimension (extra linear layer '
                               'if different from decoder embed dim)')
     parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings', action='store_true',
                         help='share encoder, decoder and output embeddings'
                              ' (requires shared dictionary and embed dim)')
     parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                         help='if set, disables positional embeddings (outside self attention)')
     parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                         help='comma separated list of adaptive softmax cutoff points. '
                               'Must be used with adaptive_loss criterion')
     parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                         help='sets adaptive softmax dropout for the tail projections')
     parser.add_argument('--layernorm-embedding', action='store_true',
                         help='add layernorm to embedding')
     parser.add_argument('--no-scale-embedding', action='store_true',
                          help='if True, don\'t scale embeddings')
     parser.add_argument('--checkpoint-activations', action='store_true',
                         help='checkpoint activations at each layer, which saves GPU '
                              'memory usage at the cost of some additional compute')
     # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
     parser.add_argument('--no-cross-attention', default=False, action='store_true',
                         help='do not perform cross-attention')
     parser.add_argument('--cross-self-attention', default=False, action='store_true',
                         help='perform cross+self-attention')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                         help='LayerDrop probability for encoder')
     parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                         help='LayerDrop probability for decoder')
     parser.add_argument('--encoder-layers-to-keep', default=None,
                         help='which layers to *keep* when pruning as a comma-separated list')
     parser.add_argument('--decoder-layers-to-keep', default=None,
                         help='which layers to *keep* when pruning as a comma-separated list')
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                         help='iterative PQ quantization noise at training time')
     parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                         help='block size of quantization noise at training time')
     parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                         help='scalar quantization noise and scalar quantization at training time')
     # fmt: on
     parser.add_argument(
         "--pretrained-roberta-checkpoint-folder",
         type=str,
         metavar="STR",
         help="roberta model to use for initializing transformer encoder",
     )
Example #6
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument(
         '--activation-dropout',
         '--relu-dropout',
         type=float,
         metavar='D',
         help='dropout probability after activation in FFN.')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-output-dim',
                         type=int,
                         metavar='N',
                         help='decoder output dimension')
     parser.add_argument('--decoder-input-dim',
                         type=int,
                         metavar='N',
                         help='decoder input dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num decoder attention heads')
     parser.add_argument('--decoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument(
         '--no-decoder-final-norm',
         action='store_true',
         help='don\'t add an extra layernorm after the last decoder block')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
         'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
     parser.add_argument('--adaptive-softmax-factor',
                         type=float,
                         metavar='N',
                         help='adaptive input factor')
     parser.add_argument(
         '--no-token-positional-embeddings',
         action='store_true',
         help=
         'if set, disables positional embeddings (outside self attention)')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument(
         '--character-embeddings',
         action='store_true',
         help=
         'if set, uses character embedding convolutions to produce token embeddings'
     )
     parser.add_argument(
         '--character-filters',
         type=str,
         metavar='LIST',
         default=
         '[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
         help='size of character embeddings')
     parser.add_argument('--character-embedding-dim',
                         default=4,
                         type=int,
                         metavar='N',
                         help='size of character embeddings')
     parser.add_argument(
         '--char-embedder-highway-layers',
         default=2,
         type=int,
         metavar='N',
          help='number of highway layers for character token embedder')
     parser.add_argument('--adaptive-input',
                         action='store_true',
                         help='if set, uses adaptive input')
     parser.add_argument('--adaptive-input-factor',
                         type=float,
                         metavar='N',
                         help='adaptive input factor')
     parser.add_argument(
         '--adaptive-input-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive input cutoff points.')
     parser.add_argument(
         '--tie-adaptive-weights',
         action='store_true',
         help=
         'if set, ties the weights of adaptive softmax and adaptive input')
     parser.add_argument(
         '--tie-adaptive-proj',
         action='store_true',
         help=
         'if set, ties the projection weights of adaptive softmax and adaptive input'
     )
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     parser.add_argument('--layernorm-embedding',
                         action='store_true',
                         help='add layernorm to embedding')
     parser.add_argument('--no-scale-embedding',
                         action='store_true',
                          help='if True, don\'t scale embeddings')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--decoder-layerdrop',
                         type=float,
                         metavar='D',
                         help='LayerDrop probability for decoder')
     parser.add_argument(
         '--decoder-layers-to-keep',
         help='which layers to *keep* when pruning as a comma-separated list'
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument(
         '--quant-noise-pq',
         type=float,
         metavar='D',
         help='iterative PQ quantization noise at training time')
     parser.add_argument(
         '--quant-noise-pq-block-size',
         type=int,
         metavar='D',
         help='block size of quantization noise at training time')
     parser.add_argument(
         '--quant-noise-scalar',
         type=float,
         metavar='D',
         help=
         'scalar quantization noise and scalar quantization at training time'
     )
Example #7
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument('--encoder-layers',
                         type=int,
                         metavar='L',
                         help='num encoder layers')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='H',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim',
                         type=int,
                         metavar='F',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-attention-heads',
                         type=int,
                         metavar='A',
                         help='num encoder attention heads')
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--pooler-activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use for pooler layer')
     parser.add_argument('--encoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--activation-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability after activation in FFN')
     parser.add_argument(
         '--pooler-dropout',
         type=float,
         metavar='D',
         help='dropout probability in the masked_lm pooler layers')
     parser.add_argument('--max-positions',
                         type=int,
                         help='number of positional embeddings to learn')
     parser.add_argument(
         '--load-checkpoint-heads',
         action='store_true',
         help='(re-)register and load heads when loading checkpoints')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--encoder-layerdrop',
                         type=float,
                         metavar='D',
                         default=0,
                         help='LayerDrop probability for encoder')
     parser.add_argument(
         '--encoder-layers-to-keep',
         default=None,
         help='which layers to *keep* when pruning as a comma-separated list'
     )
Example #8
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # wav2vec encoder
        Wav2VecEncoderWithAdaptor.add_args(parser)
        # add_decoder_args(parser)
        # mbart Transformer
        parser.add_argument(
            "--activation-fn",
            type=str,
            default="relu",
            choices=utils.get_available_activation_fns(),
            help="activation function to use",
        )

        parser.add_argument("--mbart-dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument(
            "--mbart-attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--mbart-activation-dropout",
            type=float,
            metavar="D",
            help="dropout probability after activation in FFN.",
        )

        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument("--encoder-layers",
                            type=int,
                            metavar="N",
                            help="num encoder layers")
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            action="store_true",
            help="apply layernorm before each encoder block",
        )

        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="num decoder layers")
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--layernorm-embedding",
            action="store_true",
            help="add layernorm to embedding",
        )
        parser.add_argument(
            "--no-scale-embedding",
            action="store_true",
            help="if True, dont scale embeddings",
        )
        parser.add_argument(
            "--load-pretrained-mbart-from",
            type=str,
            metavar="STR",
            help=
            "model to take text encoder decoder weights from (for initialization)",
        )
        # parser.add_argument("--finetune-w2v-params", type=str, metavar="STR",
        #                    help="comma-separated param strings to finetune.")
        parser.add_argument(
            "--finetune-mbart-decoder-params",
            type=str,
            metavar="STR",
            help="comma-separated param strings to finetune.",
        )
        parser.add_argument(
            "--finetune-mbart-encoder-params",
            type=str,
            metavar="STR",
            help="comma-separated param strings to finetune.",
        )
        parser.add_argument(
            "--skip-encoder-projection",
            action="store_true",
            help="skip the projection layer in encoder",
        )

        parser.add_argument(
            "--enc-grad-mult",
            type=float,
            metavar="V",
            default=1.0,
            help="multiply enc1 and enc2 gradient by V",
        )
        parser.add_argument(
            "--enc2-along-grad-mult",
            type=float,
            metavar="V",
            default=1.0,
            help="multiply enc2 gradient by V if only enc2 is used",
        )
        parser.add_argument(
            "--text-input-cost-ratio",
            type=float,
            default=1.0,
            metavar="V",
            help="text input cost ratio relative to speech input cost",
        )
        parser.add_argument(
            "--stack-w2v-mbart-encoder",
            action="store_true",
            help="stack w2v and mbart encoder",
        )
        parser.add_argument(
            "--stack-w2v-mbart-nonorm-encoder",
            action="store_true",
            help="stack w2v and mbart encoder",
        )
        parser.add_argument("--no-final-norm-decoder",
                            action="store_true",
                            help="no layer norm")
        parser.add_argument(
            "--drop-w2v-layers",
            type=int,
            default=0,
            metavar="N",
            help="drop w2v encoder layers",
        )

        parser.add_argument(
            "--share-w2v-text-encoder",
            action="store_true",
            help="share w2v encoder layers with text encoder",
        )
        parser.add_argument(
            "--shared-w2v-layers",
            type=int,
            default=0,
            metavar="N",
            help="shared encoder layers from w2v encoder",
        )
Example #9
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument(
         "--input-feat-per-channel",
         type=int,
         metavar="N",
         help="encoder input dimension per input channel",
     )
     parser.add_argument(
         "--activation-fn",
         choices=utils.get_available_activation_fns(),
         help="activation function to use",
     )
     parser.add_argument(
         "--dropout", type=float, metavar="D", help="dropout probability"
     )
     parser.add_argument(
         "--attention-dropout",
         type=float,
         metavar="D",
         help="dropout probability for attention weights",
     )
     parser.add_argument(
         "--activation-dropout",
         "--relu-dropout",
         type=float,
         metavar="D",
         help="dropout probability after activation in FFN.",
     )
     parser.add_argument(
         "--encoder-embed-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension",
     )
     parser.add_argument(
         "--encoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension for FFN",
     )
     parser.add_argument(
         "--encoder-layers", type=int, metavar="N", help="num encoder layers"
     )
     parser.add_argument(
         "--encoder-attention-heads",
         type=int,
         metavar="N",
         help="num encoder attention heads",
     )
     parser.add_argument(
         "--encoder-normalize-before",
         action="store_true",
         help="apply layernorm before each encoder block",
     )
     parser.add_argument(
         "--decoder-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension",
     )
     parser.add_argument(
         "--decoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension for FFN",
     )
     parser.add_argument(
         "--decoder-layers", type=int, metavar="N", help="num decoder layers"
     )
     parser.add_argument(
         "--decoder-attention-heads",
         type=int,
         metavar="N",
         help="num decoder attention heads",
     )
     parser.add_argument(
         "--decoder-normalize-before",
         action="store_true",
         help="apply layernorm before each decoder block",
     )
     parser.add_argument(
         "--decoder-output-dim",
         type=int,
         metavar="N",
         help="decoder output dimension (extra linear layer if different from decoder embed dim)",
     )
     parser.add_argument(
         "--share-decoder-input-output-embed",
         action="store_true",
         help="share decoder input and output embeddings",
     )
     parser.add_argument(
         "--layernorm-embedding",
         action="store_true",
         help="add layernorm to embedding",
     )
     parser.add_argument(
         "--no-scale-embedding",
         action="store_true",
         help="if True, dont scale embeddings",
     )
     parser.add_argument(
         "--load-pretrained-encoder-from",
         type=str,
         metavar="STR",
         help="model to take encoder weights from (for initialization)",
     )
     parser.add_argument(
         "--load-pretrained-decoder-from",
         type=str,
         metavar="STR",
         help="model to take decoder weights from (for initialization)",
     )
     parser.add_argument(
         "--conv-out-channels",
         type=int,
         metavar="INT",
         help="the number of output channels of conv layer",
     )
Example #10
def parse_args(parser):
    

    parser.add_argument("--data_dir",
                    type=str,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--save_dir",
                    type=str,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--data_file",
                    type=str,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--test_data_file",
                    type=str,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--feature_file",
                    type=str,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--test_feature_file",
                    type=str,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--world_size",
                    type=int,
                    default=1,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--gpu_size",
                    type=int,
                    default=1,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--valid_size",
                    type=int,
                    default=1,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--batch_size",
                    type=int,
                    default=1,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--log_file",
                    type=str,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--field",
                    type=str,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--model_file",
                    type=str,
                    help="local_rank for distributed training on gpus")

    parser.add_argument("--batch_t",
                    type=int,
                    default=1,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--iteration",
                    type=int,
                    default=1,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--epoch",
                    type=int,
                    default=1,
                    help="local_rank for distributed training on gpus")
    parser.add_argument("--batch_one_epoch",
                    type=int,
                    help="local_rank for distributed training on gpus")
    parser.add_argument('--use_start_pos', action='store_true',
                        help='apply layernorm before each encoder block')
    parser.add_argument('--from_epoch', action='store_true',
                        help='apply layernorm before each encoder block')

    parser.add_argument("--all_batch_loss",
                    type=float,
                    help="local_rank for distributed training on gpus")


#     return parser.parse_args()

# def parse_args_model(parser):
    parser.add_argument('--activation-fn',
                            choices=fairseq_utils.get_available_activation_fns(),
                            help='activation function to use')
    parser.add_argument('--dropout', type=float, metavar='D',
                        help='dropout probability')
    parser.add_argument('--attention-dropout', type=float, metavar='D',
                        help='dropout probability for attention weights')
    parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                        help='dropout probability after activation in FFN.')
    parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                        help='path to pre-trained encoder embedding')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                        help='encoder embedding dimension')
    parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                        help='encoder embedding dimension for FFN')
    parser.add_argument('--encoder-layers', type=int, metavar='N',
                        help='num encoder layers')
    parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                        help='num encoder attention heads')
    parser.add_argument('--encoder-normalize-before', action='store_true',
                        help='apply layernorm before each encoder block')
    parser.add_argument('--encoder-learned-pos', action='store_true',
                        help='use learned positional embeddings in the encoder')
    parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                        help='path to pre-trained decoder embedding')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension for FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N',
                        help='num decoder layers')
    parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                        help='num decoder attention heads')
    parser.add_argument('--decoder-learned-pos', action='store_true',
                        help='use learned positional embeddings in the decoder')
    parser.add_argument('--decoder-normalize-before', action='store_true',
                        help='apply layernorm before each decoder block')
    parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                        help='decoder output dimension (extra linear layer '
                             'if different from decoder embed dim)')
    parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                        help='share decoder input and output embeddings')
    parser.add_argument('--share-all-embeddings', action='store_true',
                        help='share encoder, decoder and output embeddings'
                             ' (requires shared dictionary and embed dim)')
    parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                        help='if set, disables positional embeddings (outside self attention)')
    parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                        help='comma separated list of adaptive softmax cutoff points. '
                             'Must be used with adaptive_loss criterion')
    parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                        help='sets adaptive softmax dropout for the tail projections')
    parser.add_argument('--layernorm-embedding', action='store_true',
                        help='add layernorm to embedding')
    parser.add_argument('--no-scale-embedding', action='store_true',
                        help='if True, don\'t scale embeddings')
    # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
    parser.add_argument('--no-cross-attention', default=False, action='store_true',
                        help='do not perform cross-attention')
    parser.add_argument('--cross-self-attention', default=False, action='store_true',
                        help='perform cross+self-attention')
    # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
    parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                        help='LayerDrop probability for encoder')
    parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                        help='LayerDrop probability for decoder')
    parser.add_argument('--encoder-layers-to-keep', default=None,
                        help='which layers to *keep* when pruning as a comma-separated list')
    parser.add_argument('--decoder-layers-to-keep', default=None,
                        help='which layers to *keep* when pruning as a comma-separated list')
    # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
    parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                        help='iterative PQ quantization noise at training time')
    parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                        help='block size of quantization noise at training time')
    parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                        help='scalar quantization noise and scalar quantization at training time')

    return parser.parse_args()
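
Since the parse_args helper above both registers the flags and immediately returns parser.parse_args(), a caller only needs to hand it a fresh parser. Below is a minimal usage sketch, assuming parse_args and the defining module's own imports (argparse and the fairseq utils used for the activation-function choices) are in scope; the command-line values are invented for the demonstration.

import argparse
import sys

# parse_args() (defined above) reads sys.argv, so patch it for this demonstration.
sys.argv = ["train.py", "--data_dir", "./data", "--batch_size", "32",
            "--encoder-layers", "6"]
args = parse_args(argparse.ArgumentParser())
print(args.data_dir, args.batch_size, args.encoder_layers)  # ./data 32 6
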
Example #11
class TransformerConfig(FairseqDataclass):
    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
        default="relu",
        metadata={"help": "activation function to use"},
    )
    dropout: float = field(default=0.1,
                           metadata={"help": "dropout probability"})
    attention_dropout: float = field(
        default=0.0,
        metadata={"help": "dropout probability for attention weights"})
    activation_dropout: float = field(
        default=0.0,
        metadata={
            "help": "dropout probability after activation in FFN.",
            "alias": "--relu-dropout",
        },
    )
    adaptive_input: bool = False
    encoder: EncDecBaseConfig = EncDecBaseConfig()
    # TODO should really be in the encoder config
    max_source_positions: int = field(
        default=DEFAULT_MAX_SOURCE_POSITIONS,
        metadata={"help": "Maximum input length supported by the encoder"},
    )
    decoder: DecoderConfig = DecoderConfig()
    # TODO should really be in the decoder config
    max_target_positions: int = field(
        default=DEFAULT_MAX_TARGET_POSITIONS,
        metadata={"help": "Maximum output length supported by the decoder"},
    )
    share_decoder_input_output_embed: bool = field(
        default=False,
        metadata={"help": "share decoder input and output embeddings"})
    share_all_embeddings: bool = field(
        default=False,
        metadata={
            "help":
            "share encoder, decoder and output embeddings (requires shared dictionary and embed dim)"
        },
    )
    merge_src_tgt_embed: bool = field(
        default=False,
        metadata={
            "help":
            "if true then the source and target embedding table is "
            "merged into one table. This is going to make the model smaller but "
            "it might hurt performance."
        })
    no_token_positional_embeddings: bool = field(
        default=False,
        metadata={
            "help":
            "if True, disables positional embeddings (outside self attention)"
        },
    )
    adaptive_softmax_cutoff: Optional[List[int]] = field(
        default=None,
        metadata={
            "help":
            "list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion"
        },
    )
    adaptive_softmax_dropout: float = field(
        default=0.0,
        metadata={
            "help": "sets adaptive softmax dropout for the tail projections"
        },
    )
    adaptive_softmax_factor: float = field(
        default=4, metadata={"help": "adaptive input factor"})
    layernorm_embedding: bool = field(
        default=False, metadata={"help": "add layernorm to embedding"})
    tie_adaptive_weights: bool = field(
        default=False,
        metadata={
            "help":
            "if set, ties the weights of adaptive softmax and adaptive input"
        },
    )
    tie_adaptive_proj: bool = field(
        default=False,
        metadata={
            "help":
            "if set, ties the projection weights of adaptive softmax and adaptive input"
        },
    )
    no_scale_embedding: bool = field(
        default=False, metadata={"help": "if True, dont scale embeddings"})
    checkpoint_activations: bool = field(
        default=False,
        metadata={
            "help":
            "checkpoint activations at each layer, which saves GPU memory usage at the cost of some additional compute"
        },
    )
    offload_activations: bool = field(
        default=False,
        metadata={
            "help":
            "checkpoint activations at each layer, then save to gpu. Sets --checkpoint-activations."
        },
    )
    # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
    no_cross_attention: bool = field(
        default=False, metadata={"help": "do not perform cross-attention"})
    cross_self_attention: bool = field(
        default=False, metadata={"help": "perform cross+self-attention"})
    # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
    quant_noise: QuantNoiseConfig = field(default=QuantNoiseConfig())
    min_params_to_wrap: int = field(
        default=DEFAULT_MIN_PARAMS_TO_WRAP,
        metadata={
            "help":
            "minimum number of params for a layer to be wrapped with FSDP() when "
            "training with --ddp-backend=fully_sharded. Smaller values will "
            "improve memory efficiency, but may make torch.distributed "
            "communication less efficient due to smaller input sizes. This option "
            "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
            "--offload-activations are passed."
        },
    )
    # DEPRECATED field, but some old checkpoints might have it
    char_inputs: bool = field(
        default=False,
        metadata={"help": "if set, model takes character ids as input"})
    relu_dropout: float = 0.0
    # config for "BASE Layers: Simplifying Training of Large, Sparse Models"
    base_layers: Optional[int] = field(
        default=0, metadata={"help": "number of BASE layers in total"})
    base_sublayers: Optional[int] = field(
        default=1, metadata={"help": "number of sublayers in each BASE layer"})
    base_shuffle: Optional[int] = field(
        default=1,
        metadata={
            "help":
            "shuffle tokens between workers before computing assignment"
        },
    )

    export: bool = field(
        default=False,
        metadata={"help": "make the layernorm exportable with torchscript."},
    )

    # copied from transformer_lm but expected in transformer_decoder:
    no_decoder_final_norm: bool = field(
        default=False,
        metadata={
            "help": "don't add an extra layernorm after the last decoder block"
        },
    )

    # We need to make this hierarchical dataclass act like the flat namespace.
    # __getattr__ and __setattr__ here allow backward compatibility
    # for subclasses of Transformer(Legacy) that depend on read/write on
    # the flat namespace.

    def __getattr__(self, name):
        match = re.match(_NAME_PARSER, name)
        if match:
            sub = safe_getattr(self, match[1])
            return safe_getattr(sub, match[2])
        raise AttributeError(f"invalid argument {name}.")

    def __setattr__(self, name, value):
        match = re.match(_NAME_PARSER, name)
        if match:
            sub = safe_getattr(self, match[1])
            setattr(sub, match[2], value)
        else:
            super().__setattr__(name, value)

    @staticmethod
    def _copy_keys(args, cls, prefix, seen):
        """
        copy the prefixed keys (e.g. decoder_embed_dim) to the matching DC fields (e.g. decoder.embed_dim)
        """
        cfg = cls()
        for fld in fields(cls):
            # for each field in the DC, find the corresponding field (e.g. embed_dim)
            # in the namespace under the prefix (e.g. decoder)
            # and set it on the DC.
            args_key = f"{prefix}_{fld.name}"
            if safe_hasattr(args, args_key):
                seen.add(args_key)
                setattr(cfg, fld.name, safe_getattr(args, args_key))
            if safe_hasattr(args, fld.name):
                seen.add(fld.name)
                setattr(cfg, fld.name, safe_getattr(args, fld.name))
        return cfg

    @classmethod
    def from_namespace(cls, args):
        if args is None:
            return None
        if not isinstance(args, cls):
            seen = set()
            config = cls()
            # Currently, we can go generically from DC fields to args hierarchically,
            # but we can't easily deconstruct a flat namespace into a hierarchical
            # DC, mostly because we could have a sub-dc called `decoder-foo` that should not
            # go to the sub struct called `decoder`. There are ways around this, but let's keep it simple
            # for now.
            for fld in fields(cls):
                # Concretely, the transformer config knows which sub-dcs it has, so we go through all the dc fields
                # and, if a field has a sub-dc, we build that sub-dc with `_copy_keys()`
                if fld.name == "decoder":
                    if safe_hasattr(args, "decoder"):
                        # in some cases, the args we receive are already structured (as DictConfigs), so let's just build the correct DC
                        seen.add("decoder")
                        config.decoder = DecoderConfig(**args.decoder)
                    else:
                        config.decoder = cls._copy_keys(
                            args, DecoderConfig, "decoder", seen)
                elif fld.name == "encoder":
                    # same but for encoder
                    if safe_hasattr(args, "encoder"):
                        seen.add("encoder")
                        config.encoder = EncDecBaseConfig(**args.encoder)
                    else:
                        config.encoder = cls._copy_keys(
                            args, EncDecBaseConfig, "encoder", seen)
                elif fld.name == "quant_noise":
                    # same but for quant_noise
                    if safe_hasattr(args, "quant_noise"):
                        seen.add("quant_noise")
                        config.quant_noise = QuantNoiseConfig(
                            **args.quant_noise)
                    else:
                        config.quant_noise = cls._copy_keys(
                            args, QuantNoiseConfig, "quant_noise", seen)
                elif safe_hasattr(args, fld.name):
                    # if it's not a structure field, it's just a normal field, copy it over
                    seen.add(fld.name)
                    setattr(config, fld.name, safe_getattr(args, fld.name))
            # we got all the fields defined in the dataclass, but
            # the argparse namespace might have extra args for two reasons:
            #   - we are in a legacy class, so not all the args are declared in the dataclass. Ideally, once everyone has defined a dataclass for their model, we won't need this
            #   - some places expect args to be there but never define them
            args_dict = (args._asdict() if safe_hasattr(args, "_asdict") else
                         vars(args) if safe_hasattr(args, "__dict__") else {}
                         )  # namedtuple doesn't have __dict__ :-/
            for key, value in args_dict.items():
                if key not in seen:
                    setattr(config, key, value)
            return config
        else:
            return args
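
A minimal usage sketch (not part of the snippet above), assuming the surrounding fairseq definitions (TransformerConfig, EncDecBaseConfig, DecoderConfig and a _NAME_PARSER matching names like decoder_embed_dim) are importable; it shows a flat argparse namespace being lifted into the hierarchical config while flat reads and writes keep working through __getattr__/__setattr__:

from argparse import Namespace

# import path for recent fairseq versions; it may differ in older releases
from fairseq.models.transformer.transformer_config import TransformerConfig

# flat, legacy-style namespace
args = Namespace(encoder_embed_dim=512, decoder_layers=6, dropout=0.3)

cfg = TransformerConfig.from_namespace(args)
print(cfg.encoder.embed_dim)   # 512  -- hierarchical access
print(cfg.decoder_layers)      # 6    -- flat access, routed through __getattr__
cfg.decoder_embed_dim = 1024   # flat write, routed through __setattr__
print(cfg.decoder.embed_dim)   # 1024
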
Exemple #12
0
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--activation-fn',
                            choices=utils.get_available_activation_fns(),
                            help='activation function to use')
        parser.add_argument('--dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability for attention weights')
        parser.add_argument(
            '--activation-dropout',
            '--relu-dropout',
            type=float,
            metavar='D',
            help='dropout probability after activation in FFN.')
        parser.add_argument('--encoder-embed-path',
                            type=str,
                            metavar='STR',
                            help='path to pre-trained encoder embedding')
        parser.add_argument('--encoder-embed-dim',
                            type=int,
                            metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--encoder-ffn-embed-dim',
                            type=int,
                            metavar='N',
                            help='encoder embedding dimension for FFN')
        parser.add_argument('--encoder-layers',
                            type=int,
                            metavar='N',
                            help='num encoder layers')
        parser.add_argument('--encoder-attention-heads',
                            type=int,
                            metavar='N',
                            help='num encoder attention heads')
        parser.add_argument('--encoder-normalize-before',
                            action='store_true',
                            help='apply layernorm before each encoder block')
        parser.add_argument('--decoder-final-norm',
                            default=False,
                            action='store_true',
                            help='add an extra layernorm after the last decoder block')
        parser.add_argument(
            '--encoder-learned-pos',
            action='store_true',
            help='use learned positional embeddings in the encoder')
        parser.add_argument('--decoder-embed-path',
                            type=str,
                            metavar='STR',
                            help='path to pre-trained decoder embedding')
        parser.add_argument('--decoder-embed-dim',
                            type=int,
                            metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-ffn-embed-dim',
                            type=int,
                            metavar='N',
                            help='decoder embedding dimension for FFN')
        parser.add_argument('--decoder-layers',
                            type=int,
                            metavar='N',
                            help='num decoder layers')
        parser.add_argument('--decoder-attention-heads',
                            type=int,
                            metavar='N',
                            help='num decoder attention heads')
        parser.add_argument(
            '--decoder-learned-pos',
            action='store_true',
            help='use learned positional embeddings in the decoder')
        parser.add_argument('--decoder-normalize-before',
                            action='store_true',
                            help='apply layernorm before each decoder block')
        parser.add_argument('--share-decoder-input-output-embed',
                            action='store_true',
                            help='share decoder input and output embeddings')
        parser.add_argument('--share-all-embeddings',
                            action='store_true',
                            help='share encoder, decoder and output embeddings'
                            ' (requires shared dictionary and embed dim)')
        parser.add_argument(
            '--no-token-positional-embeddings',
            default=False,
            action='store_true',
            help=
            'if set, disables positional embeddings (outside self attention)')
        parser.add_argument(
            '--adaptive-softmax-cutoff',
            metavar='EXPR',
            help='comma separated list of adaptive softmax cutoff points. '
            'Must be used with adaptive_loss criterion')
        parser.add_argument(
            '--adaptive-softmax-dropout',
            type=float,
            metavar='D',
            help='sets adaptive softmax dropout for the tail projections')

        parser.add_argument('--use_att',
                            type=str,
                            nargs='+',
                            default=[
                                'es',
                                'ds',
                                'dc',
                            ],
                            help='')
        parser.add_argument('--combine',
                            type=int,
                            default=0,
                            help='0: as usual, 1: combine residual')
        parser.add_argument('--kernel_size',
                            type=int,
                            default=0,
                            help='static kernel size; 0: do not use a static kernel')
        parser.add_argument(
            '--attn_dynamic_type',
            type=int,
            default=0,
            help=
            '0: not used, 1: use static kernel (k>0) or depth kernel (k==0), 2: use dynamic kernel'
        )
        parser.add_argument('--attn_cat_relu', type=int, default=0)
        parser.add_argument(
            '--attn_wide_kernels',
            type=lambda x: options.eval_str_list(x, int),
            help='list of kernel sizes (default: "[3,15]") for wide and gate')
        parser.add_argument('--weight-dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability for conv weights')
        parser.add_argument('--dynamic_gate', type=int, default=1, help='0,1')
        parser.add_argument(
            '--dynamic_depth_kernels',
            type=lambda x: options.eval_str_list(x, int),
            help=
            'list of kernel sizes (default: "[3,3,3,7,7,7,7,7,7,15,15,15]") for ffn or attn'
        )
        parser.add_argument('--dynamic_padding',
                            type=int,
                            default=0,
                            help='padding before dynamic conv')
        parser.add_argument('--attn_dynamic_cat', type=int, default=1)
        parser.add_argument('--bm',
                            type=int,
                            default=0,
                            help='whether to use transformer_bm')
        parser.add_argument('--bm_in_a',
                            type=float,
                            default=3,
                            help='sqrt(6/(1+a)),-1 for xavier')
        parser.add_argument('--bm_out_a',
                            type=float,
                            default=0,
                            help='sqrt(6/(1+a)), -1 for xavier')
        parser.add_argument('--bm_fc3', type=float, default=1, help='')
        parser.add_argument('--bm_fc4', type=float, default=1, help='')
        parser.add_argument('--input_dropout', type=float, default=0, help='')
        parser.add_argument('--init_method',
                            type=str,
                            default='km',
                            help='xavier,km,xi,fixup')
        parser.add_argument('--lnv',
                            type=str,
                            default='origin',
                            help='layernorm,adanorm')
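
A hedged sketch of what the lambda type converters above rely on: fairseq's options.eval_str_list evaluates the quoted list syntax and casts each element, so a command line like --attn_wide_kernels "[3,15]" ends up as a list of ints.

from fairseq import options

print(options.eval_str_list("[3,15]", int))
# [3, 15]
print(options.eval_str_list("[3,3,3,7,7,7,7,7,7,15,15,15]", int))
# [3, 3, 3, 7, 7, 7, 7, 7, 7, 15, 15, 15]
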
Exemple #13
0
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument("--activation-fn",
                         choices=utils.get_available_activation_fns(),
                         help="activation function to use")
     parser.add_argument("--dropout",
                         type=float,
                         metavar="D",
                         help="dropout probability")
     parser.add_argument("--encoder-conv-channels",
                         type=str,
                         metavar="EXPR",
                         help="list of encoder convolution's out channels")
     parser.add_argument("--encoder-conv-kernel-sizes",
                         type=str,
                         metavar="EXPR",
                         help="list of encoder convolution's kernel sizes")
     parser.add_argument("--encoder-conv-strides",
                         type=str,
                         metavar="EXPR",
                         help="list of encoder convolution's strides")
     parser.add_argument("--attention-dropout",
                         type=float,
                         metavar="D",
                         help="dropout probability for attention weights")
     parser.add_argument(
         "--activation-dropout",
         "--relu-dropout",
         type=float,
         metavar="D",
         help="dropout probability after activation in FFN.")
     parser.add_argument("--encoder-ffn-embed-dim",
                         type=int,
                         metavar="N",
                         help="encoder embedding dimension for FFN")
     parser.add_argument("--encoder-layers",
                         type=int,
                         metavar="N",
                         help="num encoder layers")
     parser.add_argument("--encoder-attention-heads",
                         type=int,
                         metavar="N",
                         help="num encoder attention heads")
     parser.add_argument("--encoder-normalize-before",
                         action="store_true",
                         help="apply layernorm before each encoder block")
     parser.add_argument(
         "--encoder-transformer-context",
         type=str,
         metavar="EXPR",
         help="left/right context for time-restricted self-attention; "
         "can be None or a tuple of two non-negative integers/None")
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument("--encoder-layerdrop",
                         type=float,
                         metavar="D",
                         default=0,
                         help="LayerDrop probability for encoder")
     parser.add_argument(
         "--encoder-layers-to-keep",
         default=None,
         help="which layers to *keep* when pruning as a comma-separated list"
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument(
         "--quant-noise-pq",
         type=float,
         metavar="D",
         default=0,
         help="iterative PQ quantization noise at training time")
     parser.add_argument(
         "--quant-noise-pq-block-size",
         type=int,
         metavar="D",
         default=8,
         help="block size of quantization noise at training time")
     parser.add_argument(
         "--quant-noise-scalar",
         type=float,
         metavar="D",
         default=0,
         help=
         "scalar quantization noise and scalar quantization at training time"
     )
Exemple #14
0
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--activation-fn',
                            choices=utils.get_available_activation_fns(),
                            help='activation function to use')
        parser.add_argument('--dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability for attention weights')
        parser.add_argument(
            '--activation-dropout',
            type=float,
            metavar='D',
            help='dropout probability after activation in FFN.')

        parser.add_argument('--encoder-embed-dim',
                            type=int,
                            metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--encoder-ffn-embed-dim',
                            type=int,
                            metavar='N',
                            help='encoder embedding dimension for FFN')
        parser.add_argument('--encoder-layers',
                            type=int,
                            metavar='N',
                            help='num encoder layers')
        parser.add_argument('--encoder-attention-heads',
                            type=int,
                            metavar='N',
                            help='num encoder attention heads')

        parser.add_argument('--decoder-embed-dim',
                            type=int,
                            metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-ffn-embed-dim',
                            type=int,
                            metavar='N',
                            help='decoder embedding dimension for FFN')
        parser.add_argument('--decoder-layers',
                            type=int,
                            metavar='N',
                            help='num decoder layers')
        parser.add_argument('--decoder-attention-heads',
                            type=int,
                            metavar='N',
                            help='num decoder attention heads')

        parser.add_argument('--share-all-embeddings',
                            action='store_true',
                            help='share encoder, decoder and output embeddings'
                            ' (requires shared dictionary and embed dim)')
        parser.add_argument('--load-from-pretrained-model',
                            type=str,
                            default=None,
                            help='Load from pretrained model')
Exemple #15
0
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout',
                         default=0.1,
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         default=0.,
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument(
         '--activation-dropout',
         '--relu-dropout',
         type=float,
         metavar='D',
         help='dropout probability after activation in FFN.')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-output-dim',
                         type=int,
                         metavar='N',
                         help='decoder output dimension')
     parser.add_argument('--decoder-input-dim',
                         type=int,
                         metavar='N',
                         help='decoder input dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num decoder attention heads')
     parser.add_argument('--decoder-normalize-before',
                         default=False,
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument(
         '--no-decoder-final-norm',
         default=False,
         action='store_true',
         help='don\'t add an extra layernorm after the last decoder block')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
         'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
     parser.add_argument('--adaptive-softmax-factor',
                         type=float,
                         metavar='N',
                         help='adaptive input factor')
     parser.add_argument(
         '--no-token-positional-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, disables positional embeddings (outside self attention)')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument(
         '--character-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, uses character embedding convolutions to produce token embeddings'
     )
     parser.add_argument(
         '--character-filters',
         type=str,
         metavar='LIST',
         default=
         '[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
         help='size of character embeddings')
     parser.add_argument('--character-embedding-dim',
                         default=4,
                         type=int,
                         metavar='N',
                         help='size of character embeddings')
     parser.add_argument(
         '--char-embedder-highway-layers',
         default=2,
         type=int,
         metavar='N',
         help='number of highway layers for character token embedder')
     parser.add_argument('--adaptive-input',
                         action='store_true',
                         help='if set, uses adaptive input')
     parser.add_argument('--adaptive-input-factor',
                         type=float,
                         metavar='N',
                         help='adaptive input factor')
     parser.add_argument(
         '--adaptive-input-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive input cutoff points.')
     parser.add_argument(
         '--tie-adaptive-weights',
         action='store_true',
         help=
         'if set, ties the weights of adaptive softmax and adaptive input')
     parser.add_argument(
         '--tie-adaptive-proj',
         action='store_true',
         help=
         'if set, ties the projection weights of adaptive softmax and adaptive input'
     )
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
Exemple #16
0
class TransformerLanguageModelConfig(FairseqDataclass):
    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
        default="relu", metadata={"help": "activation function to use"})
    dropout: float = field(default=0.1,
                           metadata={"help": "dropout probability"})
    attention_dropout: float = field(
        default=0.0,
        metadata={"help": "dropout probability for attention weights"})
    activation_dropout: float = field(
        default=0.0,
        metadata={"help": "dropout probability after activation in FFN."})
    relu_dropout: float = field(
        default=0.0,
        metadata={"help": "dropout probability after activation in FFN."})
    decoder_embed_dim: int = field(
        default=512, metadata={"help": "decoder embedding dimension"})
    decoder_output_dim: int = field(
        default=512, metadata={"help": "decoder output dimension"})
    decoder_input_dim: int = field(
        default=512, metadata={"help": "decoder input dimension"})
    decoder_ffn_embed_dim: int = field(
        default=2048, metadata={"help": "decoder embedding dimension for FFN"})
    decoder_layers: int = field(default=6,
                                metadata={"help": "num decoder layers"})
    decoder_attention_heads: int = field(
        default=8, metadata={"help": "num decoder attention heads"})
    decoder_normalize_before: bool = field(
        default=False,
        metadata={"help": "apply layernorm before each decoder block"})
    no_decoder_final_norm: bool = field(
        default=False,
        metadata={
            "help": "don't add an extra layernorm after the last decoder block"
        },
    )
    adaptive_softmax_cutoff: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion"
        },
    )
    adaptive_softmax_dropout: float = field(
        default=0,
        metadata={
            "help": "sets adaptive softmax dropout for the tail projections"
        },
    )
    adaptive_softmax_factor: float = field(
        default=4, metadata={"help": "adaptive input factor"})
    no_token_positional_embeddings: bool = field(
        default=False,
        metadata={
            "help":
            "if set, disables positional embeddings (outside self attention)"
        },
    )
    share_decoder_input_output_embed: bool = field(
        default=False,
        metadata={"help": "share decoder input and output embeddings"})
    character_embeddings: bool = field(
        default=False,
        metadata={
            "help":
            "if set, uses character embedding convolutions to produce token embeddings"
        },
    )
    character_filters: str = field(
        default=
        "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
        metadata={"help": "size of character embeddings"},
    )
    character_embedding_dim: int = field(
        default=4, metadata={"help": "size of character embeddings"})
    char_embedder_highway_layers: int = field(
        default=2,
        metadata={
            "help": "number of highway layers for character token embeddder"
        },
    )
    adaptive_input: bool = field(
        default=False, metadata={"help": "if set, uses adaptive input"})
    adaptive_input_factor: float = field(
        default=4, metadata={"help": "adaptive input factor"})
    adaptive_input_cutoff: Optional[str] = field(
        default=None,
        metadata={
            "help": "comma separated list of adaptive input cutoff points."
        },
    )
    tie_adaptive_weights: bool = field(
        default=False,
        metadata={
            "help":
            "if set, ties the weights of adaptive softmax and adaptive input"
        },
    )
    tie_adaptive_proj: bool = field(
        default=False,
        metadata={
            "help":
            "if set, ties the projection weights of adaptive softmax and adaptive input"
        },
    )
    decoder_learned_pos: bool = field(
        default=False,
        metadata={"help": "use learned positional embeddings in the decoder"},
    )
    layernorm_embedding: bool = field(
        default=False, metadata={"help": "add layernorm to embedding"})
    no_scale_embedding: bool = field(
        default=False, metadata={"help": "if True, dont scale embeddings"})
    checkpoint_activations: bool = field(
        default=False,
        metadata={"help": "checkpoint activations at each layer"})
    offload_activations: bool = field(
        default=False,
        metadata={
            "help": "move checkpointed activations to CPU after they are used."
        },
    )
    # config for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
    decoder_layerdrop: float = field(
        default=0.0, metadata={"help": "LayerDrop probability for decoder"})
    decoder_layers_to_keep: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "which layers to *keep* when pruning as a comma-separated list"
        },
    )
    # config for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
    quant_noise_pq: float = field(
        default=0.0,
        metadata={"help": "iterative PQ quantization noise at training time"},
    )
    quant_noise_pq_block_size: int = field(
        default=8,
        metadata={"help": "block size of quantization noise at training time"},
    )
    quant_noise_scalar: float = field(
        default=0.0,
        metadata={
            "help":
            "scalar quantization noise and scalar quantization at training time"
        },
    )
    # config for Fully Sharded Data Parallel (FSDP) training
    min_params_to_wrap: int = field(
        default=DEFAULT_MIN_PARAMS_TO_WRAP,
        metadata={
            "help":
            ("minimum number of params for a layer to be wrapped with FSDP() when "
             "training with --ddp-backend=fully_sharded. Smaller values will "
             "improve memory efficiency, but may make torch.distributed "
             "communication less efficient due to smaller input sizes. This option "
             "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
             "--offload-activations are passed.")
        },
    )
    # config for "BASE Layers: Simplifying Training of Large, Sparse Models"
    base_layers: Optional[int] = field(
        default=0, metadata={"help": "number of BASE layers in total"})
    base_sublayers: Optional[int] = field(
        default=1, metadata={"help": "number of sublayers in each BASE layer"})
    base_shuffle: Optional[int] = field(
        default=1,
        metadata={
            "help":
            "shuffle tokens between workers before computing assignment"
        },
    )
    # NormFormer
    scale_fc: Optional[bool] = field(
        default=False,
        metadata={"help": "Insert LayerNorm between fully connected layers"},
    )
    scale_attn: Optional[bool] = field(
        default=False, metadata={"help": "Insert LayerNorm after attention"})
    scale_heads: Optional[bool] = field(
        default=False,
        metadata={"help": "Learn a scale coefficient for each attention head"},
    )
    scale_resids: Optional[bool] = field(
        default=False,
        metadata={
            "help": "Learn a scale coefficient for each residual connection"
        },
    )

    # xFormers arguments
    decoder_xformers_att_config: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "config for xFormers library attention, defined in xformers.components.attention.AttentionConfig",
        },
    )

    # options from other parts of the config
    add_bos_token: bool = II("task.add_bos_token")
    tokens_per_sample: int = II("task.tokens_per_sample")
    max_target_positions: Optional[int] = II("task.max_target_positions")
    tpu: bool = II("common.tpu")
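
A minimal sketch of how the II() interpolations above behave, assuming omegaconf (which fairseq's config system is built on): II("task.tokens_per_sample") is just the interpolation string "${task.tokens_per_sample}", resolved against the enclosing config tree when the value is accessed.

from omegaconf import II, OmegaConf

print(II("task.tokens_per_sample"))   # ${task.tokens_per_sample}

cfg = OmegaConf.create({
    "task": {"tokens_per_sample": 1024, "add_bos_token": False},
    "model": {
        "tokens_per_sample": II("task.tokens_per_sample"),
        "add_bos_token": II("task.add_bos_token"),
    },
})
print(cfg.model.tokens_per_sample)    # 1024 -- pulled from the task section
print(cfg.model.add_bos_token)        # False
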
Exemple #17
0
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # encoder 1: S2TTransformerEncoder for speech
        parser.add_argument(
            "--conv-kernel-sizes",
            type=str,
            metavar="N",
            help="kernel sizes of Conv1d subsampling layers",
        )
        parser.add_argument(
            "--conv-channels",
            type=int,
            metavar="N",
            help="# of channels in Conv1d subsampling layers",
        )
        parser.add_argument(
            "--enc-output-dim",
            type=int,
            metavar="N",
            help="""
                encoder output dimension, can be None. If specified, the
                transformer output is projected to this dimension""",
        )
        # standard Transformer
        parser.add_argument(
            "--activation-fn",
            type=str,
            default="relu",
            choices=utils.get_available_activation_fns(),
            help="activation function to use",
        )
        parser.add_argument("--dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--activation-dropout",
            "--relu-dropout",
            type=float,
            metavar="D",
            help="dropout probability after activation in FFN.",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-text-embed-dim",
            type=int,
            metavar="N",
            help="encoder text embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="num decoder layers")
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads",
        )
        parser.add_argument(
            "--layernorm-embedding",
            action="store_true",
            help="add layernorm to embedding",
        )
        parser.add_argument(
            "--no-scale-embedding",
            action="store_true",
            help="if True, dont scale embeddings",
        )
        # non-standard transformer parameters
        parser.add_argument(
            "--speech-encoder-layers",
            type=int,
            metavar="N",
            help="num speech encoder layers",
        )
        parser.add_argument(
            "--text-encoder-layers",
            type=int,
            metavar="N",
            help="num text encoder layers",
        )
        parser.add_argument(
            "--encoder-shared-layers",
            type=int,
            metavar="N",
            help="num shared encoder layers",
        )
        parser.add_argument(
            "--encoder-shared-layer-level",
            type=int,
            metavar="N",
            default=0,
            choices=[0, 1, 2],
            help=
            "share layer level 0: all share 1: all share with separate model 2: share weight but not bias and layernorm",
        )

        parser.add_argument(
            "--decoder-shared-layer-level",
            default=0,
            choices=[0, 1, 2],
            type=int,
            metavar="N",
            help=
            "0: share everything; 1: share everything with different model 2: no share layer_norm and bias",
        )
        ###
        parser.add_argument(
            "--text-input-cost-ratio",
            type=float,
            default=1.0,
            metavar="V",
            help="text input cost ratio relative to speech input cost",
        )
        parser.add_argument(
            "--init-scale",
            type=float,
            default=1.0,
            metavar="V",
            help="scale the initial weight by given factor",
        )
        parser.add_argument(
            "--enc-grad-mult",
            type=float,
            metavar="V",
            default=1.0,
            help="multiply enc1 and enc2 gradient by V",
        )
        parser.add_argument(
            "--enc2-along-grad-mult",
            type=float,
            metavar="V",
            default=1.0,
            help="multiply enc2 gradient by V if only enc2 is used",
        )
        parser.add_argument(
            "--load-pretrain-encoder",
            type=str,
            default="",
            metavar="EXPR",
            help=""" path to the pretrained encoder """,
        )
        parser.add_argument(
            "--load-pretrain-speech-encoder",
            type=str,
            default="",
            metavar="EXPR",
            help=""" path to the pretrained speech encoder """,
        )
        parser.add_argument(
            "--load-pretrain-text-encoder",
            type=str,
            default="",
            metavar="EXPR",
            help=""" path to the pretrained text encoder """,
        )
        parser.add_argument(
            "--load-pretrain-text-encoder-last",
            type=str,
            default="",
            metavar="EXPR",
            help=""" path to the pretrained text encoder """,
        )
        parser.add_argument(
            "--load-pretrain-decoder",
            type=str,
            metavar="EXPR",
            default="",
            help=""" path to the pretrained encoder """,
        )
        parser.add_argument(
            "--add-speech-eos",
            action="store_true",
            help="add eos token at the end of input feature",
        )
        parser.add_argument(
            "--speech-encoder-adapter-type",
            type=str,
            metavar="EXPR",
            default="None",
            choices=["None", "Linear", "MLP"],
            help="add speech encoder adapter",
        )
Exemple #18
0
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # Arguments related to dropout
        parser.add_argument("--dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for"
            " attention weights",
        )
        parser.add_argument(
            "--act-dropout",
            type=float,
            metavar="D",
            help="dropout probability after"
            " activation in FFN",
        )

        # Arguments related to hidden states and self-attention
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument("--encoder-layers",
                            type=int,
                            metavar="N",
                            help="num encoder layers")
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument("--bias-kv",
                            action="store_true",
                            help="if set, adding a learnable bias kv")
        parser.add_argument("--zero-attn",
                            action="store_true",
                            help="if set, pads attn with zero")

        # Arguments related to input and output embeddings
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--share-encoder-input-output-embed",
            action="store_true",
            help="share encoder input"
            " and output embeddings",
        )
        parser.add_argument(
            "--encoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the encoder",
        )
        parser.add_argument(
            "--no-token-positional-embeddings",
            action="store_true",
            help="if set, disables positional embeddings"
            " (outside self attention)",
        )
        parser.add_argument("--num-segment",
                            type=int,
                            metavar="N",
                            help="num segment in the input")

        # Arguments related to sentence level prediction
        parser.add_argument(
            "--sentence-class-num",
            type=int,
            metavar="N",
            help="number of classes for sentence task",
        )
        parser.add_argument(
            "--sent-loss",
            action="store_true",
            help="if set,"
            " calculate sentence level predictions",
        )

        # Arguments related to parameter initialization
        parser.add_argument(
            "--apply-bert-init",
            action="store_true",
            help="use custom param initialization for BERT",
        )

        # misc params
        parser.add_argument(
            "--activation-fn",
            choices=utils.get_available_activation_fns(),
            help="activation function to use",
        )
        parser.add_argument(
            "--pooler-activation-fn",
            choices=utils.get_available_activation_fns(),
            help="Which activation function to use for pooler layer.",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            action="store_true",
            help="apply layernorm before each encoder block",
        )
Exemple #19
0
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off  # TODO
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument(
         '--activation-dropout',
         '--relu-dropout',
         type=float,
         metavar='D',
         help='dropout probability after activation in FFN.')
     parser.add_argument('--encoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-layers',
                         type=int,
                         metavar='N',
                         help='num encoder layers')
     parser.add_argument('--encoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num encoder attention heads')
     parser.add_argument('--encoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument(
         '--encoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num decoder attention heads')
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument(
         '--decoder-output-dim',
         type=int,
         metavar='N',
         help='decoder output dimension (extra linear layer '
         'if different from decoder embed dim')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings',
                         action='store_true',
                         help='share encoder, decoder and output embeddings'
                         ' (requires shared dictionary and embed dim)')
     parser.add_argument(
         '--no-token-positional-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, disables positional embeddings (outside self attention)')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
         'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
     parser.add_argument('--layernorm-embedding',
                         action='store_true',
                         help='add layernorm to embedding')
     parser.add_argument('--no-scale-embedding',
                         action='store_true',
                         help='if True, dont scale embeddings')
     parser.add_argument(
         '--checkpoint-activations',
         action='store_true',
         help='checkpoint activations at each layer, which saves GPU '
         'memory usage at the cost of some additional compute')
     # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
     parser.add_argument('--no-cross-attention',
                         default=False,
                         action='store_true',
                         help='do not perform cross-attention')
     parser.add_argument('--cross-self-attention',
                         default=False,
                         action='store_true',
                         help='perform cross+self-attention')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--encoder-layerdrop',
                         type=float,
                         metavar='D',
                         default=0,
                         help='LayerDrop probability for encoder')
     parser.add_argument('--decoder-layerdrop',
                         type=float,
                         metavar='D',
                         default=0,
                         help='LayerDrop probability for decoder')
     parser.add_argument(
         '--encoder-layers-to-keep',
         default=None,
         help='which layers to *keep* when pruning as a comma-separated list'
     )
     parser.add_argument(
         '--decoder-layers-to-keep',
         default=None,
         help='which layers to *keep* when pruning as a comma-separated list'
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument(
         '--quant-noise-pq',
         type=float,
         metavar='D',
         default=0,
         help='iterative PQ quantization noise at training time')
     parser.add_argument(
         '--quant-noise-pq-block-size',
         type=int,
         metavar='D',
         default=8,
         help='block size of quantization noise at training time')
     parser.add_argument(
         '--quant-noise-scalar',
         type=float,
         metavar='D',
         default=0,
         help=
         'scalar quantization noise and scalar quantization at training time'
     )
     # for prime
     parser.add_argument('--use_att',
                         type=str,
                         nargs='+',
                         default=[
                             'es',
                             'ds',
                             'dc',
                         ],
                         help='')
     parser.add_argument('--kernel_size',
                         type=int,
                         default=0,
                          help='static kernel size; 0: do not use a static kernel')
     parser.add_argument(
         '--attn_dynamic_type',
         type=int,
         default=0,
         help=
         '0: not used, 1: use static kernel (k>0) or depth kernel (k==0), 2: use dynamic kernel'
     )
     parser.add_argument('--attn_cat_relu', type=int, default=0)
     parser.add_argument(
         '--attn_wide_kernels',
         type=lambda x: options.eval_str_list(x, int),
         help='list of kernel sizes (default: "[3,15]") for wide and gate')
     parser.add_argument('--weight-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for conv weights')
     parser.add_argument('--dynamic_gate', type=int, default=1, help='0,1')
     parser.add_argument(
         '--dynamic_depth_kernels',
         type=lambda x: options.eval_str_list(x, int),
         help=
         'list of kernel sizes (default: "[3,3,3,7,7,7,7,7,7,15,15,15]") for ffn or attn'
     )
     parser.add_argument('--dynamic_padding',
                         type=int,
                         default=0,
                         help='padding before dynamic conv')
     parser.add_argument('--attn_dynamic_cat', type=int, default=1)
     parser.add_argument('--input_dropout', type=float, default=0, help='')
     parser.add_argument('--init_method',
                         type=str,
                         default='km',
                         help='xavier,km,xi,fixup')
     parser.add_argument('--lnv',
                         type=str,
                         default='origin',
                         help='layernorm,adanorm')
Exemple #20
0
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument('--encoder-layers',
                         type=int,
                         metavar='L',
                         help='num encoder layers')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='H',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim',
                         type=int,
                         metavar='F',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-attention-heads',
                         type=int,
                         metavar='A',
                         help='num encoder attention heads')
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--pooler-activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use for pooler layer')
     parser.add_argument('--encoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--activation-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability after activation in FFN')
     parser.add_argument(
         '--pooler-dropout',
         type=float,
         metavar='D',
         help='dropout probability in the masked_lm pooler layers')
     parser.add_argument('--max-positions',
                         type=int,
                         help='number of positional embeddings to learn')
     parser.add_argument(
         '--load-checkpoint-heads',
         action='store_true',
         help='(re-)register and load heads when loading checkpoints')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--encoder-layerdrop',
                         type=float,
                         metavar='D',
                         default=0,
                         help='LayerDrop probability for encoder')
     parser.add_argument(
         '--encoder-layers-to-keep',
         default=None,
         help='which layers to *keep* when pruning as a comma-separated list'
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument(
         '--quant-noise-pq',
         type=float,
         metavar='D',
         default=0,
         help='iterative PQ quantization noise at training time')
     parser.add_argument(
         '--quant-noise-pq-block-size',
         type=int,
         metavar='D',
         default=8,
         help='block size of quantization noise at training time')
     parser.add_argument(
         '--quant-noise-scalar',
         type=float,
         metavar='D',
         default=0,
         help=
         'scalar quantization noise and scalar quantization at training time'
     )
     parser.add_argument(
         '--untie-weights-roberta',
         action='store_true',
         help='Untie weights between embeddings and classifiers in RoBERTa')
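For orientation, the static add_args hooks listed on this page all follow the same contract: fairseq hands the model class an argparse parser and the method registers its hyperparameters on it. A minimal, self-contained sketch of that flow (the tiny add_args below is a stand-in for illustration, not any real model's method):

import argparse

def add_args(parser):
    # stand-in for a model's static add_args, as in the snippets above
    parser.add_argument('--encoder-layers', type=int, metavar='L', default=12,
                        help='num encoder layers')
    parser.add_argument('--dropout', type=float, metavar='D', default=0.1,
                        help='dropout probability')

parser = argparse.ArgumentParser()
add_args(parser)
args = parser.parse_args(['--encoder-layers', '6', '--dropout', '0.2'])
print(args.encoder_layers, args.dropout)  # argparse maps '--encoder-layers' to args.encoder_layers -> 6 0.2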
    def add_args(parser):
        """Add model-specific arguments to the parser."""

        parser.add_argument(
            "--extractor-mode",
            choices=["default", "layer_norm"],
            help=
            "mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with --normalize)",
        )

        parser.add_argument(
            "--encoder-layers",
            type=int,
            metavar="L",
            help="num encoder layers in the transformer",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="H",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="F",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="A",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--activation-fn",
            choices=utils.get_available_activation_fns(),
            help="activation function to use",
        )

        parser.add_argument(
            "--dropout",
            type=float,
            metavar="D",
            help="dropout probability for the transformer",
        )

        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )

        parser.add_argument(
            "--activation-dropout",
            type=float,
            metavar="D",
            help="dropout probability after activation in FFN",
        )

        parser.add_argument(
            "--final-dim",
            type=int,
            metavar="D",
            help=
            "project final representations and targets to this many dimensions",
        )

        parser.add_argument(
            "--layer-norm-first",
            action="store_true",
            help="apply layernorm first in the transformer",
        )

        parser.add_argument(
            "--encoder-layerdrop",
            type=float,
            help="probability of dropping a tarnsformer layer",
        )

        parser.add_argument(
            "--conv-feature-layers",
            type=str,
            metavar="EXPR",
            help=
            "convolutional feature extraction layers [(dim, kernel_size, stride), ...]",
        )

        parser.add_argument("--logit-temp",
                            type=float,
                            help="temperature to divide logits by")

        parser.add_argument("--quantize-targets",
                            action="store_true",
                            help="use quantized targets")

        parser.add_argument("--quantize-input",
                            action="store_true",
                            help="use quantized inputs")

        parser.add_argument(
            "--same-quantizer",
            action="store_true",
            help="use same quantizer for inputs and targets",
        )

        parser.add_argument(
            "--feature-grad-mult",
            type=float,
            help="multiply feature extractor var grads by this",
        )

        parser.add_argument(
            "--latent-vars",
            type=int,
            metavar="N",
            help="number of latent variables V in each group of the codebook",
        )

        parser.add_argument(
            "--latent-groups",
            type=int,
            metavar="N",
            help="number of groups G of latent variables in the codebook",
        )

        parser.add_argument(
            "--latent-dim",
            type=int,
            metavar="N",
            help=
            "if set, uses this dimensionality for latent variables. otherwise uses final_dim / latent_groups",
        )

        parser.add_argument("--mask-length", type=int, help="mask length")

        parser.add_argument("--mask-prob",
                            type=float,
                            help="probability of replacing a token with mask")

        parser.add_argument(
            "--mask-selection",
            type=str,
            choices=["static", "uniform", "normal", "poisson"],
            help="how to choose masks",
        )

        parser.add_argument(
            "--mask-other",
            type=float,
            help=
            "secondary mask argument (used for more complex distributions), see help in compute_mask_indices",
        )

        parser.add_argument(
            "--no-mask-overlap",
            action="store_true",
            help="whether to allow masks to overlap",
        )

        parser.add_argument(
            "--mask-min-space",
            type=int,
            help="min space between spans (if no overlap is enabled)",
        )

        parser.add_argument(
            "--mask-channel-length",
            type=int,
            help="length of the mask for features (channels)",
        )

        parser.add_argument(
            "--mask-channel-prob",
            type=float,
            help="probability of replacing a feature with 0",
        )

        parser.add_argument(
            "--mask-channel-selection",
            type=str,
            choices=["static", "uniform", "normal", "poisson"],
            help="how to choose masks",
        )

        parser.add_argument(
            "--mask-channel-other",
            type=float,
            help=
            "secondary mask argument (used for more complex distributions), see help in compute_mask_indices",
        )

        parser.add_argument(
            "--no-mask-channel-overlap",
            action="store_true",
            help="whether to allow masks to overlap",
        )

        parser.add_argument(
            "--mask-channel-min-space",
            type=int,
            help="min space between spans (if no overlap is enabled)",
        )

        parser.add_argument(
            "--dropout-input",
            type=float,
            metavar="D",
            help="dropout to apply to the input (after feat extr)",
        )

        parser.add_argument(
            "--dropout-features",
            type=float,
            metavar="D",
            help="dropout to apply to the features (after feat extr)",
        )

        parser.add_argument("--num-negatives",
                            type=int,
                            metavar="N",
                            help="number of negative examples")

        parser.add_argument(
            "--negatives-from-everywhere",
            action="store_true",
            help="sample negatives from everywhere, not just masked states",
        )

        parser.add_argument(
            "--cross-sample-negatives",
            type=int,
            metavar="N",
            help="num of cross sampled negatives",
        )

        parser.add_argument(
            "--codebook-negatives",
            type=int,
            metavar="N",
            help="num of codebook sampled negatives",
        )

        parser.add_argument(
            "--conv-pos",
            type=int,
            metavar="N",
            help="number of filters for convolutional positional embeddings",
        )

        parser.add_argument(
            "--conv-pos-groups",
            type=int,
            metavar="N",
            help="number of groups for convolutional positional embedding",
        )

        parser.add_argument(
            "--latent-temp",
            type=str,
            metavar="D",
            help=
            "temperature for latent variable sampling. can be tuple of 3 values (start, end, decay)",
        )

        parser.add_argument("--target-glu",
                            action="store_true",
                            help="adds projection + glu to targets")

        parser.add_argument("--conv-bias",
                            action="store_true",
                            help="include bias in conv encoder")
from fairseq import utils
from fairseq.dataclass import ChoiceEnum, FairseqDataclass
from fairseq.models import (
    BaseFairseqModel,
    register_model,
)

from fairseq.models.roberta.model import RobertaClassificationHead

from fairseq.modules import (
    LayerNorm,
    TransformerSentenceEncoder,
    TransformerSentenceEncoderLayer,
)

ACTIVATION_FN_CHOICES = ChoiceEnum(utils.get_available_activation_fns())
JOINT_CLASSIFICATION_CHOICES = ChoiceEnum(["none", "sent"])
SENTENCE_REP_CHOICES = ChoiceEnum(["head", "meanpool", "maxpool"])


def update_init_roberta_model_state(state):
    """
   update the state_dict of a Roberta model for initializing
   weights of the BertRanker
   """
    for k in list(state.keys()):
        if ".lm_head." in k or "version" in k:
            del state[k]
            continue
        # remove 'encoder/decoder.sentence_encoder.' from the key
        assert k.startswith("encoder.sentence_encoder.") or k.startswith(
Example #23
0
class TransformerLanguageModelConfig(FairseqDataclass):
    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
        default="relu", metadata={"help": "activation function to use"}
    )
    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
    attention_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability for attention weights"}
    )
    activation_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
    )
    relu_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
    )
    decoder_embed_dim: int = field(
        default=512, metadata={"help": "decoder embedding dimension"}
    )
    decoder_output_dim: int = field(
        default=512, metadata={"help": "decoder output dimension"}
    )
    decoder_input_dim: int = field(
        default=512, metadata={"help": "decoder input dimension"}
    )
    decoder_ffn_embed_dim: int = field(
        default=2048, metadata={"help": "decoder embedding dimension for FFN"}
    )
    decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"})
    decoder_attention_heads: int = field(
        default=8, metadata={"help": "num decoder attention heads"}
    )
    decoder_normalize_before: bool = field(
        default=False, metadata={"help": "apply layernorm before each decoder block"}
    )
    no_decoder_final_norm: bool = field(
        default=False,
        metadata={"help": "don't add an extra layernorm after the last decoder block"},
    )
    adaptive_softmax_cutoff: Optional[str] = field(
        default=None,
        metadata={
            "help": "comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion"
        },
    )
    adaptive_softmax_dropout: float = field(
        default=0,
        metadata={"help": "sets adaptive softmax dropout for the tail projections"},
    )
    adaptive_softmax_factor: float = field(
        default=4, metadata={"help": "adaptive input factor"}
    )
    no_token_positional_embeddings: bool = field(
        default=False,
        metadata={
            "help": "if set, disables positional embeddings (outside self attention)"
        },
    )
    share_decoder_input_output_embed: bool = field(
        default=False, metadata={"help": "share decoder input and output embeddings"}
    )
    character_embeddings: bool = field(
        default=False,
        metadata={
            "help": "if set, uses character embedding convolutions to produce token embeddings"
        },
    )
    character_filters: str = field(
        default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
        metadata={"help": "size of character embeddings"},
    )
    character_embedding_dim: int = field(
        default=4, metadata={"help": "size of character embeddings"}
    )
    char_embedder_highway_layers: int = field(
        default=2,
        metadata={"help": "number of highway layers for character token embeddder"},
    )
    adaptive_input: bool = field(
        default=False, metadata={"help": "if set, uses adaptive input"}
    )
    adaptive_input_factor: float = field(
        default=4, metadata={"help": "adaptive input factor"}
    )
    adaptive_input_cutoff: Optional[str] = field(
        default=None,
        metadata={"help": "comma separated list of adaptive input cutoff points."},
    )
    tie_adaptive_weights: bool = field(
        default=False,
        metadata={
            "help": "if set, ties the weights of adaptive softmax and adaptive input"
        },
    )
    tie_adaptive_proj: bool = field(
        default=False,
        metadata={
            "help": "if set, ties the projection weights of adaptive softmax and adaptive input"
        },
    )
    decoder_learned_pos: bool = field(
        default=False,
        metadata={"help": "use learned positional embeddings in the decoder"},
    )
    decoder_layerdrop: float = field(
        default=0.0, metadata={"help": "LayerDrop probability for decoder"}
    )
    decoder_layers_to_keep: Optional[str] = field(
        default=None,
        metadata={
            "help": "which layers to *keep* when pruning as a comma-separated list"
        },
    )
    layernorm_embedding: bool = field(
        default=False, metadata={"help": "add layernorm to embedding"}
    )
    no_scale_embedding: bool = field(
        default=False, metadata={"help": "if True, dont scale embeddings"}
    )
    checkpoint_activations: bool = field(
        default=False, metadata={"help": "checkpoint activations at each layer"}
    )
    quant_noise_pq: float = field(
        default=0.0,
        metadata={"help": "iterative PQ quantization noise at training time"},
    )
    quant_noise_pq_block_size: int = field(
        default=8,
        metadata={"help": "block size of quantization noise at training time"},
    )
    # TODO common var add to parent
    quant_noise_scalar: float = field(
        default=0.0,
        metadata={
            "help": "scalar quantization noise and scalar quantization at training time"
        },
    )
    add_bos_token: bool = II("task.add_bos_token")
    tokens_per_sample: int = II("task.tokens_per_sample")
    max_target_positions: Optional[int] = II("task.max_target_positions")
    tpu: bool = II("common.tpu")
Example #24
0
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument("--encoder-layers",
                         type=int,
                         metavar="L",
                         help="num encoder layers")
     parser.add_argument(
         "--encoder-embed-dim",
         type=int,
         metavar="H",
         help="encoder embedding dimension",
     )
     parser.add_argument(
         "--encoder-ffn-embed-dim",
         type=int,
         metavar="F",
         help="encoder embedding dimension for FFN",
     )
     parser.add_argument(
         "--encoder-attention-heads",
         type=int,
         metavar="A",
         help="num encoder attention heads",
     )
     parser.add_argument(
         "--activation-fn",
         choices=utils.get_available_activation_fns(),
         help="activation function to use",
     )
     parser.add_argument(
         "--pooler-activation-fn",
         choices=utils.get_available_activation_fns(),
         help="activation function to use for pooler layer",
     )
     parser.add_argument(
         "--encoder-normalize-before",
         action="store_true",
         help="apply layernorm before each encoder block",
     )
     parser.add_argument("--dropout",
                         type=float,
                         metavar="D",
                         help="dropout probability")
     parser.add_argument(
         "--attention-dropout",
         type=float,
         metavar="D",
         help="dropout probability for attention weights",
     )
     parser.add_argument(
         "--activation-dropout",
         type=float,
         metavar="D",
         help="dropout probability after activation in FFN",
     )
     parser.add_argument(
         "--pooler-dropout",
         type=float,
         metavar="D",
         help="dropout probability in the masked_lm pooler layers",
     )
     parser.add_argument("--max-positions",
                         type=int,
                         help="number of positional embeddings to learn")
     parser.add_argument(
         "--load-checkpoint-heads",
         action="store_true",
         help="(re-)register and load heads when loading checkpoints",
     )
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument(
         "--encoder-layerdrop",
         type=float,
         metavar="D",
         default=0,
         help="LayerDrop probability for encoder",
     )
     parser.add_argument(
         "--encoder-layers-to-keep",
         default=None,
         help=
         "which layers to *keep* when pruning as a comma-separated list",
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument(
         "--quant-noise-pq",
         type=float,
         metavar="D",
         default=0,
         help="iterative PQ quantization noise at training time",
     )
     parser.add_argument(
         "--quant-noise-pq-block-size",
         type=int,
         metavar="D",
         default=8,
         help="block size of quantization noise at training time",
     )
     parser.add_argument(
         "--quant-noise-scalar",
         type=float,
         metavar="D",
         default=0,
         help=
         "scalar quantization noise and scalar quantization at training time",
     )
     parser.add_argument(
         "--untie-weights-roberta",
         action="store_true",
         help="Untie weights between embeddings and classifiers in RoBERTa",
     )
     parser.add_argument(
         "--spectral-norm-classification-head",
         action="store_true",
         default=False,
         help="Apply spectral normalization on the classification head",
     )
Example #25
0
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument(
         "--encoder-layers", type=int, metavar="L", help="num encoder layers"
     )
     parser.add_argument(
         "--encoder-embed-dim",
         type=int,
         metavar="H",
         help="encoder embedding dimension",
     )
     parser.add_argument(
         "--encoder-ffn-embed-dim",
         type=int,
         metavar="F",
         help="encoder embedding dimension for FFN",
     )
     parser.add_argument(
         "--encoder-attention-heads",
         type=int,
         metavar="A",
         help="num encoder attention heads",
     )
     parser.add_argument(
         "--activation-fn",
         choices=utils.get_available_activation_fns(),
         help="activation function to use",
     )
     parser.add_argument(
         "--pooler-activation-fn",
         choices=utils.get_available_activation_fns(),
         help="activation function to use for pooler layer",
     )
     parser.add_argument(
         "--encoder-normalize-before",
         action="store_true",
         help="apply layernorm before each encoder block",
     )
     parser.add_argument(
         "--layernorm-embedding",
         action="store_true",
         help="add layernorm to embedding",
     )
     parser.add_argument(
         "--dropout", type=float, metavar="D", help="dropout probability"
     )
     parser.add_argument(
         "--attention-dropout",
         type=float,
         metavar="D",
         help="dropout probability for attention weights",
     )
     parser.add_argument(
         "--activation-dropout",
         type=float,
         metavar="D",
         help="dropout probability after activation in FFN",
     )
     parser.add_argument(
         "--pooler-dropout",
         type=float,
         metavar="D",
         help="dropout probability in the masked_lm pooler layers",
     )
     parser.add_argument(
         "--max-positions", type=int, help="number of positional embeddings to learn"
     )
     parser.add_argument(
         "--load-checkpoint-heads",
         action="store_true",
         help="(re-)register and load heads when loading checkpoints",
     )
     parser.add_argument(
         "--untie-weights-roberta",
         action="store_true",
         help="Untie weights between embeddings and classifiers in RoBERTa",
     )
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument(
         "--encoder-layerdrop",
         type=float,
         metavar="D",
         default=0,
         help="LayerDrop probability for encoder",
     )
     parser.add_argument(
         "--encoder-layers-to-keep",
         default=None,
         help="which layers to *keep* when pruning as a comma-separated list",
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument(
         "--quant-noise-pq",
         type=float,
         metavar="D",
         default=0,
         help="iterative PQ quantization noise at training time",
     )
     parser.add_argument(
         "--quant-noise-pq-block-size",
         type=int,
         metavar="D",
         default=8,
         help="block size of quantization noise at training time",
     )
     parser.add_argument(
         "--quant-noise-scalar",
         type=float,
         metavar="D",
         default=0,
         help="scalar quantization noise and scalar quantization at training time",
     )
     # args for "Better Fine-Tuning by Reducing Representational Collapse" (Aghajanyan et al. 2020)
     parser.add_argument(
         "--spectral-norm-classification-head",
         action="store_true",
         default=False,
         help="Apply spectral normalization on the classification head",
     )
     # args for Fully Sharded Data Parallel (FSDP) training
     parser.add_argument(
         "--min-params-to-wrap",
         type=int,
         metavar="D",
         default=DEFAULT_MIN_PARAMS_TO_WRAP,
         help=(
             "minimum number of params for a layer to be wrapped with FSDP() when "
             "training with --ddp-backend=fully_sharded. Smaller values will "
             "improve memory efficiency, but may make torch.distributed "
             "communication less efficient due to smaller input sizes. This option "
             "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
             "--offload-activations are passed."
         ),
     )
     # args for AdaPruning
     # In short, it adds regularization for the multihead attention module and feed forward neural nets
     # For more details, please refer to the paper https://openreview.net/forum?id=_CMSV7FTzGI
     parser.add_argument(
         "--mha-reg-scale-factor",
         type=float,
         metavar="D",
         default=0.0,
         help="scaling factor for regularization term in adptive pruning, recommendation is 0.000375",
     )
     parser.add_argument(
         "--ffn-reg-scale-factor",
         type=float,
         metavar="D",
         default=0.0,
         help="scaling factor for regularization term in adptive pruning, recommendation is 0.000375",
     )
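--encoder-layers-to-keep (and its decoder counterpart) takes a comma-separated list of layer indices to keep when pruning a LayerDrop-trained model. A minimal sketch of how such a value is typically interpreted (hypothetical helper, not fairseq's own pruning code):

def parse_layers_to_keep(spec, layers):
    # spec like "0,2,4": keep only the listed layer indices, preserving order
    if spec is None:
        return layers
    keep = sorted(int(i) for i in spec.split(","))
    return [layers[i] for i in keep]

layers = ["layer0", "layer1", "layer2", "layer3", "layer4", "layer5"]
print(parse_layers_to_keep("0,2,4", layers))  # -> ['layer0', 'layer2', 'layer4']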
Example #26
0
class Wav2Vec2Config(FairseqDataclass):
    extractor_mode: EXTRACTOR_MODE_CHOICES = field(
        default="default",
        metadata={
            "help":
            "mode for feature extractor. default has a single group norm with d "
            "groups in the first conv block, whereas layer_norm has layer norms in "
            "every block (meant to use with normalize=True)"
        },
    )
    encoder_layers: int = field(
        default=12, metadata={"help": "num encoder layers in the transformer"})
    encoder_embed_dim: int = field(
        default=768, metadata={"help": "encoder embedding dimension"})
    encoder_ffn_embed_dim: int = field(
        default=3072, metadata={"help": "encoder embedding dimension for FFN"})
    encoder_attention_heads: int = field(
        default=12, metadata={"help": "num encoder attention heads"})
    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
        default="gelu", metadata={"help": "activation function to use"})

    # dropouts
    dropout: float = field(
        default=0.1,
        metadata={"help": "dropout probability for the transformer"})
    attention_dropout: float = field(
        default=0.1,
        metadata={"help": "dropout probability for attention weights"})
    activation_dropout: float = field(
        default=0.0,
        metadata={"help": "dropout probability after activation in FFN"})
    encoder_layerdrop: float = field(
        default=0.0,
        metadata={"help": "probability of dropping a tarnsformer layer"})
    dropout_input: float = field(
        default=0.0,
        metadata={"help": "dropout to apply to the input (after feat extr)"},
    )
    dropout_features: float = field(
        default=0.0,
        metadata={
            "help": "dropout to apply to the features (after feat extr)"
        },
    )

    final_dim: int = field(
        default=0,
        metadata={
            "help":
            "project final representations and targets to this many dimensions."
            "set to encoder_embed_dim is <= 0"
        },
    )
    layer_norm_first: bool = field(
        default=False,
        metadata={"help": "apply layernorm first in the transformer"})
    conv_feature_layers: str = field(
        default=
        "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
        metadata={
            "help":
            "string describing convolutional feature extraction layers in form of a python list that contains "
            "[(dim, kernel_size, stride), ...]"
        },
    )
    conv_bias: bool = field(default=False,
                            metadata={"help": "include bias in conv encoder"})
    logit_temp: float = field(
        default=0.1, metadata={"help": "temperature to divide logits by"})
    quantize_targets: bool = field(default=False,
                                   metadata={"help": "use quantized targets"})
    quantize_input: bool = field(default=False,
                                 metadata={"help": "use quantized inputs"})
    same_quantizer: bool = field(
        default=False,
        metadata={"help": "use same quantizer for inputs and targets"})
    target_glu: bool = field(
        default=False, metadata={"help": "adds projection + glu to targets"})
    feature_grad_mult: float = field(
        default=1.0,
        metadata={"help": "multiply feature extractor var grads by this"})
    quantizer_depth: int = field(
        default=1,
        metadata={"help": "number of quantizer layers"},
    )
    quantizer_factor: int = field(
        default=3,
        metadata={
            "help":
            "dimensionality increase for inner quantizer layers (if depth > 1)"
        },
    )
    latent_vars: int = field(
        default=320,
        metadata={
            "help":
            "number of latent variables V in each group of the codebook"
        },
    )
    latent_groups: int = field(
        default=2,
        metadata={
            "help": "number of groups G of latent variables in the codebook"
        },
    )
    latent_dim: int = field(
        default=0,
        metadata={
            "help":
            "if > 0, uses this dimensionality for latent variables. "
            "otherwise uses final_dim / latent_groups"
        },
    )

    # masking
    mask_length: int = field(default=10, metadata={"help": "mask length"})
    mask_prob: float = field(
        default=0.65,
        metadata={"help": "probability of replacing a token with mask"})
    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static", metadata={"help": "how to choose mask length"})
    mask_other: float = field(
        default=0,
        metadata={
            "help":
            "secondary mask argument (used for more complex distributions), "
            "see help in compute_mask_indices"
        },
    )
    no_mask_overlap: bool = field(
        default=False, metadata={"help": "whether to allow masks to overlap"})
    mask_min_space: int = field(
        default=1,
        metadata={
            "help": "min space between spans (if no overlap is enabled)"
        },
    )

    # channel masking
    mask_channel_length: int = field(
        default=10,
        metadata={"help": "length of the mask for features (channels)"})
    mask_channel_prob: float = field(
        default=0.0,
        metadata={"help": "probability of replacing a feature with 0"})
    mask_channel_before: bool = False
    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static",
        metadata={"help": "how to choose mask length for channel masking"},
    )
    mask_channel_other: float = field(
        default=0,
        metadata={
            "help":
            "secondary mask argument (used for more complex distributions), "
            "see help in compute_mask_indicesh"
        },
    )
    no_mask_channel_overlap: bool = field(
        default=False,
        metadata={"help": "whether to allow channel masks to overlap"})
    mask_channel_min_space: int = field(
        default=1,
        metadata={
            "help": "min space between spans (if no overlap is enabled)"
        },
    )

    # negative selection
    num_negatives: int = field(
        default=100,
        metadata={"help": "number of negative examples from the same sample"},
    )
    negatives_from_everywhere: bool = field(
        default=False,
        metadata={
            "help": "sample negatives from everywhere, not just masked states"
        },
    )
    cross_sample_negatives: int = field(
        default=0,
        metadata={"help": "number of negative examples from the any sample"})
    codebook_negatives: int = field(
        default=0, metadata={"help": "number of negative examples codebook"})

    # positional embeddings
    conv_pos: int = field(
        default=128,
        metadata={
            "help": "number of filters for convolutional positional embeddings"
        },
    )
    conv_pos_groups: int = field(
        default=16,
        metadata={
            "help": "number of groups for convolutional positional embedding"
        },
    )

    latent_temp: Tuple[float, float, float] = field(
        default=(2, 0.5, 0.999995),
        metadata={
            "help":
            "temperature for latent variable sampling. "
            "can be tuple of 3 values (start, end, decay)"
        },
    )

    checkpoint_activations: bool = field(
        default=False,
        metadata={
            "help": "recompute activations and save memory for extra compute"
        },
    )
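conv_feature_layers above is stored as a Python expression string and is evaluated into a list of (dim, kernel_size, stride) tuples when the convolutional feature extractor is built. A small sketch of that parse (the helper name is ours; only eval can expand the list arithmetic in the default value, so the string must come from a trusted config):

def parse_conv_feature_layers(spec):
    # e.g. "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]"
    layers = eval(spec)  # -> list of (dim, kernel_size, stride) tuples
    for dim, kernel_size, stride in layers:
        assert dim > 0 and kernel_size > 0 and stride > 0
    return layers

spec = "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]"
print(len(parse_conv_feature_layers(spec)))  # -> 7 conv layers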
Example #27
0
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument(
         '--activation-dropout',
         '--relu-dropout',
         type=float,
         metavar='D',
         help='dropout probability after activation in FFN.')
     parser.add_argument('--encoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-layers',
                         type=int,
                         metavar='N',
                         help='num encoder layers')
     parser.add_argument('--encoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num encoder attention heads')
     parser.add_argument('--encoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument(
         '--encoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num decoder attention heads')
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings',
                         action='store_true',
                         help='share encoder, decoder and output embeddings'
                         ' (requires shared dictionary and embed dim)')
     parser.add_argument(
         '--no-token-positional-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, disables positional embeddings (outside self attention)')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
         'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
     # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
     parser.add_argument('--no-cross-attention',
                         default=False,
                         action='store_true',
                         help='do not perform cross-attention')
     parser.add_argument('--cross-self-attention',
                         default=False,
                         action='store_true',
                         help='perform cross+self-attention')
     parser.add_argument(
         '--layer-wise-attention',
         default=False,
         action='store_true',
         help=
         'perform layer-wise attention (cross-attention or cross+self-attention)'
     )
     # adanorm
     parser.add_argument('--lnv',
                         type=str,
                         default='origin',
                         help='origin, no_norm, topk, adanorm, nowb')
     parser.add_argument(
         '--sigma',
         type=float,
         default=0.005,
     )
     parser.add_argument('--adanorm_scale',
                         type=float,
                         default=2.0,
                         help='')
     parser.add_argument('--nowb_scale', type=float, default=1.0, help='')
     parser.add_argument('--mean_detach', type=int, default=0, help='')
     parser.add_argument('--std_detach', type=int, default=0, help='')
     parser.add_argument('--init_method',
                         type=str,
                         default='xavier',
                         help='xavier,km,xi')
     parser.add_argument('--init_topk_rho', type=float, default=0)
     parser.add_argument('--big_km', type=int, default=0)
     parser.add_argument(
         '--big_km_list',
         type=str,
         nargs='+',
         default=['in', 'out', 'fc1', 'fc2', 'qkv', 'attn_out'],
         help='')
Example #28
0
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # Arguments related to dropout
        parser.add_argument('--dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability for'
                            ' attention weights')
        parser.add_argument('--act-dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability after'
                            ' activation in FFN')

        # Arguments related to hidden states and self-attention
        parser.add_argument('--encoder-ffn-embed-dim',
                            type=int,
                            metavar='N',
                            help='encoder embedding dimension for FFN')
        parser.add_argument('--encoder-layers',
                            type=int,
                            metavar='N',
                            help='num encoder layers')
        parser.add_argument('--encoder-attention-heads',
                            type=int,
                            metavar='N',
                            help='num encoder attention heads')
        parser.add_argument('--bias-kv',
                            action='store_true',
                            help='if set, adding a learnable bias kv')
        parser.add_argument('--zero-attn',
                            action='store_true',
                            help='if set, pads attn with zero')

        # Arguments related to input and output embeddings
        parser.add_argument('--encoder-embed-dim',
                            type=int,
                            metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--share-encoder-input-output-embed',
                            action='store_true',
                            help='share encoder input'
                            ' and output embeddings')
        parser.add_argument(
            '--encoder-learned-pos',
            action='store_true',
            help='use learned positional embeddings in the encoder')
        parser.add_argument('--no-token-positional-embeddings',
                            action='store_true',
                            help='if set, disables positional embeddings'
                            ' (outside self attention)')
        parser.add_argument('--num-segment',
                            type=int,
                            metavar='N',
                            help='num segment in the input')

        # Arguments related to sentence level prediction
        parser.add_argument('--sentence-class-num',
                            type=int,
                            metavar='N',
                            help='number of classes for sentence task')
        parser.add_argument('--sent-loss',
                            action='store_true',
                            help='if set,'
                            ' calculate sentence level predictions')

        # Arguments related to parameter initialization
        parser.add_argument('--apply-bert-init',
                            action='store_true',
                            help='use custom param initialization for BERT')

        # misc params
        parser.add_argument('--activation-fn',
                            choices=utils.get_available_activation_fns(),
                            help='activation function to use')
        parser.add_argument(
            '--pooler-activation-fn',
            choices=utils.get_available_activation_fns(),
            help='Which activation function to use for pooler layer.')
        parser.add_argument('--encoder-normalize-before',
                            action='store_true',
                            help='apply layernorm before each encoder block')
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout', type=float, metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout', type=float, metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                         help='dropout probability after activation in FFN.')
     parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-layers', type=int, metavar='N',
                         help='num encoder layers')
     parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                         help='num encoder attention heads')
     parser.add_argument('--encoder-normalize-before', action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument('--encoder-learned-pos', action='store_true',
                         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers', type=int, metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                         help='num decoder attention heads')
     parser.add_argument('--decoder-learned-pos', action='store_true',
                         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before', action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings', action='store_true',
                         help='share encoder, decoder and output embeddings'
                              ' (requires shared dictionary and embed dim)')
     parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                         help='if set, disables positional embeddings (outside self attention)')
     parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                         help='comma separated list of adaptive softmax cutoff points. '
                              'Must be used with adaptive_loss criterion')
     parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                         help='sets adaptive softmax dropout for the tail projections')
     # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
     parser.add_argument('--no-cross-attention', default=False, action='store_true',
                         help='do not perform cross-attention')
     parser.add_argument('--cross-self-attention', default=False, action='store_true',
                         help='perform cross+self-attention')
     parser.add_argument('--layer-wise-attention', default=False, action='store_true',
                         help='perform layer-wise attention (cross-attention or cross+self-attention)')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                         help='LayerDrop probability for encoder')
     parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                         help='LayerDrop probability for decoder')
     parser.add_argument('--encoder-layers-to-keep', default=None,
                         help='which layers to *keep* when pruning as a comma-separated list')
     parser.add_argument('--decoder-layers-to-keep', default=None,
                         help='which layers to *keep* when pruning as a comma-separated list')
     parser.add_argument('--layernorm-embedding', action='store_true',
                         help='add layernorm to embedding')
     parser.add_argument('--no-scale-embedding', action='store_true',
                         help='if True, do not scale embeddings')
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument("--activation-fn",
                         choices=utils.get_available_activation_fns(),
                         help="activation function to use")
     parser.add_argument("--dropout",
                         type=float,
                         metavar="D",
                         help="dropout probability")
     parser.add_argument("--encoder-conv-channels",
                         type=str,
                         metavar="EXPR",
                         help="list of encoder convolution's out channels")
     parser.add_argument("--encoder-conv-kernel-sizes",
                         type=str,
                         metavar="EXPR",
                         help="list of encoder convolution's kernel sizes")
     parser.add_argument("--encoder-conv-strides",
                         type=str,
                         metavar="EXPR",
                         help="list of encoder convolution's strides")
     parser.add_argument("--attention-dropout",
                         type=float,
                         metavar="D",
                         help="dropout probability for attention weights")
     parser.add_argument(
         "--activation-dropout",
         "--relu-dropout",
         type=float,
         metavar="D",
         help="dropout probability after activation in FFN.")
     parser.add_argument("--encoder-ffn-embed-dim",
                         type=int,
                         metavar="N",
                         help="encoder embedding dimension for FFN")
     parser.add_argument("--encoder-layers",
                         type=int,
                         metavar="N",
                         help="num encoder layers")
     parser.add_argument("--encoder-attention-heads",
                         type=int,
                         metavar="N",
                         help="num encoder attention heads")
     parser.add_argument("--encoder-normalize-before",
                         action="store_true",
                         help="apply layernorm before each encoder block")
     parser.add_argument(
         "--encoder-transformer-context",
         type=str,
         metavar="EXPR",
         help="left/right context for time-restricted self-attention; "
         "can be None or a tuple of two non-negative integers/None")
     parser.add_argument(
         "--no-token-positional-embeddings",
         action="store_true",
         help=
         "if set, disables positional embeddings (outside self attention)")
     parser.add_argument("--layernorm-embedding",
                         action="store_true",
                         help="add layernorm to embedding")
     parser.add_argument(
         "--checkpoint-activations",
         action="store_true",
         help="checkpoint activations at each layer, which saves GPU "
         "memory usage at the cost of some additional compute")
     parser.add_argument(
         "--offload-activations",
         action="store_true",
         help=
         "checkpoint activations at each layer, then save to gpu. Sets --checkpoint-activations."
     )
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument("--encoder-layerdrop",
                         type=float,
                         metavar="D",
                         default=0,
                         help="LayerDrop probability for encoder")
     parser.add_argument(
         "--encoder-layers-to-keep",
         default=None,
         help="which layers to *keep* when pruning as a comma-separated list"
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument(
         "--quant-noise-pq",
         type=float,
         metavar="D",
         default=0,
         help="iterative PQ quantization noise at training time")
     parser.add_argument(
         "--quant-noise-pq-block-size",
         type=int,
         metavar="D",
         default=8,
         help="block size of quantization noise at training time")
     parser.add_argument(
         "--quant-noise-scalar",
         type=float,
         metavar="D",
         default=0,
         help=
         "scalar quantization noise and scalar quantization at training time"
     )
     # args for Fully Sharded Data Parallel (FSDP) training
     parser.add_argument(
         "--min-params-to-wrap",
         type=int,
         metavar="D",
         default=DEFAULT_MIN_PARAMS_TO_WRAP,
         help=
         ("minimum number of params for a layer to be wrapped with FSDP() when "
          "training with --ddp-backend=fully_sharded. Smaller values will "
          "improve memory efficiency, but may make torch.distributed "
          "communication less efficient due to smaller input sizes. This option "
          "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
          "--offload-activations are passed."))