Example 1: PipelineParallelTransformerModel.build_model
 def build_model(cls, args, task):
     encoder, decoder = cls.build_model_base(args, task)
     return PipelineParallelTransformerModel(
         encoder=encoder,
         decoder=decoder,
         balance=utils.eval_str_list(args.pipeline_balance, type=int),
         devices=utils.eval_str_list(args.pipeline_devices, type=int),
         chunks=args.pipeline_chunks,
         checkpoint=args.pipeline_checkpoint,
     )
Example 2: decoder-only LSTM language model build_model
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_architecture(args)

        if getattr(args, "max_target_positions", None) is not None:
            max_target_positions = args.max_target_positions
        else:
            max_target_positions = getattr(args, "tokens_per_sample",
                                           DEFAULT_MAX_TARGET_POSITIONS)

        def load_pretrained_embedding_from_file(embed_path, dictionary,
                                                embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path, task.target_dictionary,
                args.decoder_embed_dim)

        if args.share_decoder_input_output_embed:
            # double check all parameters combinations are valid
            if task.source_dictionary != task.target_dictionary:
                raise ValueError(
                    "--share-decoder-input-output-embeddings requires a joint dictionary"
                )

            if args.decoder_embed_dim != args.decoder_out_embed_dim:
                raise ValueError(
                    "--share-decoder-input-output-embeddings requires "
                    "--decoder-embed-dim to match --decoder-out-embed-dim")

        decoder = LSTMDecoder(
            dictionary=task.dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=False,  # decoder-only language model doesn't support attention
            encoder_output_units=0,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == "adaptive_loss"
                else None
            ),
            max_target_positions=max_target_positions,
            residuals=args.residuals,
        )

        return cls(decoder)
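
The load_pretrained_embedding_from_file helper above builds a fresh Embedding and then overwrites rows for every token that appears in a pretrained embedding text file (via utils.parse_embedding / utils.load_embedding). A minimal, self-contained sketch of that flow, using hypothetical helper names and file-format assumptions rather than fairseq's own implementations:

import torch
import torch.nn as nn

def read_embedding_file(path):
    # Hypothetical reader: each line is "<token> <v1> <v2> ...".
    embed_dict = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            pieces = line.rstrip().split(" ")
            if len(pieces) > 1:
                embed_dict[pieces[0]] = torch.tensor([float(w) for w in pieces[1:]])
    return embed_dict

def copy_pretrained_rows(embed_dict, tokens, embedding):
    # Overwrite rows of the nn.Embedding for tokens found in the file.
    for idx, tok in enumerate(tokens):
        if tok in embed_dict:
            embedding.weight.data[idx] = embed_dict[tok]
    return embedding

# Toy usage with a 4-token vocabulary and 3-dimensional embeddings.
tokens = ["<pad>", "hello", "world", "<unk>"]
embedding = nn.Embedding(len(tokens), 3, padding_idx=0)
pretrained = {"hello": torch.tensor([0.1, 0.2, 0.3])}  # stands in for read_embedding_file(path)
copy_pretrained_rows(pretrained, tokens, embedding)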
Example 3: FConvLanguageModel build_model
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if hasattr(args, "max_target_positions") and not hasattr(
                args, "tokens_per_sample"):
            args.tokens_per_sample = args.max_target_positions

        decoder = FConvDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            convolutions=eval(args.decoder_layers),
            out_embed_dim=args.decoder_embed_dim,
            attention=eval(args.decoder_attention),
            dropout=args.dropout,
            max_positions=args.tokens_per_sample,
            share_embed=False,
            positional_embeddings=False,
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == "adaptive_loss"
                else None
            ),
            adaptive_softmax_dropout=args.adaptive_softmax_dropout,
        )
        return FConvLanguageModel(decoder)
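
Note that eval(args.decoder_layers) and eval(args.decoder_attention) above expect string-valued arguments that evaluate to Python structures; the convolution spec becomes a list of (out_channels, kernel_width) tuples. A small illustration with made-up values (ast.literal_eval would not work here, since the strings may contain expressions such as list repetition):

# Made-up values; real architectures define their own layer strings.
decoder_layers = "[(512, 3)] * 4 + [(1024, 5)] * 2"
decoder_attention = "False"

convolutions = eval(decoder_layers)   # list of (out_channels, kernel_width) tuples
attention = eval(decoder_attention)   # a bool (or per-layer list) in the same spirit

assert convolutions == [(512, 3), (512, 3), (512, 3), (512, 3), (1024, 5), (1024, 5)]
assert attention is False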
Example 4: _pipeline_parallel_pre_init validation for pipeline model parallelism
def _pipeline_parallel_pre_init(cfg: DistributedTrainingConfig):
    from fairseq import utils

    balance_exists = (
        cfg.pipeline_balance is not None
        or cfg.pipeline_encoder_balance is not None
        or cfg.pipeline_decoder_balance is not None
    )
    devices_exist = (
        cfg.pipeline_devices is not None
        or cfg.pipeline_encoder_devices is not None
        or cfg.pipeline_decoder_devices is not None
    )
    if not balance_exists:
        raise ValueError(
            "--pipeline-balance is currently required for pipeline model parallelism"
        )
    if not devices_exist:
        raise ValueError(
            "--pipeline-devices is currently required for pipeline model parallelism"
        )

    cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int)
    if cfg.pipeline_devices is not None:
        cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int)
        num_pipeline_devices = len(set(cfg.pipeline_devices))
    else:
        cfg.pipeline_encoder_devices = utils.eval_str_list(
            cfg.pipeline_encoder_devices, type=int
        )
        cfg.pipeline_decoder_devices = utils.eval_str_list(
            cfg.pipeline_decoder_devices, type=int
        )
        num_pipeline_devices = len(
            set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices)
        )
    gpus_per_node = torch.cuda.device_count()
    assert (
        gpus_per_node >= num_pipeline_devices
        and gpus_per_node % num_pipeline_devices == 0
    ), (
        "the number of unique device IDs in --pipeline-devices must evenly divide "
        "the number of GPUs per node (multi-node pipelining is not yet supported)"
    )
    num_pipelines_per_node = gpus_per_node // num_pipeline_devices
    return num_pipeline_devices, num_pipelines_per_node
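
The device-count check at the end only depends on the number of unique device IDs in the parsed devices list. A toy walk-through of that arithmetic with made-up flag values:

# Made-up values: --pipeline-devices "[0,1,2,3]" on a node with 8 GPUs.
pipeline_devices = [0, 1, 2, 3]   # result of utils.eval_str_list("[0,1,2,3]", type=int)
gpus_per_node = 8                 # what torch.cuda.device_count() would report

num_pipeline_devices = len(set(pipeline_devices))                # 4 unique devices
assert gpus_per_node % num_pipeline_devices == 0                 # 8 % 4 == 0
num_pipelines_per_node = gpus_per_node // num_pipeline_devices   # 8 // 4 == 2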
Example 5: pipeline-parallel Transformer decoder __init__
 def __init__(
     self,
     args,
     dictionary,
     embed_tokens,
     no_encoder_attn=False,
     decoder_module_list=None,
 ):
     super().__init__(dictionary)
     self.register_buffer("version", torch.Tensor([3]))
     import_pipe()
     self.use_pipeline = decoder_module_list is not None
     if not self.use_pipeline:
         self.embedding_layer = TransformerDecoderEmbedding(
             args, embed_tokens)
         self.decoder_layers = nn.Sequential(*[
             TransformerDecoderLayer(args, no_encoder_attn)
             for _ in range(args.decoder_layers)
         ])
         self.decoder_output_layer = TransformerDecoderOutputLayer(
             args, embed_tokens, dictionary)
     else:
         decoder_balance = utils.eval_str_list(
             args.pipeline_decoder_balance, type=int)
         decoder_devices = utils.eval_str_list(
             args.pipeline_decoder_devices, type=int)
         assert sum(decoder_balance) == len(decoder_module_list), (
             f"Sum of decoder_balance={decoder_balance} is not equal " +
             f"to num_decoder_modules={len(decoder_module_list)}")
         if TORCH_PIPE:
             self.model = Pipe(
                 module=partition_model(nn.Sequential(*decoder_module_list),
                                        decoder_balance, decoder_devices),
                 chunks=args.pipeline_chunks,
                 checkpoint=args.pipeline_checkpoint,
             )
         else:
             self.model = Pipe(
                 module=nn.Sequential(*decoder_module_list),
                 balance=decoder_balance,
                 devices=decoder_devices,
                 chunks=args.pipeline_chunks,
                 checkpoint=args.pipeline_checkpoint,
             )
Example 6: pipeline-parallel Transformer encoder __init__
 def __init__(self,
              args,
              dictionary,
              embed_tokens,
              encoder_module_list=None):
     super().__init__(dictionary)
     self.register_buffer("version", torch.Tensor([3]))
     import_pipe()
     self.use_pipeline = encoder_module_list is not None
     if not self.use_pipeline:
         self.embedding_layer = TransformerEncoderEmbedding(
             args, embed_tokens)
         self.encoder_layers = nn.Sequential(*[
             TransformerEncoderLayer(args)
             for i in range(args.encoder_layers)
         ])
         if isinstance(embed_tokens, nn.ModuleList):
             emb_dim = sum(e.embedding_dim for e in embed_tokens)
         else:
             emb_dim = embed_tokens.embedding_dim
         self.final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim)
     else:
         encoder_balance = utils.eval_str_list(
             args.pipeline_encoder_balance, type=int)
         encoder_devices = utils.eval_str_list(
             args.pipeline_encoder_devices, type=int)
         assert sum(encoder_balance) == len(encoder_module_list), (
             f"Sum of encoder_balance={encoder_balance} is not equal " +
             f"to num_encoder_modules={len(encoder_module_list)}")
         if TORCH_PIPE:
             self.model = Pipe(
                 module=partition_model(nn.Sequential(*encoder_module_list),
                                        encoder_balance, encoder_devices),
                 chunks=args.pipeline_chunks,
                 checkpoint=args.pipeline_checkpoint,
             )
         else:
             self.model = Pipe(
                 module=nn.Sequential(*encoder_module_list),
                 balance=encoder_balance,
                 devices=encoder_devices,
                 chunks=args.pipeline_chunks,
                 checkpoint=args.pipeline_checkpoint,
             )
Example 7: LightConvLanguageModel build_model
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if getattr(args, "max_source_positions", None) is None:
            args.max_source_positions = args.tokens_per_sample
        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = args.tokens_per_sample

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.dictionary,
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.dictionary),
                task.dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                utils.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            embed_tokens = Embedding(len(task.dictionary),
                                     args.decoder_input_dim,
                                     task.dictionary.pad())

        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert (args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
                    ), "{} != {}".format(args.adaptive_softmax_cutoff,
                                         args.adaptive_input_cutoff)
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = LightConvDecoder(
            args,
            task.output_dictionary,
            embed_tokens,
            no_encoder_attn=True,
            final_norm=False,
        )
        return LightConvLanguageModel(decoder)
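
When --tie-adaptive-weights is set, the assertions above require the adaptive input and adaptive softmax clusterings to agree. A small illustration of the cutoff parsing and the consistency check, with made-up cutoff strings (fairseq parses them with utils.eval_str_list(..., type=int)):

from ast import literal_eval

# Made-up cutoff strings; each defines vocabulary frequency bands.
adaptive_input_cutoff = "[20000,60000]"
adaptive_softmax_cutoff = "[20000,60000]"

input_cutoff = [int(x) for x in literal_eval(adaptive_input_cutoff)]
softmax_cutoff = [int(x) for x in literal_eval(adaptive_softmax_cutoff)]

# Tying input and output weights only makes sense when both sides use the
# same bands, mirroring the assert in build_model above.
assert softmax_cutoff == input_cutoff, "{} != {}".format(softmax_cutoff, input_cutoff)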
Example 8: LightConv language model add_args
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument(
         "--dropout",
         default=0.1,
         type=float,
         metavar="D",
         help="dropout probability",
     )
     parser.add_argument(
         "--attention-dropout",
         default=0.0,
         type=float,
         metavar="D",
         help="dropout probability for attention weights",
     )
     parser.add_argument(
         "--relu-dropout",
         default=0.0,
         type=float,
         metavar="D",
         help="dropout probability after ReLU in FFN",
     )
     parser.add_argument(
         "--input-dropout",
         type=float,
         metavar="D",
         help="dropout probability of the inputs",
     )
     parser.add_argument(
         "--decoder-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension",
     )
     parser.add_argument(
         "--decoder-output-dim",
         type=int,
         metavar="N",
         help="decoder output dimension",
     )
     parser.add_argument("--decoder-input-dim",
                         type=int,
                         metavar="N",
                         help="decoder input dimension")
     parser.add_argument(
         "--decoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension for FFN",
     )
     parser.add_argument("--decoder-layers",
                         type=int,
                         metavar="N",
                         help="num decoder layers")
     parser.add_argument(
         "--decoder-attention-heads",
         type=int,
         metavar="N",
         help="num decoder attention heads or LightConv/DynamicConv heads",
     )
     parser.add_argument(
         "--decoder-normalize-before",
         default=False,
         action="store_true",
         help="apply layernorm before each decoder block",
     )
     parser.add_argument(
         "--adaptive-softmax-cutoff",
         metavar="EXPR",
         help="comma separated list of adaptive softmax cutoff points. "
         "Must be used with adaptive_loss criterion",
     )
     parser.add_argument(
         "--adaptive-softmax-dropout",
         type=float,
         metavar="D",
         help="sets adaptive softmax dropout for the tail projections",
     )
     parser.add_argument(
         "--adaptive-softmax-factor",
         type=float,
         metavar="N",
         help="adaptive input factor",
     )
     parser.add_argument(
         "--no-token-positional-embeddings",
         default=False,
         action="store_true",
         help="if set, disables positional embeddings (outside self attention)",
     )
     parser.add_argument(
         "--share-decoder-input-output-embed",
         default=False,
         action="store_true",
         help="share decoder input and output embeddings",
     )
     parser.add_argument(
         "--character-embeddings",
         default=False,
         action="store_true",
         help="if set, uses character embedding convolutions to produce token embeddings",
     )
     parser.add_argument(
         "--character-filters",
         type=str,
         metavar="LIST",
         default=
         "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
         help="character CNN filters as a list of (width, num_filters) tuples",
     )
     parser.add_argument(
         "--character-embedding-dim",
         type=int,
         metavar="N",
         default=4,
         help="size of character embeddings",
     )
     parser.add_argument(
         "--char-embedder-highway-layers",
         type=int,
         metavar="N",
         default=2,
         help="number of highway layers for character token embeddder",
     )
     parser.add_argument(
         "--adaptive-input",
         default=False,
         action="store_true",
         help="if set, uses adaptive input",
     )
     parser.add_argument(
         "--adaptive-input-factor",
         type=float,
         metavar="N",
         help="adaptive input factor",
     )
     parser.add_argument(
         "--adaptive-input-cutoff",
         metavar="EXPR",
         help="comma separated list of adaptive input cutoff points.",
     )
     parser.add_argument(
         "--tie-adaptive-weights",
         action="store_true",
         help="if set, ties the weights of adaptive softmax and adaptive input",
     )
     parser.add_argument(
         "--tie-adaptive-proj",
         action="store_true",
         help="if set, ties the projection weights of adaptive softmax and adaptive input",
     )
     parser.add_argument(
         "--decoder-learned-pos",
         action="store_true",
         help="use learned positional embeddings in the decoder",
     )
     """LightConv and DynamicConv arguments"""
     parser.add_argument(
         "--decoder-kernel-size-list",
         type=lambda x: utils.eval_str_list(x, int),
         help='list of kernel sizes (default: "[3,7,15,31,31,31]")',
     )
     parser.add_argument("--decoder-glu",
                         type=utils.eval_bool,
                         help="glu after in proj")
     parser.add_argument(
         "--decoder-conv-type",
         default="dynamic",
         type=str,
         choices=["dynamic", "lightweight"],
         help="type of convolution",
     )
     parser.add_argument("--weight-softmax",
                         default=True,
                         type=utils.eval_bool)
     parser.add_argument(
         "--weight-dropout",
         type=float,
         metavar="D",
         help="dropout probability for conv weights",
     )
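
Several flags above take Python-style list literals on the command line via type=lambda x: utils.eval_str_list(x, int). A self-contained sketch of that parsing pattern with a stand-in helper (only an approximation of fairseq's eval_str_list):

import argparse
from ast import literal_eval

def str_to_int_list(x):
    # Stand-in for utils.eval_str_list(x, int): "[3,7,15]" -> [3, 7, 15].
    value = literal_eval(x)
    if isinstance(value, int):
        return [value]
    return [int(v) for v in value]

parser = argparse.ArgumentParser()
parser.add_argument("--decoder-kernel-size-list", type=str_to_int_list)
args = parser.parse_args(["--decoder-kernel-size-list", "[3,7,15,31,31,31]"])
assert args.decoder_kernel_size_list == [3, 7, 15, 31, 31, 31]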
Example 9: LSTM encoder-decoder build_model
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure that all args are properly defaulted (in case there are any new ones)
        base_architecture(args)

        if args.encoder_layers != args.decoder_layers:
            raise ValueError("--encoder-layers must match --decoder-layers")

        max_source_positions = getattr(args, "max_source_positions",
                                       DEFAULT_MAX_SOURCE_POSITIONS)
        max_target_positions = getattr(args, "max_target_positions",
                                       DEFAULT_MAX_TARGET_POSITIONS)

        def load_pretrained_embedding_from_file(embed_path, dictionary,
                                                embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        if args.encoder_embed_path:
            pretrained_encoder_embed = load_pretrained_embedding_from_file(
                args.encoder_embed_path, task.source_dictionary,
                args.encoder_embed_dim)
        else:
            num_embeddings = len(task.source_dictionary)
            pretrained_encoder_embed = Embedding(num_embeddings,
                                                 args.encoder_embed_dim,
                                                 task.source_dictionary.pad())

        if args.share_all_embeddings:
            # double check all parameters combinations are valid
            if task.source_dictionary != task.target_dictionary:
                raise ValueError(
                    "--share-all-embeddings requires a joint dictionary")
            if args.decoder_embed_path and (args.decoder_embed_path !=
                                            args.encoder_embed_path):
                raise ValueError(
                    "--share-all-embed not compatible with --decoder-embed-path"
                )
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to "
                    "match --decoder-embed-dim")
            pretrained_decoder_embed = pretrained_encoder_embed
            args.share_decoder_input_output_embed = True
        else:
            # separate decoder input embeddings
            pretrained_decoder_embed = None
            if args.decoder_embed_path:
                pretrained_decoder_embed = load_pretrained_embedding_from_file(
                    args.decoder_embed_path,
                    task.target_dictionary,
                    args.decoder_embed_dim,
                )
        # one last double check of parameter combinations
        if args.share_decoder_input_output_embed and (
                args.decoder_embed_dim != args.decoder_out_embed_dim):
            raise ValueError(
                "--share-decoder-input-output-embeddings requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim")

        if args.encoder_freeze_embed:
            pretrained_encoder_embed.weight.requires_grad = False
        if args.decoder_freeze_embed:
            pretrained_decoder_embed.weight.requires_grad = False

        encoder = LSTMEncoder(
            dictionary=task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            hidden_size=args.encoder_hidden_size,
            num_layers=args.encoder_layers,
            dropout_in=args.encoder_dropout_in,
            dropout_out=args.encoder_dropout_out,
            bidirectional=args.encoder_bidirectional,
            pretrained_embed=pretrained_encoder_embed,
            max_source_positions=max_source_positions,
        )
        decoder = LSTMDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=utils.eval_bool(args.decoder_attention),
            encoder_output_units=encoder.output_units,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == "adaptive_loss"
                else None
            ),
            max_target_positions=max_target_positions,
            residuals=False,
        )
        return cls(encoder, decoder)
Example 10: LightConvDecoder __init__
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 no_encoder_attn=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = (PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            LightConvDecoderLayer(args,
                                  no_encoder_attn,
                                  kernel_size=args.decoder_kernel_size_list[i])
            for i in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = (Linear(embed_dim, output_embed_dim, bias=False)
                                if embed_dim != output_embed_dim
                                and not args.tie_adaptive_weights else None)

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5)
        self.register_buffer("version", torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
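
The constructor only allocates the output projection (embed_out, or the tied embed_tokens weight); how it is applied lives in the forward pass, which is not shown here. A generic sketch of the usual pattern, under the assumption that logits come from a plain linear map over the vocabulary:

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, output_embed_dim = 100, 16
embed_tokens = nn.Embedding(vocab_size, output_embed_dim)

# Untied case: a separate output matrix, initialised like embed_out above.
embed_out = nn.Parameter(torch.Tensor(vocab_size, output_embed_dim))
nn.init.normal_(embed_out, mean=0, std=output_embed_dim ** -0.5)

x = torch.randn(2, 5, output_embed_dim)           # decoder features (batch, time, dim)
logits_untied = F.linear(x, embed_out)            # project to vocabulary
logits_tied = F.linear(x, embed_tokens.weight)    # weight tying reuses the input embedding
assert logits_untied.shape == logits_tied.shape == (2, 5, vocab_size)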
Example 11: LightConv encoder-decoder add_args
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument("--dropout",
                         type=float,
                         metavar="D",
                         help="dropout probability")
     parser.add_argument(
         "--attention-dropout",
         type=float,
         metavar="D",
         help="dropout probability for attention weights",
     )
     parser.add_argument(
         "--relu-dropout",
         type=float,
         metavar="D",
         help="dropout probability after ReLU in FFN",
     )
     parser.add_argument(
         "--input-dropout",
         type=float,
         metavar="D",
         help="dropout probability of the inputs",
     )
     parser.add_argument(
         "--encoder-embed-path",
         type=str,
         metavar="STR",
         help="path to pre-trained encoder embedding",
     )
     parser.add_argument(
         "--encoder-embed-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension",
     )
     parser.add_argument(
         "--encoder-conv-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension",
     )
     parser.add_argument(
         "--encoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension for FFN",
     )
     parser.add_argument("--encoder-layers",
                         type=int,
                         metavar="N",
                         help="num encoder layers")
     parser.add_argument(
         "--encoder-attention-heads",
         type=int,
         metavar="N",
         help="num encoder attention heads or LightConv/DynamicConv heads",
     )
     parser.add_argument(
         "--encoder-normalize-before",
         action="store_true",
         help="apply layernorm before each encoder block",
     )
     parser.add_argument(
         "--encoder-learned-pos",
         action="store_true",
         help="use learned positional embeddings in the encoder",
     )
     parser.add_argument(
         "--decoder-embed-path",
         type=str,
         metavar="STR",
         help="path to pre-trained decoder embedding",
     )
     parser.add_argument(
         "--decoder-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension",
     )
     parser.add_argument(
         "--decoder-conv-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension",
     )
     parser.add_argument(
         "--decoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension for FFN",
     )
     parser.add_argument("--decoder-layers",
                         type=int,
                         metavar="N",
                         help="num decoder layers")
     parser.add_argument(
         "--decoder-attention-heads",
         type=int,
         metavar="N",
         help="num decoder attention heads or LightConv/DynamicConv heads",
     )
     parser.add_argument(
         "--decoder-learned-pos",
         action="store_true",
         help="use learned positional embeddings in the decoder",
     )
     parser.add_argument(
         "--decoder-normalize-before",
         action="store_true",
         help="apply layernorm before each decoder block",
     )
     parser.add_argument(
         "--share-decoder-input-output-embed",
         action="store_true",
         help="share decoder input and output embeddings",
     )
     parser.add_argument(
         "--share-all-embeddings",
         action="store_true",
         help="share encoder, decoder and output embeddings"
         " (requires shared dictionary and embed dim)",
     )
     parser.add_argument(
         "--adaptive-softmax-cutoff",
         metavar="EXPR",
         help="comma separated list of adaptive softmax cutoff points. "
         "Must be used with adaptive_loss criterion",
     )
     parser.add_argument(
         "--adaptive-softmax-dropout",
         type=float,
         metavar="D",
         help="sets adaptive softmax dropout for the tail projections",
     )
     """LightConv and DynamicConv arguments"""
     parser.add_argument(
         "--encoder-kernel-size-list",
         type=lambda x: utils.eval_str_list(x, int),
         help='list of kernel sizes (default: "[3,7,15,31,31,31,31]")',
     )
     parser.add_argument(
         "--decoder-kernel-size-list",
         type=lambda x: utils.eval_str_list(x, int),
         help='list of kernel sizes (default: "[3,7,15,31,31,31]")',
     )
     parser.add_argument("--encoder-glu",
                         type=utils.eval_bool,
                         help="glu after in proj")
     parser.add_argument("--decoder-glu",
                         type=utils.eval_bool,
                         help="glu after in proj")
     parser.add_argument(
         "--encoder-conv-type",
         default="dynamic",
         type=str,
         choices=["dynamic", "lightweight"],
         help="type of convolution",
     )
     parser.add_argument(
         "--decoder-conv-type",
         default="dynamic",
         type=str,
         choices=["dynamic", "lightweight"],
         help="type of convolution",
     )
     parser.add_argument("--weight-softmax",
                         default=True,
                         type=utils.eval_bool)
     parser.add_argument(
         "--weight-dropout",
         type=float,
         metavar="D",
         help="dropout probability for conv weights",
     )