Example 1
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument("--dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--relu-dropout",
            type=float,
            metavar="D",
            help="dropout probability after ReLU in FFN",
        )
        parser.add_argument(
            "--encoder-pretrained-embed",
            type=str,
            metavar="STR",
            help="path to pre-trained encoder embedding",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--encoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--encoder-layers",
                            type=int,
                            metavar="N",
                            help="num encoder layers")
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            default=False,
            action="store_true",
            help="apply layernorm before each encoder block",
        )
        parser.add_argument(
            "--encoder-learned-pos",
            default=False,
            action="store_true",
            help="use learned positional embeddings in the encoder",
        )
        parser.add_argument(
            "--decoder-pretrained-embed",
            type=str,
            metavar="STR",
            help="path to pre-trained decoder embedding",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="num decoder layers")
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads",
        )
        parser.add_argument(
            "--decoder-reduced-attention-dim",
            type=int,
            default=None,
            metavar="N",
            help="if specified, computes attention with this dimensionality "
            "(instead of using encoder output dims)",
        )
        parser.add_argument(
            "--decoder-lstm-units",
            type=int,
            metavar="N",
            help="num LSTM units for each decoder layer",
        )
        parser.add_argument(
            "--decoder-out-embed-dim",
            default=None,
            type=int,
            metavar="N",
            help="decoder output embedding dimension",
        )

        # Args for vocab reduction
        vocab_reduction.add_args(parser)
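
The sketch below shows how an add_args hook like the one above is typically wired into argparse; it is a minimal, self-contained sketch, not the codebase's actual entry point. The vocab_reduction stub is hypothetical and stands in for the real module, and the argument list is trimmed to two representative entries.

    import argparse

    class vocab_reduction:  # hypothetical stub so the sketch runs standalone
        @staticmethod
        def add_args(parser):
            pass

    def add_args(parser):
        parser.add_argument("--dropout", type=float, metavar="D",
                            help="dropout probability")
        parser.add_argument("--encoder-freeze-embed", default=False,
                            action="store_true",
                            help="freeze the encoder embedding during training")
        vocab_reduction.add_args(parser)

    parser = argparse.ArgumentParser()
    add_args(parser)
    args = parser.parse_args(["--dropout", "0.2", "--encoder-freeze-embed"])
    assert args.dropout == 0.2
    # argparse maps dashes in option names to underscores on the namespace
    assert args.encoder_freeze_embed is True
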
Example 2
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument("--dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--relu-dropout",
            type=float,
            metavar="D",
            help="dropout probability after ReLU in FFN",
        )
        parser.add_argument(
            "--encoder-pretrained-embed",
            type=str,
            metavar="STR",
            help="path to pre-trained encoder embedding",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--encoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--encoder-layers",
                            type=int,
                            metavar="N",
                            help="num encoder layers")
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            default=False,
            action="store_true",
            help="apply layernorm before each encoder block",
        )
        parser.add_argument(
            "--encoder-learned-pos",
            default=False,
            action="store_true",
            help="use learned positional embeddings in the encoder",
        )
        parser.add_argument(
            "--decoder-pretrained-embed",
            type=str,
            metavar="STR",
            help="path to pre-trained decoder embedding",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="num decoder layers")
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads",
        )
        parser.add_argument(
            "--decoder-learned-pos",
            default=False,
            action="store_true",
            help="use learned positional embeddings in the decoder",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            default=False,
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            default=False,
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--share-all-embeddings",
            default=False,
            action="store_true",
            help="share encoder, decoder and output embeddings"
            " (requires shared dictionary and embed dim)",
        )
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            default=None,
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion",
        )

        # Args for vocab reduction
        vocab_reduction.add_args(parser)
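
Note that --adaptive-softmax-cutoff above is declared without a type, so it arrives as a raw comma-separated string and has to be parsed downstream. A minimal sketch of that parse, assuming a plain list-of-ints interpretation (the codebase may use its own evaluation helper):

    def parse_cutoff(expr):
        # "10000,50000" -> [10000, 50000]; None (flag not given) stays None
        if expr is None:
            return None
        return [int(x) for x in expr.split(",")]

    assert parse_cutoff("10000,50000") == [10000, 50000]
    assert parse_cutoff(None) is None
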
Example 3
    def add_args(parser):
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            default=0,
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-pretrained-embed",
            default=None,
            metavar="FILE",
            help="path to pre-trained encoder embedding",
        )
        parser.add_argument(
            "--encoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--encoder-hidden-dim",
                            type=int,
                            metavar="N",
                            help="encoder cell num units")
        parser.add_argument("--encoder-layers",
                            type=int,
                            metavar="N",
                            help="number of encoder layers")
        parser.add_argument(
            "--encoder-bidirectional",
            action="store_true",
            help="whether the first layer is bidirectional or not",
        )
        parser.add_argument(
            "--averaging-encoder",
            default=False,
            action="store_true",
            help=("whether use mean encoder hidden states as decoder initial "
                  "states or not"),
        )
        parser.add_argument(
            "--decoder-embed-dim",
            default=0,
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-pretrained-embed",
            default=None,
            metavar="FILE",
            help="path to pre-trained decoder embedding",
        )
        parser.add_argument(
            "--decoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the decoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--decoder-hidden-dim",
                            type=int,
                            metavar="N",
                            help="decoder cell num units")
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="number of decoder layers")
        parser.add_argument(
            "--decoder-out-embed-dim",
            type=int,
            metavar="N",
            help="decoder output embedding dimension",
        )
        parser.add_argument(
            "--decoder-out-pretrained-embed",
            default=None,
            metavar="FILE",
            help="path to pre-trained decoder output embedding",
        )
        parser.add_argument(
            "--decoder-tie-embeddings",
            default=False,
            action="store_true",
            help="tie the decoder word embeddings with the output projection "
            "weights (requires that the embedding dims be of the same size)",
        )
        parser.add_argument(
            "--attention-type",
            type=str,
            metavar="EXPR",
            help="decoder attention, defaults to dot",
        )
        parser.add_argument(
            "--residual-level",
            default=None,
            type=int,
            help=
            ("First layer where to apply a residual connection. "
             "The value should be greater than 0 and smaller than the number of "
             "layers."),
        )
        parser.add_argument(
            "--cell-type",
            default="lstm",
            type=str,
            metavar="EXPR",
            help=
            "cell type, defaults to lstm, values:lstm, milstm, layer_norm_lstm",
        )

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument(
            "--encoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for encoder input embedding",
        )
        parser.add_argument(
            "--encoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for encoder output",
        )
        parser.add_argument(
            "--decoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for decoder input embedding",
        )
        parser.add_argument(
            "--decoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for decoder output",
        )
        parser.add_argument(
            "--sequence-lstm",
            action="store_true",
            help="use nn.LSTM implementation for encoder",
        )
        parser.add_argument(
            "--ngram-decoder",
            default=None,
            type=int,
            nargs="+",
            help=(
                "A single integer, or a list of integers. If "
                "positive, the decoder is not recurrent but a feedforward "
                "network with target-side n-gram history as input. The decoder "
                "is still conditioned on the source side via attention. If "
                "this parameter is a list of integers, the n-th entry applies "
                "to the n-th decoder (for multilingual models and "
                "multi-decoders)"),
        )
        parser.add_argument(
            "--ngram-activation-type",
            default="relu",
            type=str,
            metavar="EXPR",
            help=("Activation in FF layers of the ngram decoder, defaults to "
                  "relu, values: relu, tanh"),
        )
        parser.add_argument(
            "--multi-encoder",
            default=None,
            type=int,
            help=(
                "If this is positive, train n encoder networks rather than "
                "only one. The outputs of the encoders are concatenated before "
                "passing them through to the decoder."),
        )
        parser.add_argument(
            "--multi-decoder",
            default=None,
            type=int,
            help=("If this is positive, train n decoder networks rather than "
                  "only one. The predictions are combined via the method in "
                  "--multi-decoder-combination-strategy."),
        )
        parser.add_argument(
            "--multi-decoder-combination-strategy",
            default="bottleneck",
            type=str,
            metavar="EXPR",
            help=(
                "Only used if --multi-decoder is positive. Controls how the "
                "decoders are combined with each other.\n"
                "- uniform: Separate projection layers, average predictions\n"
                "- uniform-probspace: Separate projection layers, average "
                "in probability space.\n"
                "- uniform-logprobspace: Separate projection layers, average "
                "in log-probability space.\n"
                "- unprojected: Shared projection layer, unprojected "
                "decoder outputs are averaged.\n"
                "- deepfusion: cf. https://arxiv.org/pdf/1503.03535.pdf \n"
                "- coldfusion: cf. https://arxiv.org/pdf/1708.06426.pdf \n"
                "- weighted: Separate projection layers, weighted average "
                "of logits. Weights are learned from unprojected decoder "
                "outputs.\n"
                "- weighted-probspace: Like 'weighted', but average in "
                "probability space.\n"
                "- weighted-logprobspace: Like 'weighted', but average in "
                "log-probability space.\n"
                "- weighted-unprojected: Shared projection layer, weighted "
                "average of decoder outputs. Weights are learned from "
                "unprojected decoder outputs.\n"
                "- concat: Shared projection layer, decoder outputs are "
                "concatenated.\n"
                "- bottleneck: Like 'concat' but with an additional "
                "bottleneck layer to reduce the size of the output embedding "
                "matrix.\n"
                "- deep_bottleneck: Like 'bottleneck' but with an additional "
                "non-linear layer.\n"
                "- multiplicative-unprojected: Shared projection layer, element"
                "-wise product of decoder outputs after ReLU.\n"
                "- max-unprojected: Shared projection layer, element"
                "-wise max of decoder outputs.\n"),
        )
        parser.add_argument(
            "--multi-model-fixed-weights",
            default=None,
            type=float,
            nargs="+",
            help=(
                "Used for weighted* combination strategies. If specified, use "
                "these fixed model weights rather than a gating network."),
        )
        parser.add_argument(
            "--multi-model-training-schedule",
            default="complete",
            type=str,
            metavar="EXPR",
            help=
            ("Only used if --multi-decoder is positive.\n"
             "- 'complete': Jointly train entire network on all batches.\n"
             "- 'unfreeze_single': Freeze all submodels except one for each "
             "training batch.\n"
             "- 'unfreeze_single_encoder': Freeze all encoders except one "
             "for each training batch.\n"
             "- 'unfreeze_single_decoder': Freeze all decoders except one "
             "for each training batch.\n"
             "- 'unfreeze_enc_N': Freeze N-th encoder.\n"
             "- 'unfreeze_dec_N': Freeze N-th decoder.\n"
             "- 'unfreeze_encdec_N': Freeze N-th encoder and N-th decoder.\n"
             "- 'freeze_all': Freeze all submodels, only train combination "
             "strategy.\n"
             "- 'freeze_all_encoders': Freeze all encoders.\n"
             "- 'freeze_all_decoders': Freeze all decoders.\n"
             "- 'separate': Each training batch is used for only one of the "
             "following: Train the n-th submodel, or train combination "
             "strategy."),
        )
        parser.add_argument(
            "--multi-decoder-is-lm",
            default=None,
            type=int,
            nargs="+",
            help=
            ("If specified, sets --attention-type=no and --encoder-hidden-dim=0"
             "for the n-th decoder in an adaptive ensemble."),
        )

        # Args for vocab reduction
        vocab_reduction.add_args(parser)
        # Args for word dropout
        word_dropout.add_args(parser)
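
--ngram-decoder and --multi-model-fixed-weights above combine type with nargs="+": argparse applies the type callable to each token and collects the results into a list. A minimal demonstration of that behavior:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--ngram-decoder", default=None, type=int, nargs="+")

    assert parser.parse_args([]).ngram_decoder is None
    assert parser.parse_args(["--ngram-decoder", "4"]).ngram_decoder == [4]
    assert parser.parse_args(
        ["--ngram-decoder", "3", "4", "5"]).ngram_decoder == [3, 4, 5]
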
Example 4
    def add_args(parser):
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--encoder-hidden-dim",
                            type=int,
                            metavar="N",
                            help="encoder cell num units")
        parser.add_argument("--encoder-layers",
                            type=int,
                            metavar="N",
                            help="number of encoder layers")
        parser.add_argument(
            "--encoder-bidirectional",
            action="store_true",
            help="whether the first layer is bidirectional or not",
        )
        parser.add_argument(
            "--averaging-encoder",
            default=False,
            action="store_true",
            help=("whether use mean encoder hidden states as decoder initial "
                  "states or not"),
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--decoder-hidden-dim",
                            type=int,
                            metavar="N",
                            help="decoder cell num units")
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="number of decoder layers")
        parser.add_argument(
            "--decoder-out-embed-dim",
            type=int,
            metavar="N",
            help="decoder output embedding dimension",
        )
        parser.add_argument(
            "--attention-type",
            type=str,
            metavar="EXPR",
            help="decoder attention, defaults to dot",
        )
        parser.add_argument(
            "--residual-level",
            default=None,
            type=int,
            help=
            ("First layer where to apply a residual connection. "
             "The value should be greater than 0 and smaller than the number of "
             "layers."),
        )
        parser.add_argument(
            "--cell-type",
            default="lstm",
            type=str,
            metavar="EXPR",
            help=
            "cell type, defaults to lstm, values:lstm, milstm, layer_norm_lstm",
        )

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument(
            "--encoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for encoder input embedding",
        )
        parser.add_argument(
            "--encoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for encoder output",
        )
        parser.add_argument(
            "--decoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for decoder input embedding",
        )
        parser.add_argument(
            "--decoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for decoder output",
        )
        parser.add_argument(
            "--sequence-lstm",
            action="store_true",
            help="use nn.LSTM implementation for encoder",
        )
        parser.add_argument(
            "--ngram-decoder",
            default=None,
            type=int,
            help=(
                "If this is positive, we use an n-gram based feedforward "
                "network in the decoder rather than recurrence. The decoder is "
                "still conditioned on the source side via attention."),
        )
        parser.add_argument(
            "--ngram-activation-type",
            default="relu",
            type=str,
            metavar="EXPR",
            help=("Activation in FF layers of the ngram decoder, defaults to "
                  "relu, values: relu, tanh"),
        )

        # Args for vocab reduction
        vocab_reduction.add_args(parser)
        # Args for word dropout
        word_dropout.add_args(parser)
Example 5
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument("--dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--relu-dropout",
            type=float,
            metavar="D",
            help="dropout probability after ReLU in FFN",
        )
        parser.add_argument(
            "--encoder-embed-path",
            type=str,
            metavar="STR",
            help="path to pre-trained encoder embedding",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument("--encoder-layers",
                            type=int,
                            metavar="N",
                            help="num encoder layers")
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            action="store_true",
            help="apply layernorm before each encoder block",
        )
        parser.add_argument(
            "--encoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the encoder",
        )
        parser.add_argument(
            "--decoder-embed-path",
            type=str,
            metavar="STR",
            help="path to pre-trained decoder embedding",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="num decoder layers")
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads",
        )
        parser.add_argument(
            "--decoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the decoder",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--share-all-embeddings",
            action="store_true",
            help="share encoder, decoder and output embeddings"
            " (requires shared dictionary and embed dim)",
        )
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion",
        )
        parser.add_argument(
            "--adaptive-softmax-dropout",
            type=float,
            metavar="D",
            help="sets adaptive softmax dropout for the tail projections",
        )
        # AAN only
        parser.add_argument(
            "--decoder-attn-window-size",
            default=0,
            type=int,
            help=
            "attention window size of the decoder (default: 0 (unlimited))",
        )
        parser.add_argument(
            "--no-decoder-aan-ffn",
            default=False,
            action="store_true",
            help="no FFN in the AAN block",
        )
        parser.add_argument(
            "--no-decoder-aan-gating",
            default=False,
            action="store_true",
            help="no Gating in the AAN block",
        )
        parser.add_argument(
            "--decoder-aan-ffn-use-embed-dim",
            default=False,
            action="store_true",
            help="""using decoder_embed_dim instead of decoder_ffn_embed_dim \
            as the hidden size of the FFN in AAN""",
        )
        parser.add_argument(
            "--decoder-aan-more-dropouts",
            type=lambda x: set(x.split(",")),
            help="places to add more dropout in AAN, accepting multiple "
            "values in [residual/after_avg/after_aan] separated by commas",
        )
        parser.add_argument(
            "--decoder-out-embed-dim",
            default=None,
            type=int,
            metavar="N",
            help="decoder output embedding dimension (bottleneck layer before"
            "output layer if specified.)",
        )

        # Args for vocab reduction
        vocab_reduction.add_args(parser)
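
--decoder-aan-more-dropouts above passes a lambda as its type converter: argparse hands the raw option string to the callable and stores whatever it returns, here a set of names. A minimal demonstration:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--decoder-aan-more-dropouts",
                        type=lambda x: set(x.split(",")))

    args = parser.parse_args(
        ["--decoder-aan-more-dropouts", "residual,after_avg"])
    assert args.decoder_aan_more_dropouts == {"residual", "after_avg"}
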
Example 6
    def add_args(parser):
        parser.add_argument(
            '--dropout',
            default=0.1,
            type=float,
            metavar='D',
            help='dropout probability',
        )
        parser.add_argument(
            '--encoder-embed-dim',
            type=int,
            metavar='N',
            help='encoder embedding dimension',
        )
        parser.add_argument(
            '--encoder-freeze-embed',
            default=False,
            action='store_true',
            help=('whether to freeze the encoder embedding or allow it to be '
                  'updated during training'),
        )
        parser.add_argument(
            '--encoder-hidden-dim',
            type=int,
            metavar='N',
            help='encoder cell num units',
        )
        parser.add_argument(
            '--encoder-layers',
            type=int,
            metavar='N',
            help='number of encoder layers',
        )
        parser.add_argument(
            '--encoder-bidirectional',
            action='store_true',
            help='whether the first layer is bidirectional or not',
        )
        parser.add_argument(
            '--averaging-encoder',
            default=False,
            action='store_true',
            help=('whether to use mean encoder hidden states as decoder initial '
                  'states or not'),
        )
        parser.add_argument(
            '--add-encoder-outputs-as-decoder-input',
            default=False,
            action='store_true',
            help=('whether to use max encoder hidden states as constant decoder '
                  'input'),
        )
        parser.add_argument(
            '--decoder-embed-dim',
            type=int,
            metavar='N',
            help='decoder embedding dimension',
        )
        parser.add_argument(
            '--decoder-freeze-embed',
            default=False,
            action='store_true',
            help=('whether to freeze the decoder embedding or allow it to be '
                  'updated during training'),
        )
        parser.add_argument(
            '--decoder-hidden-dim',
            type=int,
            metavar='N',
            help='decoder cell num units',
        )
        parser.add_argument(
            '--decoder-layers',
            type=int,
            metavar='N',
            help='number of decoder layers',
        )
        parser.add_argument(
            '--decoder-out-embed-dim',
            type=int,
            metavar='N',
            help='decoder output embedding dimension',
        )
        parser.add_argument(
            '--attention-type',
            type=str,
            metavar='EXPR',
            help='decoder attention, defaults to dot',
        )
        parser.add_argument(
            '--residual-level',
            default=None,
            type=int,
            help=
            ('First layer at which to apply a residual connection. '
             'The value should be greater than 0 and smaller than the number of '
             'layers.'),
        )
        parser.add_argument(
            '--cell-type',
            default='lstm',
            type=str,
            metavar='EXPR',
            help=
            'cell type, defaults to lstm, values: lstm, milstm, layer_norm_lstm',
        )

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument(
            '--encoder-dropout-in',
            type=float,
            metavar='D',
            help='dropout probability for encoder input embedding',
        )
        parser.add_argument(
            '--encoder-dropout-out',
            type=float,
            metavar='D',
            help='dropout probability for encoder output',
        )
        parser.add_argument(
            '--decoder-dropout-in',
            type=float,
            metavar='D',
            help='dropout probability for decoder input embedding',
        )
        parser.add_argument(
            '--decoder-dropout-out',
            type=float,
            metavar='D',
            help='dropout probability for decoder output',
        )
        parser.add_argument(
            '--sequence-lstm',
            action='store_true',
            help='use nn.LSTM implementation for encoder',
        )

        # Args for vocab reduction
        vocab_reduction.add_args(parser)
        # Args for word dropout
        word_dropout.add_args(parser)
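
The granular dropout arguments above carry no default, so they parse to None; per the comment, they are meant to fall back to --dropout. A minimal sketch of how that fallback could be resolved after parsing (hypothetical helper; the real model code may differ):

    def resolve_granular_dropouts(args):
        # any granular dropout left unset inherits the global --dropout value
        for name in ("encoder_dropout_in", "encoder_dropout_out",
                     "decoder_dropout_in", "decoder_dropout_out"):
            if getattr(args, name, None) is None:
                setattr(args, name, args.dropout)
        return args
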
Example 7
    def add_args(parser):
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--encoder-hidden-dim",
                            type=int,
                            metavar="N",
                            help="encoder cell num units")
        parser.add_argument("--encoder-layers",
                            type=int,
                            metavar="N",
                            help="number of encoder layers")
        parser.add_argument(
            "--encoder-bidirectional",
            action="store_true",
            help="whether the first layer is bidirectional or not",
        )
        parser.add_argument(
            "--averaging-encoder",
            default=False,
            action="store_true",
            help=("whether use mean encoder hidden states as decoder initial "
                  "states or not"),
        )
        parser.add_argument(
            "--add-encoder-outputs-as-decoder-input",
            default=False,
            action="store_true",
            help=("whether use max encoder hidden states as constant decoder "
                  "input"),
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--decoder-hidden-dim",
                            type=int,
                            metavar="N",
                            help="decoder cell num units")
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="number of decoder layers")
        parser.add_argument(
            "--decoder-out-embed-dim",
            type=int,
            metavar="N",
            help="decoder output embedding dimension",
        )
        parser.add_argument(
            "--attention-type",
            type=str,
            metavar="EXPR",
            help="decoder attention, defaults to dot",
        )
        parser.add_argument(
            "--residual-level",
            default=None,
            type=int,
            help=
            ("First layer where to apply a residual connection. "
             "The value should be greater than 0 and smaller than the number of "
             "layers."),
        )
        parser.add_argument(
            "--cell-type",
            default="lstm",
            type=str,
            metavar="EXPR",
            help=
            "cell type, defaults to lstm, values:lstm, milstm, layer_norm_lstm",
        )

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument(
            "--encoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for encoder input embedding",
        )
        parser.add_argument(
            "--encoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for encoder output",
        )
        parser.add_argument(
            "--decoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for decoder input embedding",
        )
        parser.add_argument(
            "--decoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for decoder output",
        )
        parser.add_argument(
            "--sequence-lstm",
            action="store_true",
            help="use nn.LSTM implementation for encoder",
        )

        # Args for vocab reduction
        vocab_reduction.add_args(parser)
        # Args for word dropout
        word_dropout.add_args(parser)
        # Args for character RNN encoder
        char_rnn_encoder.add_args(parser)
Example 8
    def add_args(parser):
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--encoder-hidden-dim",
                            type=int,
                            metavar="N",
                            help="encoder cell num units")
        parser.add_argument("--encoder-layers",
                            type=int,
                            metavar="N",
                            help="number of encoder layers")
        parser.add_argument(
            "--encoder-bidirectional",
            action="store_true",
            help="whether the first layer is bidirectional or not",
        )
        parser.add_argument(
            "--averaging-encoder",
            default=False,
            action="store_true",
            help=("whether use mean encoder hidden states as decoder initial "
                  "states or not"),
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-freeze-embed",
            default=False,
            action="store_true",
            help=("whether to freeze the encoder embedding or allow it to be "
                  "updated during training"),
        )
        parser.add_argument("--decoder-hidden-dim",
                            type=int,
                            metavar="N",
                            help="decoder cell num units")
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="number of decoder layers")
        parser.add_argument(
            "--decoder-out-embed-dim",
            type=int,
            metavar="N",
            help="decoder output embedding dimension",
        )
        parser.add_argument(
            "--attention-type",
            type=str,
            metavar="EXPR",
            help="decoder attention, defaults to dot",
        )
        parser.add_argument(
            "--residual-level",
            default=None,
            type=int,
            help=
            ("First layer where to apply a residual connection. "
             "The value should be greater than 0 and smaller than the number of "
             "layers."),
        )
        parser.add_argument(
            "--cell-type",
            default="lstm",
            type=str,
            metavar="EXPR",
            help=
            "cell type, defaults to lstm, values:lstm, milstm, layer_norm_lstm",
        )

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument(
            "--encoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for encoder input embedding",
        )
        parser.add_argument(
            "--encoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for encoder output",
        )
        parser.add_argument(
            "--decoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for decoder input embedding",
        )
        parser.add_argument(
            "--decoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for decoder output",
        )
        parser.add_argument(
            "--sequence-lstm",
            action="store_true",
            help="use nn.LSTM implementation for encoder",
        )
        parser.add_argument(
            "--ngram-decoder",
            default=None,
            type=int,
            nargs="+",
            help=(
                "A single integer, or a list of integers. If "
                "positive, the decoder is not recurrent but a feedforward "
                "network with target-side n-gram history as input. The decoder "
                "is still conditioned on the source side via attention. If "
                "this parameter is a list of integers, the n-th entry applies "
                "to the n-th decoder (for multilingual models and "
                "multi-decoders)"),
        )
        parser.add_argument(
            "--ngram-activation-type",
            default="relu",
            type=str,
            metavar="EXPR",
            help=("Activation in FF layers of the ngram decoder, defaults to "
                  "relu, values: relu, tanh"),
        )
        parser.add_argument(
            "--multi-encoder",
            default=None,
            type=int,
            help=(
                "If this is positive, train n encoder networks rather than "
                "only one. The outputs of the encoders are concatenated before "
                "passing them through to the decoder."),
        )
        parser.add_argument(
            "--multi-decoder",
            default=None,
            type=int,
            help=("If this is positive, train n decoder networks rather than "
                  "only one. The predictions are combined via the method in "
                  "--multi-decoder-combination-strategy."),
        )
        parser.add_argument(
            "--multi-decoder-combination-strategy",
            default="bottleneck",
            type=str,
            metavar="EXPR",
            help=(
                "Only used if --multi-decoder is positive. Controls how the "
                "decoders are combined with each other.\n"
                "- uniform: Separate projection layers, average predictions\n"
                "- uniform-probspace: Separate projection layers, average "
                "in probability space.\n"
                "- unprojected: Shared projection layer, unprojected "
                "decoder outputs are averaged.\n"
                "- weighted: Separate projection layers, weighted average "
                "of logits. Weights are learned from unprojected decoder "
                "outputs.\n"
                "- weighted-probspace: Like 'weighted', but average in "
                "probability space.\n"
                "- weighted-unprojected: Shared projection layer, weighted "
                "average of decoder outputs. Weights are learned from "
                "unprojected decoder outputs.\n"
                "- concat: Shared projection layer, decoder outputs are "
                "concatenated.\n"
                "- bottleneck: Like 'concat' but with an additional "
                "bottleneck layer to reduce the size of the output embedding "
                "matrix.\n"
                "- multiplicative-unprojected: Shared projection layer, element"
                "-wise product of decoder outputs after ReLU.\n"),
        )

        # Args for vocab reduction
        vocab_reduction.add_args(parser)
        # Args for word dropout
        word_dropout.add_args(parser)
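
The combination-strategy string above is only checked downstream, after parsing. A hypothetical variant could let argparse reject unknown strategies up front by passing the same names through choices; the trade-off is that argparse then also prints the full list in usage output, which gets noisy alongside a long help string.

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--multi-decoder-combination-strategy",
        default="bottleneck",
        choices=["uniform", "uniform-probspace", "unprojected", "weighted",
                 "weighted-probspace", "weighted-unprojected", "concat",
                 "bottleneck", "multiplicative-unprojected"],
    )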