Code Example #1
 def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None):
     super().__init__(dictionary)
     self.register_buffer("version", torch.Tensor([3]))
     try:
         from fairscale.nn import Pipe
     except ImportError:
         raise ImportError("Please install fairscale with: pip install fairscale")
     self.use_pipeline = encoder_module_list is not None
     if not self.use_pipeline:
         self.embedding_layer = TransformerEncoderEmbedding(args, embed_tokens)
         self.encoder_layers = nn.Sequential(*[TransformerEncoderLayer(args) for i in range(args.encoder_layers)])
         if isinstance(embed_tokens, nn.ModuleList):
             emb_dim = sum(e.embedding_dim for e in embed_tokens)
         else:
             emb_dim = embed_tokens.embedding_dim
         self.final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim)
     else:
         encoder_balance = utils.eval_str_list(
             args.pipeline_encoder_balance, type=int
         )
         encoder_devices = utils.eval_str_list(
             args.pipeline_encoder_devices, type=int
         )
         assert sum(encoder_balance) == len(encoder_module_list), (
             f"Sum of encoder_balance={encoder_balance} is not equal "
             + f"to num_encoder_modules={len(encoder_module_list)}"
         )
         self.model = Pipe(
             module=nn.Sequential(*encoder_module_list),
             balance=encoder_balance,
             devices=encoder_devices,
             chunks=args.pipeline_chunks,
             checkpoint=args.pipeline_checkpoint,
         )
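
Every example on this page funnels a comma-separated or Python-literal string through utils.eval_str_list before using it as a typed list. The stand-alone sketch below is written purely for illustration (it is not the fairseq implementation, which may differ in details) and shows the parsing behavior the examples rely on:

import ast


def eval_str_list(x, type=float):
    # Illustrative stand-in for fairseq's utils.eval_str_list.
    if x is None:
        return None
    if isinstance(x, str):
        # "2,2,2" evaluates to a tuple of ints, "[4, 4]" to a list
        x = ast.literal_eval(x)
    try:
        return list(map(type, x))
    except TypeError:
        # a scalar such as "0.25" becomes a one-element list
        return [type(x)]


print(eval_str_list("2,2,2", type=int))   # [2, 2, 2]
print(eval_str_list("[4, 4]", type=int))  # [4, 4]
print(eval_str_list("0.25"))              # [0.25]
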
Code Example #2
 def build_model(cls, args, task):
     encoder, decoder = cls.build_model_base(args, task)
     return PipelineParallelTransformerModel(
         encoder=encoder,
         decoder=decoder,
         balance=utils.eval_str_list(args.pipeline_balance, type=int),
         devices=utils.eval_str_list(args.pipeline_devices, type=int),
         chunks=args.pipeline_chunks,
         checkpoint=args.pipeline_checkpoint,
     )
Code Example #3
        def get_decoder(lang):
            if lang not in lang_decoders:
                if shared_decoder_embed_tokens is not None:
                    decoder_embed_tokens = shared_decoder_embed_tokens
                else:
                    decoder_embed_tokens = build_embedding(
                        task.dicts[lang], cfg.decoder_embed_dim,
                        cfg.decoder_embed_path)

                lang_decoders[lang] = RNNDecoder(
                    dictionary=task.dicts[lang],
                    embed_dim=cfg.decoder_embed_dim,
                    hidden_size=cfg.decoder_hidden_size,
                    out_embed_dim=cfg.decoder_out_embed_dim,
                    num_layers=cfg.decoder_layers,
                    attention_type=cfg.attention_type,
                    dropout_in=(cfg.decoder_dropout_in if
                                cfg.decoder_dropout_in >= 0 else cfg.dropout),
                    dropout_out=(cfg.decoder_dropout_out
                                 if cfg.decoder_dropout_out >= 0 else
                                 cfg.dropout),
                    rnn_type=cfg.rnn_type,
                    encoder_output_units=cfg.encoder_hidden_size,
                    pretrained_embed=decoder_embed_tokens,
                    share_input_output_embed=cfg.share_decoder_input_output_embed,
                    adaptive_softmax_cutoff=(
                        utils.eval_str_list(cfg.adaptive_softmax_cutoff, type=int)
                        if cfg.criterion == "adaptive_loss"
                        else None
                    ),
                    max_target_positions=cfg.max_target_positions,
                    residuals=False,
                )
            return lang_decoders[lang]
Code Example #4
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        TransformerModel.add_args(parser)
        parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR",
                            help="list of encoder convolution's out channels")
        parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR",
                            help="list of encoder convolution's kernel sizes")
        parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR",
                            help="list of encoder convolution's strides")
        parser.add_argument("--encoder-transformer-context", type=str, metavar="EXPR",
                            help="left/right context for time-restricted self-attention; "
                            "can be None or a tuple of two non-negative integers/None")
        parser.add_argument("--decoder-input-dim", type=int, metavar="N",
                            help="decoder input dimension (extra linear layer "
                                 "if different from decoder embed dim)")

        # Scheduled sampling options
        parser.add_argument("--scheduled-sampling-probs", type=lambda p: utils.eval_str_list(p),
                            metavar="P_1,P_2,...,P_N", default=[1.0],
                            help="scheduled sampling probabilities of sampling the truth "
                            "labels for N epochs starting from --start-scheduled-sampling-epoch; "
                            "all later epochs use P_N")
        parser.add_argument("--start-scheduled-sampling-epoch", type=int,
                            metavar="N", default=1,
                            help="start scheduled sampling from the specified epoch")
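
The type=lambda p: utils.eval_str_list(p) pattern above makes argparse pass the raw string to the converter, so a value like "1.0,0.9,0.8" arrives as the list [1.0, 0.9, 0.8]. A minimal, self-contained sketch of that pattern, using a simplified comma-splitting converter instead of the real eval_str_list:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--scheduled-sampling-probs",
                    # simplified stand-in for utils.eval_str_list
                    type=lambda p: [float(v) for v in p.split(",")],
                    default=[1.0])
args = parser.parse_args(["--scheduled-sampling-probs", "1.0,0.9,0.8"])
print(args.scheduled_sampling_probs)  # [1.0, 0.9, 0.8]
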
Code Example #5
File: options.py  Project: Tvicker/espresso
def add_optimization_args(parser):
    group = parser.add_argument_group("Optimization")
    # fmt: off
    group.add_argument('--max-epoch',
                       '--me',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update',
                       '--mu',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified update')
    group.add_argument(
        '--stop-time-hours',
        default=0,
        type=float,
        metavar='N',
        help='force stop training after specified cumulative time (if >0)')
    group.add_argument('--clip-norm',
                       default=0.0,
                       type=float,
                       metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument(
        '--sentence-avg',
        action='store_true',
        help='normalize gradients by the number of sentences in a batch'
        ' (default is to normalize by number of tokens)')
    group.add_argument(
        '--update-freq',
        default='1',
        metavar='N1,N2,...,N_K',
        type=lambda uf: eval_str_list(uf, type=int),
        help='update parameters every N_i batches, when in epoch i')
    group.add_argument(
        '--lr',
        '--learning-rate',
        default='0.25',
        type=eval_str_list,
        metavar='LR_1,LR_2,...,LR_N',
        help='learning rate for the first N epochs; all epochs >N use LR_N'
        ' (note: this may be interpreted differently depending on --lr-scheduler)'
    )
    group.add_argument(
        '--min-lr',
        default=-1,
        type=float,
        metavar='LR',
        help='stop training when the learning rate reaches this minimum')
    group.add_argument(
        '--use-bmuf',
        default=False,
        action='store_true',
        help=
        'specify global optimizer for syncing models on different GPUs/shards')
    # fmt: on
    return group
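
Note that --update-freq and --lr above give their defaults as the strings '1' and '0.25' rather than as lists: argparse also runs string defaults through the type callable, so the defaults come out already converted. A small demonstration with plain argparse (not fairseq code), again using a simplified comma-splitting converter:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--lr", default="0.25",
                    # simplified stand-in for eval_str_list
                    type=lambda x: [float(v) for v in x.split(",")])
print(parser.parse_args([]).lr)                   # [0.25]  (string default is converted too)
print(parser.parse_args(["--lr", "0.5,0.1"]).lr)  # [0.5, 0.1]
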
Code Example #6
File: lstm_lm.py  Project: veralily/fairseq
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_architecture(args)

        if getattr(args, "max_target_positions", None) is not None:
            max_target_positions = args.max_target_positions
        else:
            max_target_positions = getattr(args, "tokens_per_sample",
                                           DEFAULT_MAX_TARGET_POSITIONS)

        def load_pretrained_embedding_from_file(embed_path, dictionary,
                                                embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path, task.target_dictionary,
                args.decoder_embed_dim)

        if args.share_decoder_input_output_embed:
            # double check all parameters combinations are valid
            if task.source_dictionary != task.target_dictionary:
                raise ValueError(
                    "--share-decoder-input-output-embed requires a joint dictionary"
                )

            if args.decoder_embed_dim != args.decoder_out_embed_dim:
                raise ValueError(
                    "--share-decoder-input-output-embed requires "
                    "--decoder-embed-dim to match --decoder-out-embed-dim")

        decoder = LSTMDecoder(
            dictionary=task.dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=False,  # decoder-only language model doesn't support attention
            encoder_output_units=0,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(utils.eval_str_list(
                args.adaptive_softmax_cutoff, type=int) if args.criterion
                                     == "adaptive_loss" else None),
            max_target_positions=max_target_positions,
            residuals=args.residuals,
        )

        return cls(decoder)
Code Example #7
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if hasattr(args, "max_target_positions") and not hasattr(
                args, "tokens_per_sample"):
            args.tokens_per_sample = args.max_target_positions

        decoder = FConvDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            convolutions=eval(args.decoder_layers),
            out_embed_dim=args.decoder_embed_dim,
            attention=eval(args.decoder_attention),
            dropout=args.dropout,
            max_positions=args.tokens_per_sample,
            share_embed=False,
            positional_embeddings=False,
            adaptive_softmax_cutoff=(utils.eval_str_list(
                args.adaptive_softmax_cutoff, type=int) if args.criterion
                                     == "adaptive_loss" else None),
            adaptive_softmax_dropout=args.adaptive_softmax_dropout,
        )
        return FConvLanguageModel(decoder)
Code Example #8
 def build_decoder(cls, args, tgt_dict, embed_tokens):
     #return TransformerDecoder(
     #    args,
     #    tgt_dict,
     #    embed_tokens,
     #    no_encoder_attn=getattr(args, "no_cross_attention", False),
     #)
     return GRUDecoder(
         args=args,
         dictionary=tgt_dict,
         embed_dim=args.decoder_embed_dim,
         hidden_size=args.decoder_hidden_size,
         out_embed_dim=args.decoder_out_embed_dim,
         num_layers=args.decoder_layers,
         dropout_in=args.decoder_dropout_in,
         dropout_out=args.decoder_dropout_out,
         attention=not getattr(args, "no_cross_attention", False),
         encoder_output_units=getattr(args, "encoder_embed_dim", None),
         pretrained_embed=embed_tokens,
         share_input_output_embed=args.share_decoder_input_output_embed,
         adaptive_softmax_cutoff=(utils.eval_str_list(
             args.adaptive_softmax_cutoff, type=int) if args.criterion
                                  == "adaptive_loss" else None),
         max_target_positions=args.max_target_positions,
         residuals=False,
     )
Code Example #9
File: lstm_lm.py  Project: medbar/espresso
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if getattr(args, "max_target_positions", None) is not None:
            max_target_positions = args.max_target_positions
        else:
            max_target_positions = getattr(args, "tokens_per_sample",
                                           DEFAULT_MAX_TARGET_POSITIONS)

        def load_pretrained_embedding_from_file(embed_path, dictionary,
                                                embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        if args.is_wordlm and hasattr(task, "word_dictionary"):
            dictionary = task.word_dictionary
        elif isinstance(task, SpeechRecognitionEspressoTask):
            dictionary = task.target_dictionary
        else:
            dictionary = task.source_dictionary

        # separate decoder input embeddings
        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path, dictionary, args.decoder_embed_dim)
        # one last double check of parameter combinations
        if args.share_embed and (args.decoder_embed_dim !=
                                 args.decoder_out_embed_dim):
            raise ValueError(
                "--share-embed requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim")

        if args.decoder_freeze_embed:
            pretrained_decoder_embed.weight.requires_grad = False

        decoder = SpeechLSTMDecoder(
            dictionary=dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attn_type=None,
            encoder_output_units=0,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_embed,
            adaptive_softmax_cutoff=(utils.eval_str_list(
                args.adaptive_softmax_cutoff, type=int) if args.criterion
                                     == "adaptive_loss" else None),
            max_target_positions=max_target_positions,
        )
        return cls(decoder, args)
Code Example #10
 def build_output_projection(self, args, dictionary, embed_tokens):
     if args.adaptive_softmax_cutoff is not None:
         self.adaptive_softmax = AdaptiveSoftmax(
             len(dictionary),
             self.output_embed_dim,
             utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
             dropout=args.adaptive_softmax_dropout,
             adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
             factor=args.adaptive_softmax_factor,
             tie_proj=args.tie_adaptive_proj,
         )
     elif self.share_input_output_embed:
         self.output_projection = nn.Linear(
             self.embed_tokens.weight.shape[1],
             self.embed_tokens.weight.shape[0],
             bias=False,
         )
         self.output_projection.weight = self.embed_tokens.weight
     else:
         self.output_projection = nn.Linear(
             self.output_embed_dim, len(dictionary), bias=False
         )
         nn.init.normal_(
             self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
         )
     num_base_layers = getattr(args, "base_layers", 0)
     for i in range(num_base_layers):
         self.layers.insert(((i+1) * args.decoder_layers) // (num_base_layers + 1), BaseLayer(args))
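
The loop at the end of build_output_projection spreads getattr(args, "base_layers", 0) BaseLayer modules evenly through the decoder stack; the index arithmetic is easier to see with concrete numbers. A plain-Python illustration with example values (6 decoder layers, 2 base layers) rather than a real args object:

decoder_layers = 6   # example value
num_base_layers = 2  # example value

layers = [f"dec{j}" for j in range(decoder_layers)]
for i in range(num_base_layers):
    # same index formula as in the snippet above
    layers.insert(((i + 1) * decoder_layers) // (num_base_layers + 1), f"base{i}")
print(layers)
# ['dec0', 'dec1', 'base0', 'dec2', 'base1', 'dec3', 'dec4', 'dec5']
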
Code Example #11
def _pipeline_parallel_pre_init(cfg: DistributedTrainingConfig):
    from fairseq import utils

    balance_exists = (
        cfg.pipeline_balance is not None
        or cfg.pipeline_encoder_balance is not None
        or cfg.pipeline_decoder_balance is not None
    )
    devices_exist = (
        cfg.pipeline_devices is not None
        or cfg.pipeline_encoder_devices is not None
        or cfg.pipeline_decoder_devices is not None
    )
    if not balance_exists:
        raise ValueError(
            "--pipeline-balance is currently required for pipeline model parallelism"
        )
    if not devices_exist:
        raise ValueError(
            "--pipeline-devices is currently required for pipeline model parallelism"
        )

    cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int)
    if cfg.pipeline_devices is not None:
        cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int)
        num_pipeline_devices = len(set(cfg.pipeline_devices))
    else:
        cfg.pipeline_encoder_devices = utils.eval_str_list(
            cfg.pipeline_encoder_devices, type=int
        )
        cfg.pipeline_decoder_devices = utils.eval_str_list(
            cfg.pipeline_decoder_devices, type=int
        )
        num_pipeline_devices = len(
            set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices)
        )
    gpus_per_node = torch.cuda.device_count()
    assert (
        gpus_per_node >= num_pipeline_devices
        and gpus_per_node % num_pipeline_devices == 0
    ), (
        "the number of unique device IDs in --pipeline-devices must evenly divide "
        "the number of GPUs per node (multi-node pipelining is not yet supported)"
    )
    num_pipelines_per_node = gpus_per_node // num_pipeline_devices
    return num_pipeline_devices, num_pipelines_per_node
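
A quick numeric walk-through of the device checks in _pipeline_parallel_pre_init, using made-up example values instead of a real DistributedTrainingConfig: a two-partition pipeline placed on devices [0, 1] of an 8-GPU node has 2 unique pipeline devices and therefore 4 pipelines per node.

# Example values only; in the function above these come from cfg.* and
# torch.cuda.device_count().
pipeline_devices = [0, 1]  # device index for each pipeline partition
gpus_per_node = 8

num_pipeline_devices = len(set(pipeline_devices))
assert gpus_per_node >= num_pipeline_devices
assert gpus_per_node % num_pipeline_devices == 0
num_pipelines_per_node = gpus_per_node // num_pipeline_devices
print(num_pipeline_devices, num_pipelines_per_node)  # 2 4
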
Code Example #12
 def __init__(
     self,
     args,
     dictionary,
     embed_tokens,
     no_encoder_attn=False,
     decoder_module_list=None,
 ):
     super().__init__(dictionary)
     self.register_buffer("version", torch.Tensor([3]))
     try:
         from fairscale.nn import Pipe
     except ImportError:
         raise ImportError(
             "Please install fairscale with: pip install fairscale")
     if decoder_module_list is None:
         embedding_layer = TransformerDecoderEmbedding(args, embed_tokens)
         layers = [
             TransformerDecoderLayer(args, no_encoder_attn)
             for _ in range(args.decoder_layers)
         ]
         decoder_output_layer = TransformerDecoderOutputLayer(
             args, embed_tokens, dictionary)
          decoder_module_list = [embedding_layer] + layers + [decoder_output_layer]
      self.use_pipeline = getattr(args, "pipeline_decoder_balance", None) is not None
     if self.use_pipeline:
         decoder_balance = utils.eval_str_list(
             args.pipeline_decoder_balance, type=int)
         decoder_devices = utils.eval_str_list(
             args.pipeline_decoder_devices, type=int)
         assert sum(decoder_balance) == len(decoder_module_list), (
             f"Sum of decoder_balance={decoder_balance} is not equal " +
             f"to num_decoder_modules={len(decoder_module_list)}")
         self.model = Pipe(
             module=nn.Sequential(*decoder_module_list),
             balance=decoder_balance,
             devices=decoder_devices,
             chunks=args.pipeline_chunks,
             checkpoint=args.pipeline_checkpoint,
         )
     else:
         self.embedding_layer = decoder_module_list[0]
         self.decoder_layers = nn.Sequential(*decoder_module_list[1:-1])
         self.decoder_output_layer = decoder_module_list[-1]
Code Example #13
 def __init__(
     self,
     args,
     dictionary,
     embed_tokens,
     no_encoder_attn=False,
     decoder_module_list=None,
 ):
     super().__init__(dictionary)
     self.register_buffer("version", torch.Tensor([3]))
     import_pipe()
     self.use_pipeline = decoder_module_list is not None
     if not self.use_pipeline:
         self.embedding_layer = TransformerDecoderEmbedding(args, embed_tokens)
         self.decoder_layers = nn.Sequential(*[
             TransformerDecoderLayer(args, no_encoder_attn)
             for _ in range(args.decoder_layers)
         ])
         self.decoder_output_layer = TransformerDecoderOutputLayer(
             args, embed_tokens, dictionary
         )
     else:
         decoder_balance = utils.eval_str_list(
             args.pipeline_decoder_balance, type=int
         )
         decoder_devices = utils.eval_str_list(
             args.pipeline_decoder_devices, type=int
         )
         assert sum(decoder_balance) == len(decoder_module_list), (
             f"Sum of decoder_balance={decoder_balance} is not equal "
             + f"to num_decoder_modules={len(decoder_module_list)}"
         )
         if TORCH_PIPE:
             self.model = Pipe(
                 module=partition_model(nn.Sequential(*decoder_module_list), decoder_balance, decoder_devices),
                 chunks=args.pipeline_chunks,
                 checkpoint=args.pipeline_checkpoint,
             )
         else:
             self.model = Pipe(
                 module=nn.Sequential(*decoder_module_list),
                 balance=decoder_balance,
                 devices=decoder_devices,
                 chunks=args.pipeline_chunks,
                 checkpoint=args.pipeline_checkpoint,
             )
Code Example #14
File: lightconv_lm.py  Project: veralily/fairseq
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if getattr(args, "max_source_positions", None) is None:
            args.max_source_positions = args.tokens_per_sample
        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = args.tokens_per_sample

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.dictionary,
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.dictionary),
                task.dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                utils.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            embed_tokens = Embedding(len(task.dictionary),
                                     args.decoder_input_dim,
                                     task.dictionary.pad())

        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert (args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
                    ), "{} != {}".format(args.adaptive_softmax_cutoff,
                                         args.adaptive_input_cutoff)
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = LightConvDecoder(
            args,
            task.output_dictionary,
            embed_tokens,
            no_encoder_attn=True,
            final_norm=False,
        )
        return LightConvLanguageModel(decoder)
Code Example #15
File: options.py  Project: Tvicker/espresso
def add_distributed_training_args(parser, default_world_size=None):
    group = parser.add_argument_group("Distributed training")
    # fmt: off
    if default_world_size is None:
        default_world_size = max(1, torch.cuda.device_count())
    group.add_argument(
        '--distributed-world-size',
        type=int,
        metavar='N',
        default=default_world_size,
        help='total number of GPUs across all nodes (default: all visible GPUs)'
    )
    group.add_argument('--distributed-rank',
                       default=0,
                       type=int,
                       help='rank of the current worker')
    group.add_argument('--distributed-backend',
                       default='nccl',
                       type=str,
                       help='distributed backend')
    group.add_argument(
        '--distributed-init-method',
        default=None,
        type=str,
        help='typically tcp://hostname:port that will be used to '
        'establish initial connection')
    group.add_argument(
        '--distributed-port',
        default=-1,
        type=int,
        help='port number (not required if using --distributed-init-method)')
    group.add_argument(
        '--device-id',
        '--local_rank',
        default=0,
        type=int,
        help='which GPU to use (usually configured automatically)')
    group.add_argument(
        '--distributed-no-spawn',
        action='store_true',
        help='do not spawn multiple processes even if multiple GPUs are visible'
    )
    group.add_argument(
        '--distributed-num-procs',
        default=None,
        type=int,
        help='number of processes to spawn (usually configured automatically)')
    # "c10d" is PyTorch's DDP implementation and provides the fastest
    # training. "no_c10d" is a more robust, but slightly slower DDP
    # implementation. Try this if you get warning messages about
    # inconsistent gradients between workers, or if some of your model
    # parameters are not always used.
    group.add_argument('--ddp-backend',
                       default='c10d',
                       type=str,
                       choices=['c10d', 'no_c10d'],
                       help='DistributedDataParallel backend')
    group.add_argument('--bucket-cap-mb',
                       default=25,
                       type=int,
                       metavar='MB',
                       help='bucket size for reduction')
    group.add_argument(
        '--fix-batches-to-gpus',
        action='store_true',
        help='don\'t shuffle batches between GPUs; this reduces overall '
        'randomness and may affect precision but avoids the cost of '
        're-reading the data')
    group.add_argument(
        '--find-unused-parameters',
        default=False,
        action='store_true',
        help='enable unused parameter detection in DistributedDataParallel '
        '(not applicable to the no_c10d ddp-backend)')
    group.add_argument('--fast-stat-sync',
                       default=False,
                       action='store_true',
                       help='[deprecated] this is now defined per Criterion')
    group.add_argument(
        '--broadcast-buffers',
        default=False,
        action='store_true',
        help='Copy non-trainable parameters between GPUs, such as '
        'batchnorm population statistics')

    group.add_argument('--distributed-wrapper',
                       default='DDP',
                       type=str,
                       choices=['DDP', 'SlowMo'],
                       help='distributed wrapper to use (DDP or SlowMo)')
    # Add arguments for SlowMo - these will be used when SlowMo is enabled via above
    group.add_argument(
        '--slowmo-momentum',
        default=None,
        type=float,
        help='SlowMo momentum term; by default use 0.0 for 16 GPUs, '
        '0.2 for 32 GPUs, 0.5 for 64 GPUs, and 0.6 for > 64 GPUs')
    group.add_argument('--slowmo-algorithm',
                       default='LocalSGD',
                       choices=['LocalSGD', 'SGP'],
                       help='whether to use LocalSGD or SGP')
    group.add_argument('--localsgd-frequency',
                       default=3,
                       type=int,
                       help='Local SGD allreduce frequency')
    group.add_argument(
        '--nprocs-per-node',
        type=int,
        metavar='N',
        default=max(1, torch.cuda.device_count()),
        help=
        'number of GPUs in each node. An allreduce operation across GPUs in '
        'a node is very fast. Hence, we do allreduce across GPUs in a node, '
        'and gossip across different nodes')
    # Pipeline Parallel Arguments
    group.add_argument(
        '--pipeline-model-parallel',
        default=False,
        action='store_true',
        help='if set, use pipeline model parallelism across GPUs')
    group.add_argument(
        '--pipeline-balance',
        metavar='N1,N2,...,N_K',
        type=lambda x: eval_str_list(x, type=int),
        help='partition the model into N_K pieces, where each piece '
        'contains N_i layers. The sum(args.pipeline_balance) '
        'should equal the total number of layers in the model')
    group.add_argument(
        '--pipeline-devices',
        metavar='N1,N2,...,N_K',
        type=lambda x: eval_str_list(x, type=int),
        help='a list of device indices indicating which device to place '
        'each of the N_K partitions. The length of this list should '
        'equal the length of the --pipeline-balance argument')
    group.add_argument('--pipeline-chunks',
                       type=int,
                       metavar='N',
                       help='microbatch count for pipeline model parallelism')
    group.add_argument(
        '--pipeline-checkpoint',
        type=str,
        metavar='STR',
        choices=['always', 'never', 'except_last'],
        default='never',
        help='checkpointing mode for pipeline model parallelism')
    # Add argument for ZeRO sharding of OptimizerState(os), gradients(g) and parameters(p)
    group.add_argument('--zero-sharding',
                       default='none',
                       type=str,
                       choices=['none', 'os'],
                       help='ZeRO sharding')
    # fmt: on
    return group
Code Example #16
    def build_model(cls, args, task):
        """Build a new model instance."""

        max_source_positions = getattr(
            args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
        )
        max_target_positions = getattr(
            args, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
        )

        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        # separate decoder input embeddings
        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path,
                task.target_dictionary,
                args.decoder_embed_dim,
            )
        # one last double check of parameter combinations
        if args.share_decoder_input_output_embed and (
            args.decoder_embed_dim != args.decoder_out_embed_dim
        ):
            raise ValueError(
                "--share-decoder-input-output-embed requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim"
            )

        if args.decoder_freeze_embed:
            pretrained_decoder_embed.weight.requires_grad = False

        out_channels = speech_utils.eval_str_nested_list_or_tuple(
            args.encoder_conv_channels, type=int
        )
        kernel_sizes = speech_utils.eval_str_nested_list_or_tuple(
            args.encoder_conv_kernel_sizes, type=int
        )
        strides = speech_utils.eval_str_nested_list_or_tuple(
            args.encoder_conv_strides, type=int
        )
        logger.info(
            "input feature dimension: {}, channels: {}".format(
                task.feat_dim, task.feat_in_channels
            )
        )
        assert task.feat_dim % task.feat_in_channels == 0
        conv_layers = (
            ConvBNReLU(
                out_channels,
                kernel_sizes,
                strides,
                in_channels=task.feat_in_channels,
            )
            if out_channels is not None
            else None
        )

        rnn_encoder_input_size = task.feat_dim // task.feat_in_channels
        if conv_layers is not None:
            for stride in strides:
                if isinstance(stride, (list, tuple)):
                    assert len(stride) > 0
                    s = stride[1] if len(stride) > 1 else stride[0]
                else:
                    assert isinstance(stride, int)
                    s = stride
                rnn_encoder_input_size = (rnn_encoder_input_size + s - 1) // s
            rnn_encoder_input_size *= out_channels[-1]
        else:
            rnn_encoder_input_size = task.feat_dim

        if args.encoder_multilayer_rnn_as_single_module and args.encoder_rnn_residual:
            args.encoder_rnn_residual = False
            logger.info(
                "--encoder-rnn-residual is set to False when --encoder-multilayer-rnn-as-single-module=True"
            )

        scheduled_sampling_rate_scheduler = ScheduledSamplingRateScheduler(
            args.scheduled_sampling_probs,
            args.start_scheduled_sampling_epoch,
        )

        encoder = SpeechLSTMEncoder(
            pre_encoder=conv_layers,
            input_size=rnn_encoder_input_size,
            hidden_size=args.encoder_rnn_hidden_size,
            num_layers=args.encoder_rnn_layers,
            dropout_in=args.encoder_rnn_dropout_in,
            dropout_out=args.encoder_rnn_dropout_out,
            bidirectional=args.encoder_rnn_bidirectional,
            residual=args.encoder_rnn_residual,
            src_bucketed=(getattr(task.cfg, "num_batch_buckets", 0) > 0),
            max_source_positions=max_source_positions,
            multilayer_rnn_as_single_module=args.encoder_multilayer_rnn_as_single_module,
        )
        decoder = SpeechLSTMDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            encoder_output_units=encoder.output_units,
            attn_type=args.attention_type,
            attn_dim=args.attention_dim,
            need_attn=args.need_attention,
            residual=args.decoder_rnn_residual,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion_name == "adaptive_loss"
                else None
            ),
            max_target_positions=max_target_positions,
            scheduled_sampling_rate_scheduler=scheduled_sampling_rate_scheduler,
        )
        pretrained_lm = None
        if args.pretrained_lm_checkpoint:
            logger.info(
                "loading pretrained LM from {}".format(args.pretrained_lm_checkpoint)
            )
            pretrained_lm = checkpoint_utils.load_model_ensemble(
                args.pretrained_lm_checkpoint, task=task
            )[0][0]
            pretrained_lm.make_generation_fast_()
            # freeze pretrained model
            for param in pretrained_lm.parameters():
                param.requires_grad = False
        return cls(encoder, decoder, pretrained_lm)
Code Example #17
File: rnn.py  Project: clefourrier/fairseq
    def build_model(cls, cfg: RNNModelConfig, task):
        """Build a new model instance."""
        if cfg.encoder_layers != cfg.decoder_layers:
            raise ValueError("--encoder-layers must match --decoder-layers")

        max_source_positions = getattr(
            cfg, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
        )
        max_target_positions = getattr(
            cfg, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
        )

        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = torch.nn.Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        if cfg.encoder_embed_path:
            pretrained_encoder_embed = load_pretrained_embedding_from_file(
                cfg.encoder_embed_path, task.source_dictionary, cfg.encoder_embed_dim
            )
        else:
            num_embeddings = len(task.source_dictionary)
            pretrained_encoder_embed = torch.nn.Embedding(
                num_embeddings, cfg.encoder_embed_dim, task.source_dictionary.pad()
            )

        if cfg.share_all_embeddings:
            # double check all parameters combinations are valid
            if task.source_dictionary != task.target_dictionary:
                raise ValueError("--share-all-embeddings requires a joint dictionary")
            if cfg.decoder_embed_path and (
                cfg.decoder_embed_path != cfg.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embeddings not compatible with --decoder-embed-path"
                )
            if cfg.encoder_embed_dim != cfg.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to "
                    "match --decoder-embed-dim"
                )
            pretrained_decoder_embed = pretrained_encoder_embed
            cfg.share_decoder_input_output_embed = True
        else:
            # separate decoder input embeddings
            pretrained_decoder_embed = None
            if cfg.decoder_embed_path:
                pretrained_decoder_embed = load_pretrained_embedding_from_file(
                    cfg.decoder_embed_path,
                    task.target_dictionary,
                    cfg.decoder_embed_dim,
                )
        # one last double check of parameter combinations
        if cfg.share_decoder_input_output_embed and (
            cfg.decoder_embed_dim != cfg.decoder_out_embed_dim
        ):
            raise ValueError(
                "--share-decoder-input-output-embed requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim"
            )

        if cfg.encoder_freeze_embed:
            pretrained_encoder_embed.weight.requires_grad = False
        if cfg.decoder_freeze_embed:
            pretrained_decoder_embed.weight.requires_grad = False

        encoder = RNNEncoder(
            rnn_type=cfg.rnn_type,
            dictionary=task.source_dictionary,
            embed_dim=cfg.encoder_embed_dim,
            hidden_size=cfg.encoder_hidden_size,
            num_layers=cfg.encoder_layers,
            dropout_in=(cfg.encoder_dropout_in if cfg.encoder_dropout_in >= 0 else cfg.dropout),
            dropout_out=(cfg.encoder_dropout_out if cfg.encoder_dropout_out >= 0 else cfg.dropout),
            bidirectional=cfg.encoder_bidirectional,
            pretrained_embed=pretrained_encoder_embed,
            max_source_positions=max_source_positions,
        )
        uses_attention = getattr(cfg, 'attention_type', "none") != "none"
        attention_type = getattr(cfg, 'attention_type', "luong-dot") if uses_attention else None
        decoder = RNNDecoder(
            rnn_type=cfg.rnn_type,
            dictionary=task.target_dictionary,
            embed_dim=cfg.decoder_embed_dim,
            hidden_size=cfg.decoder_hidden_size,
            out_embed_dim=cfg.decoder_out_embed_dim,
            num_layers=cfg.decoder_layers,
            dropout_in=(cfg.decoder_dropout_in if cfg.decoder_dropout_in >= 0 else cfg.dropout),
            dropout_out=(cfg.decoder_dropout_out if cfg.decoder_dropout_out >= 0 else cfg.dropout),
            attention=uses_attention,
            attention_type=attention_type,
            encoder_output_units=encoder.output_units,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=cfg.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(
                utils.eval_str_list(cfg.adaptive_softmax_cutoff, type=int)
                if cfg.criterion == "adaptive_loss"
                else None
            ),
            max_target_positions=max_target_positions,
            residuals=False,
        )
        return cls(encoder, decoder)
Code Example #18
def infer_init_method(args, force_distributed=False):
    if args.distributed_init_method is not None or getattr(args, 'tpu', False):
        return

    if args.pipeline_model_parallel:
        balance_exists = args.pipeline_balance is not None or \
            args.pipeline_encoder_balance is not None or \
            args.pipeline_decoder_balance is not None
        devices_exist = args.pipeline_devices is not None or \
            args.pipeline_encoder_devices is not None or \
            args.pipeline_decoder_devices is not None
        if not balance_exists:
            raise ValueError(
                '--pipeline-balance is currently required for pipeline model parallelism'
            )
        if not devices_exist:
            raise ValueError(
                '--pipeline-devices is currently required for pipeline model parallelism'
            )

        args.pipeline_balance = utils.eval_str_list(args.pipeline_balance,
                                                    type=int)
        if args.pipeline_devices is not None:
            args.pipeline_devices = utils.eval_str_list(args.pipeline_devices,
                                                        type=int)
            num_pipeline_devices = len(set(args.pipeline_devices))
        else:
            args.pipeline_encoder_devices = utils.eval_str_list(
                args.pipeline_encoder_devices, type=int)
            args.pipeline_decoder_devices = utils.eval_str_list(
                args.pipeline_decoder_devices, type=int)
            num_pipeline_devices = len(
                set(args.pipeline_encoder_devices +
                    args.pipeline_decoder_devices))
        gpus_per_node = torch.cuda.device_count()
        assert gpus_per_node >= num_pipeline_devices and gpus_per_node % num_pipeline_devices == 0, (
            'the number of unique device IDs in --pipeline-devices must evenly divide '
            'the number of GPUs per node (multi-node pipelining is not yet supported)'
        )
        num_pipelines_per_node = gpus_per_node // num_pipeline_devices

    # support torch.distributed.launch
    if all(key in os.environ
           for key in ['MASTER_ADDR', 'MASTER_PORT', 'WORLD_SIZE', 'RANK']):
        args.distributed_init_method = 'env://'
        args.distributed_world_size = int(os.environ['WORLD_SIZE'])
        args.distributed_rank = int(os.environ['RANK'])
        # processes are created by torch.distributed.launch
        args.distributed_no_spawn = True

    # we can determine the init method automatically for Slurm
    elif args.distributed_port > 0:
        node_list = os.environ.get('SLURM_STEP_NODELIST')
        if node_list is None:
            node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(
                    ['scontrol', 'show', 'hostnames', node_list])
                args.distributed_init_method = 'tcp://{host}:{port}'.format(
                    host=hostnames.split()[0].decode('utf-8'),
                    port=args.distributed_port,
                )
                nnodes = int(os.environ.get('SLURM_NNODES'))
                ntasks_per_node = os.environ.get('SLURM_NTASKS_PER_NODE')
                if ntasks_per_node is not None:
                    ntasks_per_node = int(ntasks_per_node)
                else:
                    ntasks = int(os.environ.get('SLURM_NTASKS'))
                    nnodes = int(os.environ.get('SLURM_NNODES'))
                    assert ntasks % nnodes == 0
                    ntasks_per_node = int(ntasks / nnodes)
                if ntasks_per_node == 1:
                    gpus_per_node = torch.cuda.device_count()
                    node_id = int(os.environ.get('SLURM_NODEID'))
                    args.distributed_rank = node_id * gpus_per_node
                    args.distributed_world_size = nnodes * gpus_per_node
                elif args.pipeline_model_parallel:
                    assert ntasks_per_node == num_pipelines_per_node, (
                        'SLURM --ntasks-per-node must match number of pipelines per '
                        'node (={})'.format(num_pipelines_per_node))
                    args.distributed_no_spawn = True
                    # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
                    # the first node, [2, 3] on the second node, etc. This
                    # matches torch.distributed.launch.
                    node_id = int(os.environ.get('SLURM_NODEID'))
                    local_id = int(os.environ.get('SLURM_LOCALID'))
                    args.distributed_rank = node_id * num_pipelines_per_node + local_id
                    # In the above example, device_id will always be in [0, 1],
                    # which also matches torch.distributed.launch.
                    args.device_id = local_id
                    # We also want to set distributed_world_size to be the total
                    # number of pipelines across all nodes.
                    args.distributed_world_size = nnodes * num_pipelines_per_node
                else:
                    assert ntasks_per_node == args.distributed_world_size // nnodes
                    args.distributed_no_spawn = True
                    args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                    args.device_id = int(os.environ.get('SLURM_LOCALID'))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError:  # Slurm is not installed
                pass

    elif args.distributed_world_size > 1 or force_distributed:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)

    if args.pipeline_model_parallel:
        if not args.distributed_no_spawn:
            # When distributed_no_spawn is False, we expect distributed_rank and
            # distributed_world_size to be based on the total number of GPUs, so
            # we need to correct them to be based on the number of pipelines.
            assert args.distributed_world_size % num_pipeline_devices == 0
            args.distributed_world_size = args.distributed_world_size // num_pipeline_devices
            # In the case of 4-way MP on nodes with 8 GPUs, we want
            # distributed_rank to be the starting GPU index for each pipeline
            # i.e., 0, 2, ...
            assert args.distributed_rank % gpus_per_node == 0
            assert args.distributed_rank % num_pipeline_devices == 0
            args.distributed_rank = args.distributed_rank // num_pipeline_devices
            # launch one process per pipeline
            args.distributed_num_procs = num_pipelines_per_node

        # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0
        # and 4, indicating the starting device IDs for each pipeline
        args.device_id *= num_pipeline_devices

        if args.device_id > 0:
            # if there's multiple pipelines on a node (e.g., 4-way MP on an 8
            # GPU node), we need to adjust pipeline_devices accordingly
            logger.debug("setting CUDA device={} on rank {}".format(
                args.device_id, args.distributed_rank))
            torch.cuda.set_device(args.device_id)
            args.pipeline_devices = [
                args.device_id + d for d in args.pipeline_devices
            ]
            logger.info(
                "setting pipeline_devices={} on rank {}".format(
                    args.pipeline_devices, args.distributed_rank), )
    elif not args.distributed_no_spawn:
        args.distributed_num_procs = min(
            torch.cuda.device_count(),
            args.distributed_world_size,
        )
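
The rank arithmetic in the pipeline branch of infer_init_method is easiest to see with the example from its own comments: 2 nodes with 8 GPUs each and 4-way model parallelism give 2 pipelines per node, so ranks are [0, 1] on the first node and [2, 3] on the second, and each pipeline starts at local device 0 or 4. A small sketch with those example numbers (not fairseq code):

num_pipeline_devices = 4  # example: 4-way model parallelism
gpus_per_node = 8         # example node size
num_pipelines_per_node = gpus_per_node // num_pipeline_devices  # 2

for node_id in range(2):  # SLURM_NODEID
    for local_id in range(num_pipelines_per_node):  # SLURM_LOCALID
        distributed_rank = node_id * num_pipelines_per_node + local_id
        device_id = local_id * num_pipeline_devices  # starting GPU of this pipeline
        print(node_id, local_id, distributed_rank, device_id)
# node 0 -> ranks 0, 1 with starting devices 0, 4
# node 1 -> ranks 2, 3 with starting devices 0, 4
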
Code Example #19
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument('--dropout',
                         default=0.1,
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         default=0.,
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--relu-dropout',
                         default=0.,
                         type=float,
                         metavar='D',
                         help='dropout probability after ReLU in FFN')
     parser.add_argument('--input-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability of the inputs')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-output-dim',
                         type=int,
                         metavar='N',
                         help='decoder output dimension')
     parser.add_argument('--decoder-input-dim',
                         type=int,
                         metavar='N',
                         help='decoder input dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument(
         '--decoder-attention-heads',
         type=int,
         metavar='N',
         help='num decoder attention heads or LightConv/DynamicConv heads')
     parser.add_argument('--decoder-normalize-before',
                         default=False,
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
         'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
     parser.add_argument('--adaptive-softmax-factor',
                         type=float,
                         metavar='N',
                          help='adaptive softmax factor')
     parser.add_argument(
         '--no-token-positional-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, disables positional embeddings (outside self attention)')
     parser.add_argument('--share-decoder-input-output-embed',
                         default=False,
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument(
         '--character-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, uses character embedding convolutions to produce token embeddings'
     )
     parser.add_argument(
         '--character-filters',
         type=str,
         metavar='LIST',
         default=
         '[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
          help='list of (kernel width, number of filters) pairs for the character CNN')
     parser.add_argument('--character-embedding-dim',
                         type=int,
                         metavar='N',
                         default=4,
                         help='size of character embeddings')
     parser.add_argument(
         '--char-embedder-highway-layers',
         type=int,
         metavar='N',
         default=2,
          help='number of highway layers for character token embedder')
     parser.add_argument('--adaptive-input',
                         default=False,
                         action='store_true',
                         help='if set, uses adaptive input')
     parser.add_argument('--adaptive-input-factor',
                         type=float,
                         metavar='N',
                         help='adaptive input factor')
     parser.add_argument(
         '--adaptive-input-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive input cutoff points.')
     parser.add_argument(
         '--tie-adaptive-weights',
         action='store_true',
         help=
         'if set, ties the weights of adaptive softmax and adaptive input')
     parser.add_argument(
         '--tie-adaptive-proj',
         action='store_true',
         help=
         'if set, ties the projection weights of adaptive softmax and adaptive input'
     )
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
          help='use learned positional embeddings in the decoder')
      # LightConv and DynamicConv arguments
     parser.add_argument(
         '--decoder-kernel-size-list',
         type=lambda x: utils.eval_str_list(x, int),
          help='list of kernel sizes (default: "[3,7,15,31,31,31]")')
     parser.add_argument('--decoder-glu',
                         type=utils.eval_bool,
                         help='glu after in proj')
     parser.add_argument('--decoder-conv-type',
                         default='dynamic',
                         type=str,
                         choices=['dynamic', 'lightweight'],
                         help='type of convolution')
     parser.add_argument('--weight-softmax',
                         default=True,
                         type=utils.eval_bool)
     parser.add_argument('--weight-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for conv weights')
Code Example #20
def infer_init_method(cfg: DistributedTrainingConfig, force_distributed=False):
    if cfg.distributed_init_method is not None or cfg.tpu:
        return

    if cfg.pipeline_model_parallel:
        balance_exists = (cfg.pipeline_balance is not None
                          or cfg.pipeline_encoder_balance is not None
                          or cfg.pipeline_decoder_balance is not None)
        devices_exist = (cfg.pipeline_devices is not None
                         or cfg.pipeline_encoder_devices is not None
                         or cfg.pipeline_decoder_devices is not None)
        if not balance_exists:
            raise ValueError(
                "--pipeline-balance is currently required for pipeline model parallelism"
            )
        if not devices_exist:
            raise ValueError(
                "--pipeline-devices is currently required for pipeline model parallelism"
            )

        cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance,
                                                   type=int)
        if cfg.pipeline_devices is not None:
            cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices,
                                                       type=int)
            num_pipeline_devices = len(set(cfg.pipeline_devices))
        else:
            cfg.pipeline_encoder_devices = utils.eval_str_list(
                cfg.pipeline_encoder_devices, type=int)
            cfg.pipeline_decoder_devices = utils.eval_str_list(
                cfg.pipeline_decoder_devices, type=int)
            num_pipeline_devices = len(
                set(cfg.pipeline_encoder_devices +
                    cfg.pipeline_decoder_devices))
        gpus_per_node = torch.cuda.device_count()
        assert (
            gpus_per_node >= num_pipeline_devices
            and gpus_per_node % num_pipeline_devices == 0
        ), ("the number of unique device IDs in --pipeline-devices must evenly divide "
            "the number of GPUs per node (multi-node pipelining is not yet supported)"
            )
        num_pipelines_per_node = gpus_per_node // num_pipeline_devices

    # support torch.distributed.launch
    if all(key in os.environ
           for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"]):
        cfg.distributed_init_method = "env://"
        cfg.distributed_world_size = int(os.environ["WORLD_SIZE"])
        cfg.distributed_rank = int(os.environ["RANK"])
        # processes are created by torch.distributed.launch
        cfg.distributed_no_spawn = True

    # we can determine the init method automatically for Slurm
    elif cfg.distributed_port > 0:
        node_list = os.environ.get("SLURM_STEP_NODELIST")
        if node_list is None:
            node_list = os.environ.get("SLURM_JOB_NODELIST")
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(
                    ["scontrol", "show", "hostnames", node_list])
                cfg.distributed_init_method = "tcp://{host}:{port}".format(
                    host=hostnames.split()[0].decode("utf-8"),
                    port=cfg.distributed_port,
                )
                nnodes = int(os.environ.get("SLURM_NNODES"))
                ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE")
                if ntasks_per_node is not None:
                    ntasks_per_node = int(ntasks_per_node)
                else:
                    ntasks = int(os.environ.get("SLURM_NTASKS"))
                    nnodes = int(os.environ.get("SLURM_NNODES"))
                    assert ntasks % nnodes == 0
                    ntasks_per_node = int(ntasks / nnodes)
                if ntasks_per_node == 1:
                    gpus_per_node = torch.cuda.device_count()
                    node_id = int(os.environ.get("SLURM_NODEID"))
                    cfg.distributed_rank = node_id * gpus_per_node
                    cfg.distributed_world_size = nnodes * gpus_per_node
                elif cfg.pipeline_model_parallel:
                    assert ntasks_per_node == num_pipelines_per_node, (
                        "SLURM --ntasks-per-node must match number of pipelines per "
                        "node (={})".format(num_pipelines_per_node))
                    cfg.distributed_no_spawn = True
                    # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
                    # the first node, [2, 3] on the second node, etc. This
                    # matches torch.distributed.launch.
                    node_id = int(os.environ.get("SLURM_NODEID"))
                    local_id = int(os.environ.get("SLURM_LOCALID"))
                    cfg.distributed_rank = node_id * num_pipelines_per_node + local_id
                    # In the above example, device_id will always be in [0, 1],
                    # which also matches torch.distributed.launch.
                    cfg.device_id = local_id
                    # We also want to set distributed_world_size to be the total
                    # number of pipelines across all nodes.
                    cfg.distributed_world_size = nnodes * num_pipelines_per_node
                else:
                    assert ntasks_per_node == cfg.distributed_world_size // nnodes
                    cfg.distributed_no_spawn = True
                    cfg.distributed_rank = int(os.environ.get("SLURM_PROCID"))
                    cfg.device_id = int(os.environ.get("SLURM_LOCALID"))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError:  # Slurm is not installed
                pass

    elif cfg.distributed_world_size > 1 or force_distributed:
        # fallback for single node with multiple GPUs
        assert (
            cfg.distributed_world_size <= torch.cuda.device_count()
        ), f"world size is {cfg.distributed_world_size} but have {torch.cuda.device_count()} available devices"
        port = random.randint(10000, 20000)
        cfg.distributed_init_method = "tcp://localhost:{port}".format(
            port=port)

    if cfg.pipeline_model_parallel:
        if not cfg.distributed_no_spawn:
            # When distributed_no_spawn is False, we expect distributed_rank and
            # distributed_world_size to be based on the total number of GPUs, so
            # we need to correct them to be based on the number of pipelines.
            assert cfg.distributed_world_size % num_pipeline_devices == 0
            cfg.distributed_world_size = (cfg.distributed_world_size //
                                          num_pipeline_devices)
            # In the case of 4-way MP on nodes with 8 GPUs, we want
            # distributed_rank to be the starting GPU index for each pipeline
            # i.e., 0, 2, ...
            assert cfg.distributed_rank % gpus_per_node == 0
            assert cfg.distributed_rank % num_pipeline_devices == 0

            with open_dict(cfg):
                cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices
                # launch one process per pipeline
                cfg.distributed_num_procs = num_pipelines_per_node

        # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0
        # and 4, indicating the starting device IDs for each pipeline
        cfg.device_id *= num_pipeline_devices

        if cfg.device_id > 0:
            # if there are multiple pipelines on a node (e.g., 4-way MP on an 8
            # GPU node), we need to adjust pipeline_devices accordingly
            logger.debug("setting CUDA device={} on rank {}".format(
                cfg.device_id, cfg.distributed_rank))
            torch.cuda.set_device(cfg.device_id)
            with open_dict(cfg):
                cfg.pipeline_devices = [
                    cfg.device_id + d for d in cfg.pipeline_devices
                ]
            logger.info("setting pipeline_devices={} on rank {}".format(
                cfg.pipeline_devices, cfg.distributed_rank))
    elif not cfg.distributed_no_spawn:
        with open_dict(cfg):
            cfg.distributed_num_procs = min(torch.cuda.device_count(),
                                            cfg.distributed_world_size)
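As a sanity check on the rank and device arithmetic in infer_init_method above, the standalone sketch below (plain Python, no fairseq or CUDA required) re-derives the assignment for pipeline model parallelism: each node hosts gpus_per_node // num_pipeline_devices pipelines, ranks count pipelines rather than GPUs, and device_id points at the first GPU of each pipeline.

def pipeline_layout(nnodes, gpus_per_node, num_pipeline_devices):
    """Illustrative re-derivation of the rank/device assignment above."""
    assert gpus_per_node % num_pipeline_devices == 0
    num_pipelines_per_node = gpus_per_node // num_pipeline_devices
    world_size = nnodes * num_pipelines_per_node
    layout = []
    for node_id in range(nnodes):
        for local_id in range(num_pipelines_per_node):
            rank = node_id * num_pipelines_per_node + local_id
            device_id = local_id * num_pipeline_devices  # first GPU of the pipeline
            layout.append((rank, node_id, device_id, world_size))
    return layout

# 4-way MP on two 8-GPU nodes -> ranks [0, 1] on node 0, [2, 3] on node 1,
# with device_id 0 or 4 on every node and a world size of 4 pipelines.
for rank, node_id, device_id, world_size in pipeline_layout(2, 8, 4):
    print(rank, node_id, device_id, world_size)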
Code example #21
    def __init__(
        self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True
    ):
        super().__init__(dictionary)
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = (
            Linear(input_embed_dim, embed_dim, bias=False)
            if embed_dim != input_embed_dim
            else None
        )

        self.embed_positions = (
            PositionalEmbedding(
                args.max_target_positions,
                embed_dim,
                padding_idx,
                learned=args.decoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [
                LightConvDecoderLayer(
                    args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i]
                )
                for i in range(args.decoder_layers)
            ]
        )

        self.adaptive_softmax = None

        self.project_out_dim = (
            Linear(embed_dim, output_embed_dim, bias=False)
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights
            else None
        )

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim)
            )
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
        self.register_buffer("version", torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
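The project_in_dim / project_out_dim pattern in the constructor above inserts a bias-free Linear only when the token-embedding width differs from the decoder width. A toy illustration of that idiom, with made-up dimensions:

import torch
import torch.nn as nn

input_embed_dim, embed_dim, output_embed_dim = 512, 1024, 512  # example sizes

project_in_dim = (
    nn.Linear(input_embed_dim, embed_dim, bias=False)
    if embed_dim != input_embed_dim
    else None
)
project_out_dim = (
    nn.Linear(embed_dim, output_embed_dim, bias=False)
    if embed_dim != output_embed_dim
    else None
)

x = torch.randn(2, 7, input_embed_dim)   # (batch, time, input_embed_dim)
if project_in_dim is not None:
    x = project_in_dim(x)                # -> (batch, time, embed_dim)
# ... decoder layers would run here ...
if project_out_dim is not None:
    x = project_out_dim(x)               # -> (batch, time, output_embed_dim)
print(x.shape)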
Code example #22
File: lightconv_lm.py Project: veralily/fairseq
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument(
         "--dropout",
         default=0.1,
         type=float,
         metavar="D",
         help="dropout probability",
     )
     parser.add_argument(
         "--attention-dropout",
         default=0.0,
         type=float,
         metavar="D",
         help="dropout probability for attention weights",
     )
     parser.add_argument(
         "--relu-dropout",
         default=0.0,
         type=float,
         metavar="D",
         help="dropout probability after ReLU in FFN",
     )
     parser.add_argument(
         "--input-dropout",
         type=float,
         metavar="D",
         help="dropout probability of the inputs",
     )
     parser.add_argument(
         "--decoder-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension",
     )
     parser.add_argument(
         "--decoder-output-dim",
         type=int,
         metavar="N",
         help="decoder output dimension",
     )
     parser.add_argument("--decoder-input-dim",
                         type=int,
                         metavar="N",
                         help="decoder input dimension")
     parser.add_argument(
         "--decoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension for FFN",
     )
     parser.add_argument("--decoder-layers",
                         type=int,
                         metavar="N",
                         help="num decoder layers")
     parser.add_argument(
         "--decoder-attention-heads",
         type=int,
         metavar="N",
         help="num decoder attention heads or LightConv/DynamicConv heads",
     )
     parser.add_argument(
         "--decoder-normalize-before",
         default=False,
         action="store_true",
         help="apply layernorm before each decoder block",
     )
     parser.add_argument(
         "--adaptive-softmax-cutoff",
         metavar="EXPR",
         help="comma separated list of adaptive softmax cutoff points. "
         "Must be used with adaptive_loss criterion",
     )
     parser.add_argument(
         "--adaptive-softmax-dropout",
         type=float,
         metavar="D",
         help="sets adaptive softmax dropout for the tail projections",
     )
     parser.add_argument(
         "--adaptive-softmax-factor",
         type=float,
         metavar="N",
         help="adaptive input factor",
     )
     parser.add_argument(
         "--no-token-positional-embeddings",
         default=False,
         action="store_true",
         help=
         "if set, disables positional embeddings (outside self attention)",
     )
     parser.add_argument(
         "--share-decoder-input-output-embed",
         default=False,
         action="store_true",
         help="share decoder input and output embeddings",
     )
     parser.add_argument(
         "--character-embeddings",
         default=False,
         action="store_true",
         help=
         "if set, uses character embedding convolutions to produce token embeddings",
     )
     parser.add_argument(
         "--character-filters",
         type=str,
         metavar="LIST",
         default=
         "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
         help="list of character CNN filters as (width, num_filters) tuples",
     )
     parser.add_argument(
         "--character-embedding-dim",
         type=int,
         metavar="N",
         default=4,
         help="size of character embeddings",
     )
     parser.add_argument(
         "--char-embedder-highway-layers",
         type=int,
         metavar="N",
         default=2,
         help="number of highway layers for character token embedder",
     )
     parser.add_argument(
         "--adaptive-input",
         default=False,
         action="store_true",
         help="if set, uses adaptive input",
     )
     parser.add_argument(
         "--adaptive-input-factor",
         type=float,
         metavar="N",
         help="adaptive input factor",
     )
     parser.add_argument(
         "--adaptive-input-cutoff",
         metavar="EXPR",
         help="comma separated list of adaptive input cutoff points.",
     )
     parser.add_argument(
         "--tie-adaptive-weights",
         action="store_true",
         help=
         "if set, ties the weights of adaptive softmax and adaptive input",
     )
     parser.add_argument(
         "--tie-adaptive-proj",
         action="store_true",
         help=
         "if set, ties the projection weights of adaptive softmax and adaptive input",
     )
     parser.add_argument(
         "--decoder-learned-pos",
         action="store_true",
         help="use learned positional embeddings in the decoder",
     )
     """LightConv and DynamicConv arguments"""
     parser.add_argument(
         "--decoder-kernel-size-list",
         type=lambda x: utils.eval_str_list(x, int),
         help='list of kernel size (default: "[3,7,15,31,31,31]")',
     )
     parser.add_argument("--decoder-glu",
                         type=utils.eval_bool,
                         help="glu after in proj")
     parser.add_argument(
         "--decoder-conv-type",
         default="dynamic",
         type=str,
         choices=["dynamic", "lightweight"],
         help="type of convolution",
     )
     parser.add_argument("--weight-softmax",
                         default=True,
                         type=utils.eval_bool)
     parser.add_argument(
         "--weight-dropout",
         type=float,
         metavar="D",
         help="dropout probability for conv weights",
     )
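For readers unfamiliar with the add_args convention, the following self-contained snippet registers two of the options above with the same converter style and parses a sample command line; ast.literal_eval stands in here for fairseq's utils.eval_str_list / utils.eval_bool and is not the library's actual code:

import argparse
import ast

parser = argparse.ArgumentParser()
# Same converter style as the add_args above, with ast.literal_eval as a stand-in.
parser.add_argument("--decoder-kernel-size-list",
                    type=lambda x: [int(k) for k in ast.literal_eval(x)],
                    default=[3, 7, 15, 31, 31, 31])
parser.add_argument("--decoder-glu",
                    type=lambda x: bool(ast.literal_eval(x)),
                    default=True)

args = parser.parse_args(["--decoder-kernel-size-list", "[3,7,15]",
                          "--decoder-glu", "False"])
print(args.decoder_kernel_size_list, args.decoder_glu)   # [3, 7, 15] False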
Code example #23
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure that all args are properly defaulted (in case there are any new ones)
        base_architecture(args)

        if args.encoder_layers != args.decoder_layers:
            raise ValueError("--encoder-layers must match --decoder-layers")

        max_source_positions = getattr(
            args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
        )
        max_target_positions = getattr(
            args, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
        )

        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        if args.encoder_embed_path:
            pretrained_encoder_embed = load_pretrained_embedding_from_file(
                args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim
            )
        else:
            num_embeddings = len(task.source_dictionary)
            pretrained_encoder_embed = Embedding(
                num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
            )

        if args.share_all_embeddings:
            # double check all parameters combinations are valid
            if task.source_dictionary != task.target_dictionary:
                raise ValueError("--share-all-embeddings requires a joint dictionary")
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embeddings not compatible with --decoder-embed-path"
                )
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to "
                    "match --decoder-embed-dim"
                )
            pretrained_decoder_embed = pretrained_encoder_embed
            args.share_decoder_input_output_embed = True
        else:
            # separate decoder input embeddings
            pretrained_decoder_embed = None
            if args.decoder_embed_path:
                pretrained_decoder_embed = load_pretrained_embedding_from_file(
                    args.decoder_embed_path,
                    task.target_dictionary,
                    args.decoder_embed_dim,
                )
        # one last double check of parameter combinations
        if args.share_decoder_input_output_embed and (
            args.decoder_embed_dim != args.decoder_out_embed_dim
        ):
            raise ValueError(
                "--share-decoder-input-output-embed requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim"
            )

        if args.encoder_freeze_embed:
            pretrained_encoder_embed.weight.requires_grad = False
        if args.decoder_freeze_embed:
            pretrained_decoder_embed.weight.requires_grad = False

        encoder = LSTMEncoder(
            dictionary=task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            hidden_size=args.encoder_hidden_size,
            num_layers=args.encoder_layers,
            dropout_in=args.encoder_dropout_in,
            dropout_out=args.encoder_dropout_out,
            bidirectional=args.encoder_bidirectional,
            pretrained_embed=pretrained_encoder_embed,
            max_source_positions=max_source_positions,
        )
        decoder = LSTMDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=utils.eval_bool(args.decoder_attention),
            encoder_output_units=encoder.output_units,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == "adaptive_loss"
                else None
            ),
            max_target_positions=max_target_positions,
            residuals=False,
        )
        return cls(encoder, decoder)
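load_pretrained_embedding_from_file above delegates to fairseq's utils.parse_embedding / utils.load_embedding. The sketch below approximates the same idea with plain PyTorch, copying vectors from word2vec-style "token v1 v2 ..." lines into an nn.Embedding for whatever tokens overlap a toy vocabulary; the format and helper name are illustrative only:

import torch
import torch.nn as nn

def load_text_embeddings(lines, vocab, embed_dim):
    """Copy pretrained vectors into an nn.Embedding for overlapping tokens.

    `lines` are "token v1 v2 ... vD" strings; `vocab` maps token -> index.
    """
    emb = nn.Embedding(len(vocab), embed_dim, padding_idx=0)
    hits = 0
    for line in lines:
        parts = line.rstrip().split(" ")
        token, values = parts[0], [float(v) for v in parts[1:]]
        if token in vocab and len(values) == embed_dim:
            with torch.no_grad():
                emb.weight[vocab[token]] = torch.tensor(values)
            hits += 1
    print(f"loaded {hits}/{len(vocab)} pretrained vectors")
    return emb

vocab = {"<pad>": 0, "hello": 1, "world": 2}
pretrained = ["hello 0.1 0.2 0.3", "world 0.4 0.5 0.6", "unused 0.7 0.8 0.9"]
emb = load_text_embeddings(pretrained, vocab, embed_dim=3)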
Code example #24
File: lstm_lm.py Project: eastonYi/fairseq
    def build_model(cls, args, task, dictionary=None):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_architecture(args)

        if getattr(args, 'max_target_positions', None) is not None:
            max_target_positions = args.max_target_positions
        else:
            max_target_positions = getattr(args, 'tokens_per_sample', DEFAULT_MAX_TARGET_POSITIONS)

        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path,
                task.target_dictionary,
                args.decoder_embed_dim
            )

        if args.share_decoder_input_output_embed:
            # double check all parameters combinations are valid
            if task.source_dictionary != task.target_dictionary:
                raise ValueError('--share-decoder-input-output-embed requires a joint dictionary')

            if args.decoder_embed_dim != args.decoder_out_embed_dim:
                raise ValueError(
                    '--share-decoder-input-output-embed requires '
                    '--decoder-embed-dim to match --decoder-out-embed-dim'
                    )

        decoder = LSTMDecoder(
            dictionary=dictionary if dictionary else task.dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=False,  # decoder-only language model doesn't support attention
            encoder_output_units=0,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == 'adaptive_loss' else None
            ),
            max_target_positions=max_target_positions,
            residuals=args.residuals
        )

        if getattr(args, "lm_path", None):
            # args.lm_path = '../libri/wav2vec2_small.pt'
            print('load LSTM_LM from {}'.format(args.lm_path))
            state = checkpoint_utils.load_checkpoint_to_cpu(args.lm_path)
            lm_args = state["args"]
            lm_args.data = args.data
            assert getattr(lm_args, "lm_path", None) is None

            task = tasks.setup_task(lm_args)
            decoder = task.build_model(lm_args)
            print('restore LSTM_LM from {}'.format(args.lm_path))
            decoder.load_state_dict(state["model"], strict=True)
        decoder.dim_output = len(task.dictionary)

        return cls(decoder)
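The checkpoint handling at the end of the example above goes through fairseq's checkpoint_utils; the underlying pattern is loading a saved state dict onto CPU and restoring it strictly, which plain PyTorch can demonstrate on its own (the file name and module here are invented for the sketch):

import torch
import torch.nn as nn

# Save a tiny "language model" checkpoint in the same {"model": state_dict} shape.
lm = nn.Linear(16, 100)
torch.save({"model": lm.state_dict()}, "toy_lm.pt")

# Load it back onto CPU regardless of where it was trained, then restore strictly
# so any missing or unexpected keys raise immediately.
state = torch.load("toy_lm.pt", map_location="cpu")
restored = nn.Linear(16, 100)
restored.load_state_dict(state["model"], strict=True)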
Code example #25
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--dropout", type=float, metavar="D", help="dropout probability"
        )
        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--relu-dropout",
            type=float,
            metavar="D",
            help="dropout probability after ReLU in FFN",
        )
        parser.add_argument(
            "--input-dropout",
            type=float,
            metavar="D",
            help="dropout probability of the inputs",
        )
        parser.add_argument(
            "--encoder-embed-path",
            type=str,
            metavar="STR",
            help="path to pre-trained encoder embedding",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-conv-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
        )
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads or LightConv/DynamicConv heads",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            action="store_true",
            help="apply layernorm before each encoder block",
        )
        parser.add_argument(
            "--encoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the encoder",
        )
        parser.add_argument(
            "--decoder-embed-path",
            type=str,
            metavar="STR",
            help="path to pre-trained decoder embedding",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-conv-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
        )
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads or LightConv/DynamicConv heads",
        )
        parser.add_argument(
            "--decoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the decoder",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--share-all-embeddings",
            action="store_true",
            help="share encoder, decoder and output embeddings"
            " (requires shared dictionary and embed dim)",
        )
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion",
        )
        parser.add_argument(
            "--adaptive-softmax-dropout",
            type=float,
            metavar="D",
            help="sets adaptive softmax dropout for the tail projections",
        )

        """LightConv and DynamicConv arguments"""
        parser.add_argument(
            "--encoder-kernel-size-list",
            type=lambda x: utils.eval_str_list(x, int),
            help='list of kernel size (default: "[3,7,15,31,31,31,31]")',
        )
        parser.add_argument(
            "--decoder-kernel-size-list",
            type=lambda x: utils.eval_str_list(x, int),
            help='list of kernel size (default: "[3,7,15,31,31,31]")',
        )
        parser.add_argument(
            "--encoder-glu", type=utils.eval_bool, help="glu after in proj"
        )
        parser.add_argument(
            "--decoder-glu", type=utils.eval_bool, help="glu after in proj"
        )
        parser.add_argument(
            "--encoder-conv-type",
            default="dynamic",
            type=str,
            choices=["dynamic", "lightweight"],
            help="type of convolution",
        )
        parser.add_argument(
            "--decoder-conv-type",
            default="dynamic",
            type=str,
            choices=["dynamic", "lightweight"],
            help="type of convolution",
        )
        parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
        parser.add_argument(
            "--weight-dropout",
            type=float,
            metavar="D",
            help="dropout probability for conv weights",
        )
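One practical gotcha with --encoder-kernel-size-list / --decoder-kernel-size-list: the LightConv decoder in code example #21 indexes args.decoder_kernel_size_list[i] per layer, so the list length has to line up with the layer count. A small hedged helper one might add for that check (not part of fairseq):

def check_kernel_size_list(kernel_sizes, num_layers, name="decoder"):
    """Ensure one kernel size per layer; expand a single value if given."""
    if len(kernel_sizes) == 1:
        kernel_sizes = kernel_sizes * num_layers   # reuse the same kernel everywhere
    if len(kernel_sizes) != num_layers:
        raise ValueError(
            f"--{name}-kernel-size-list has {len(kernel_sizes)} entries "
            f"but --{name}-layers is {num_layers}")
    return kernel_sizes

print(check_kernel_size_list([3, 7, 15, 31, 31, 31], 6))   # used as given
print(check_kernel_size_list([31], 6))                     # expanded to 6 layers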
Code example #26
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--dropout", type=float, metavar="D",
                            help="dropout probability")
        parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR",
                            help="list of encoder convolution\'s out channels")
        parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR",
                            help="list of encoder convolution\'s kernel sizes")
        parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR",
                            help="list of encoder convolution\'s strides")
        parser.add_argument("--encoder-rnn-hidden-size", type=int, metavar="N",
                            help="encoder rnn\'s hidden size")
        parser.add_argument("--encoder-rnn-layers", type=int, metavar="N",
                            help="number of rnn encoder layers")
        parser.add_argument("--encoder-rnn-bidirectional",
                            type=lambda x: utils.eval_bool(x),
                            help="make all rnn layers of encoder bidirectional")
        parser.add_argument("--encoder-rnn-residual",
                            type=lambda x: utils.eval_bool(x),
                            help="create residual connections for rnn encoder "
                            "layers (starting from the 2nd layer), i.e., the actual "
                            "output of such layer is the sum of its input and output")
        parser.add_argument("--decoder-embed-dim", type=int, metavar="N",
                            help="decoder embedding dimension")
        parser.add_argument("--decoder-embed-path", type=str, metavar="STR",
                            help="path to pre-trained decoder embedding")
        parser.add_argument("--decoder-freeze-embed", action="store_true",
                            help="freeze decoder embeddings")
        parser.add_argument("--decoder-hidden-size", type=int, metavar="N",
                            help="decoder hidden size")
        parser.add_argument("--decoder-layers", type=int, metavar="N",
                            help="number of decoder layers")
        parser.add_argument("--decoder-out-embed-dim", type=int, metavar="N",
                            help="decoder output embedding dimension")
        parser.add_argument("--decoder-rnn-residual",
                            type=lambda x: utils.eval_bool(x),
                            help="create residual connections for rnn decoder "
                            "layers (starting from the 2nd layer), i.e., the actual "
                            "output of such layer is the sum of its input and output")
        parser.add_argument("--attention-type", type=str, metavar="STR",
                            choices=["bahdanau", "luong"],
                            help="attention type")
        parser.add_argument("--attention-dim", type=int, metavar="N",
                            help="attention dimension")
        parser.add_argument("--need-attention", action="store_true",
                            help="need to return attention tensor for the caller")
        parser.add_argument("--adaptive-softmax-cutoff", metavar="EXPR",
                            help="comma separated list of adaptive softmax cutoff points. "
                                 "Must be used with adaptive_loss criterion")
        parser.add_argument("--share-decoder-input-output-embed",
                            type=lambda x: utils.eval_bool(x),
                            help="share decoder input and output embeddings")
        parser.add_argument("--pretrained-lm-checkpoint", type=str, metavar="STR",
                            help="path to load checkpoint from pretrained language model (LM), "
                            "which will be present and kept fixed during training.")

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument("--encoder-rnn-dropout-in", type=float, metavar="D",
                            help="dropout probability for encoder rnn\'s input")
        parser.add_argument("--encoder-rnn-dropout-out", type=float, metavar="D",
                            help="dropout probability for encoder rnn\'s output")
        parser.add_argument("--decoder-dropout-in", type=float, metavar="D",
                            help="dropout probability for decoder input embedding")
        parser.add_argument("--decoder-dropout-out", type=float, metavar="D",
                            help="dropout probability for decoder output")

        # Scheduled sampling options
        parser.add_argument("--scheduled-sampling-probs", type=lambda p: utils.eval_str_list(p),
                            metavar="P_1,P_2,...,P_N", default=[1.0],
                            help="scheduled sampling probabilities of sampling the truth "
                            "labels for N epochs starting from --start-schedule-sampling-epoch; "
                            "all later epochs using P_N")
        parser.add_argument("--start-scheduled-sampling-epoch", type=int,
                            metavar="N", default=1,
                            help="start scheduled sampling from the specified epoch")
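The --scheduled-sampling-probs help text above describes a per-epoch schedule in which epochs beyond the end of the list keep using the last probability. A tiny illustrative helper makes that lookup explicit (the function and its start_epoch argument are invented for this sketch):

def scheduled_sampling_prob(probs, epoch, start_epoch=1):
    """Probability of feeding the ground-truth token at a given epoch.

    Before `start_epoch` the truth is always used (prob 1.0); afterwards the
    schedule walks through `probs` and sticks to the final entry.
    """
    if epoch < start_epoch:
        return 1.0
    idx = min(epoch - start_epoch, len(probs) - 1)
    return probs[idx]

probs = [0.9, 0.8, 0.7]
for epoch in range(1, 7):
    print(epoch, scheduled_sampling_prob(probs, epoch, start_epoch=1))
# epochs 1, 2, 3 -> 0.9, 0.8, 0.7; epochs 4 and later stay at 0.7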
Code example #27
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.decoder_layerdrop = args.decoder_layerdrop
        self.only_drop_topk = args.only_drop_topk
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = (PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.cross_self_attention = getattr(args, "cross_self_attention",
                                            False)

        if self.decoder_layerdrop > 0.0:
            if self.only_drop_topk > 0:
                self.layers = PartLayerDropModuleList(
                    p=self.decoder_layerdrop,
                    top_k=self.only_drop_topk,
                    layer_num=args.decoder_layers)
            else:
                self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_decoder_layer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])
        self.num_layers = len(self.layers)

        if args.decoder_normalize_before and not getattr(
                args, "no_decoder_final_norm", False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False)
                                if embed_dim != self.output_embed_dim
                                and not args.tie_adaptive_weights else None)

        self.adaptive_softmax = None
        self.output_projection = None
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(self.output_embed_dim,
                                               len(dictionary),
                                               bias=False)
            nn.init.normal_(self.output_projection.weight,
                            mean=0,
                            std=self.output_embed_dim**-0.5)
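The share_input_output_embed branch above ties the output projection to the token-embedding matrix. Stripped of fairseq specifics, the tying pattern is an nn.Linear whose weight is the embedding weight, as in this minimal sketch:

import torch
import torch.nn as nn

vocab_size, embed_dim = 1000, 64
embed_tokens = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

# Tied output projection: logits are computed against the embedding matrix,
# so decoder input and output embeddings share the same parameters.
output_projection = nn.Linear(embed_dim, vocab_size, bias=False)
output_projection.weight = embed_tokens.weight

hidden = torch.randn(2, 7, embed_dim)      # (batch, time, embed_dim)
logits = output_projection(hidden)         # (batch, time, vocab_size)
assert output_projection.weight.data_ptr() == embed_tokens.weight.data_ptr()
print(logits.shape)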
Code example #28
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--relu-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability after ReLU in FFN')
     parser.add_argument('--input-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability of the inputs')
     parser.add_argument('--encoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-conv-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-layers',
                         type=int,
                         metavar='N',
                         help='num encoder layers')
     parser.add_argument(
         '--encoder-attention-heads',
         type=int,
         metavar='N',
         help='num encoder attention heads or LightConv/DynamicConv heads')
     parser.add_argument('--encoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument(
         '--encoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-conv-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument(
         '--decoder-attention-heads',
         type=int,
         metavar='N',
         help='num decoder attention heads or LightConv/DynamicConv heads')
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings',
                         action='store_true',
                         help='share encoder, decoder and output embeddings'
                         ' (requires shared dictionary and embed dim)')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
         'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
     """LightConv and DynamicConv arguments"""
     parser.add_argument(
         '--encoder-kernel-size-list',
         type=lambda x: utils.eval_str_list(x, int),
         help='list of kernel size (default: "[3,7,15,31,31,31,31]")')
     parser.add_argument(
         '--decoder-kernel-size-list',
         type=lambda x: utils.eval_str_list(x, int),
         help='list of kernel size (default: "[3,7,15,31,31,31]")')
     parser.add_argument('--encoder-glu',
                         type=utils.eval_bool,
                         help='glu after in proj')
     parser.add_argument('--decoder-glu',
                         type=utils.eval_bool,
                         help='glu after in proj')
     parser.add_argument('--encoder-conv-type',
                         default='dynamic',
                         type=str,
                         choices=['dynamic', 'lightweight'],
                         help='type of convolution')
     parser.add_argument('--decoder-conv-type',
                         default='dynamic',
                         type=str,
                         choices=['dynamic', 'lightweight'],
                         help='type of convolution')
     parser.add_argument('--weight-softmax',
                         default=True,
                         type=utils.eval_bool)
     parser.add_argument('--weight-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for conv weights')
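Finally, the --adaptive-softmax-cutoff strings that recur throughout these examples (e.g. "10000,50000,200000") split the vocabulary by frequency rank into a head and progressively larger tail clusters. The snippet below only illustrates that partitioning; the real AdaptiveSoftmax module additionally learns reduced-dimension projections for the tail clusters:

def cutoff_clusters(cutoffs, vocab_size):
    """Turn a cutoff list like [10000, 50000, 200000] into (start, end) ranges."""
    bounds = [0] + [c for c in cutoffs if c < vocab_size] + [vocab_size]
    return list(zip(bounds[:-1], bounds[1:]))

print(cutoff_clusters([10000, 50000, 200000], vocab_size=250000))
# [(0, 10000), (10000, 50000), (50000, 200000), (200000, 250000)]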