Code example #1
def transformer_lm_gpt3_175(args):
    # 175B params
    args.decoder_layers = safe_getattr(args, "decoder_layers", 96)
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 12288)
    args.decoder_attention_heads = safe_getattr(args,
                                                "decoder_attention_heads", 96)
    base_gpt3_architecture(args)
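Every example on this page follows the same pattern: args.<name> = safe_getattr(args, "<name>", default), so a value already present on the namespace wins and the default only fills in a missing (or None) attribute. For reference, a minimal sketch of that contract (not the fairseq implementation itself) looks like this:

def safe_getattr(obj, k, default=None):
    """Return obj.k if it exists and is not None, else the default."""
    # Assumption: mirrors the None-aware behaviour the examples on this page rely on.
    v = getattr(obj, k, None)
    return v if v is not None else default

def safe_hasattr(obj, k):
    """Return True only if obj.k exists and is not None."""
    return getattr(obj, k, None) is not None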
Code example #2
def transformer_lm_gpt3_6_7(args):
    # 6.7B params
    args.decoder_layers = safe_getattr(args, "decoder_layers", 32)
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 4096)
    args.decoder_attention_heads = safe_getattr(args,
                                                "decoder_attention_heads", 32)
    base_gpt3_architecture(args)
Code example #3
def transformer_lm_gpt3_13(args):
    # 13B params
    args.decoder_layers = safe_getattr(args, "decoder_layers", 40)
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 5120)
    args.decoder_attention_heads = safe_getattr(args,
                                                "decoder_attention_heads", 40)
    base_gpt3_architecture(args)
Code example #4
def transformer_lm_gpt3_large(args):
    # 760M params
    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1536)
    args.decoder_attention_heads = safe_getattr(args,
                                                "decoder_attention_heads", 16)
    base_gpt3_architecture(args)
Code example #5
def transformer_lm_gpt3_xl(args):
    # 1.3B params
    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2048)
    args.decoder_attention_heads = safe_getattr(args,
                                                "decoder_attention_heads", 32)
    base_gpt3_architecture(args)
Code example #6
def transformer_lm_gpt3_small(args):
    # 125M params
    args.decoder_layers = safe_getattr(args, "decoder_layers", 12)
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 768)
    args.decoder_attention_heads = safe_getattr(args,
                                                "decoder_attention_heads", 12)
    base_gpt3_architecture(args)
Code example #7
def transformer_lm_baevski_gbw(args):
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 512)
    args.dropout = safe_getattr(args, "dropout", 0.1)
    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
    args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm",
                                              True)
    transformer_lm_big(args)
Code example #8
def transformer_lm_big(args):
    args.decoder_layers = safe_getattr(args, "decoder_layers", 12)
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024)
    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim",
                                              4096)
    args.decoder_attention_heads = safe_getattr(args,
                                                "decoder_attention_heads", 16)
    base_lm_architecture(args)
Code example #9
def xlm_architecture(args):
    args.encoder_layers = safe_getattr(args, "encoder_layers", 16)
    args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 1280)
    args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim",
                                              1280 * 4)
    args.encoder_attention_heads = safe_getattr(args,
                                                "encoder_attention_heads", 16)
    base_architecture(args)
Code example #10
def roberta_large_architecture(args):
    args.encoder_layers = safe_getattr(args, "encoder_layers", 24)
    args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 1024)
    args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim",
                                              4096)
    args.encoder_attention_heads = safe_getattr(args,
                                                "encoder_attention_heads", 16)
    base_architecture(args)
Code example #11
File: transformer_lm.py Project: sdadas/fairseq
def base_gpt3_architecture(args):
    args.decoder_input_dim = args.decoder_embed_dim
    args.decoder_output_dim = args.decoder_embed_dim
    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", args.decoder_embed_dim * 4)
    # GPT-3 used learned positional embeddings, rather than sinusoidal
    args.decoder_learned_pos = safe_getattr(args, "decoder_learned_pos", True)
    args.dropout = safe_getattr(args, "dropout", 0.0)
    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.0)
    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
    args.share_decoder_input_output_embed = True
    base_lm_architecture(args)
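The GPT-3 presets above all cascade into base_gpt3_architecture and then base_lm_architecture (code example #22), so unset fields pick up defaults while anything the user set explicitly is preserved. A hypothetical call, assuming the functions on this page and the safe_getattr/safe_hasattr sketch above are in scope together:

from argparse import Namespace

args = Namespace(decoder_layers=6)      # explicit user override
transformer_lm_gpt3_small(args)         # fills in the remaining GPT-3 "small" defaults
assert args.decoder_layers == 6         # the override survives (safe_getattr keeps it)
assert args.decoder_embed_dim == 768    # unset field falls back to the preset default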
Code example #12
File: transformer_lm.py Project: sdadas/fairseq
    def build_model(cls, args, task):
        """Build a new model instance."""

        if args.decoder_layers_to_keep:
            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))

        if safe_getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = safe_getattr(
                args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
            )

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.source_dictionary,
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.source_dictionary),
                task.source_dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_input_cutoff, type=int),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            embed_tokens = cls.build_embedding(
                args, task.source_dictionary, args.decoder_input_dim
            )

        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert (
                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
            ), "{} != {}".format(
                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
            )
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = TransformerDecoder(
            args, task.target_dictionary, embed_tokens, no_encoder_attn=True
        )
        return cls(decoder)
Code example #13
 def __setattr__(self, name, value):
     match = re.match(_NAME_PARSER, name)
     if match:
         sub = safe_getattr(self, match[1])
         setattr(sub, match[2], value)
     else:
         super().__setattr__(name, value)
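_NAME_PARSER itself is not shown on this page; judging by how __setattr__ here and __getattr__ in code example #19 use it, it is a regex that splits a flat attribute name into a sub-config name and a field name, presumably something along the lines of the sketch below (the exact pattern in fairseq may differ):

import re

# Hypothetical pattern: "decoder_embed_dim" -> groups ("decoder", "embed_dim"),
# so flat attribute access can be routed to the nested sub-config.
_NAME_PARSER = r"(decoder|encoder|quant_noise)_(.*)"

m = re.match(_NAME_PARSER, "decoder_embed_dim")
print(m[1], m[2])  # decoder embed_dim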
Code example #14
 def _copy_keys(args, cls, prefix, seen):
     """
     Copy the prefixed namespace keys (e.g. decoder_embed_dim) onto the corresponding DC fields (e.g. decoder.embed_dim).
     """
     cfg = cls()
     for fld in fields(cls):
         # for all the fields in the DC, find the fields (e.g. embed_dim)
         # in the namespace with the prefix (e.g. decoder)
         # and set it on the dc.
         args_key = f"{prefix}_{fld.name}"
         if safe_hasattr(args, args_key):
             seen.add(args_key)
             setattr(cfg, fld.name, safe_getattr(args, args_key))
         if safe_hasattr(args, fld.name):
             seen.add(fld.name)
             setattr(cfg, fld.name, safe_getattr(args, fld.name))
     return cfg
Code example #15
 def from_namespace(cls, args):
     if args is None:
         return None
     if not isinstance(args, cls):
         seen = set()
         config = cls()
         # currently, we can go generically from DC fields to args hierarchically
         # but we can't easily deconstruct a flat namespace to a hierarchical
         # DC. Mostly because we could have a sub-dc called `decoder-foo` that should not
         # go to the sub struct called `decoder`. There are ways to go around this, but let's keep it simple
         # for now.
         for fld in fields(cls):
             # concretely, the transformer config knows which sub-DCs it has, so we go through all the DC fields
             # and, if a field corresponds to a sub-DC, we build that sub-DC with `_copy_keys()`
             if fld.name == "decoder":
                 if safe_hasattr(args, "decoder"):
                     #  in some cases, the args we receive is already structured (as DictConfigs), so let's just build the correct DC
                     seen.add("decoder")
                     config.decoder = DecoderConfig(**args.decoder)
                 else:
                     config.decoder = cls._copy_keys(
                         args, DecoderConfig, "decoder", seen)
             elif fld.name == "encoder":
                 # same but for encoder
                 if safe_hasattr(args, "encoder"):
                     seen.add("encoder")
                     config.encoder = EncDecBaseConfig(**args.encoder)
                 else:
                     config.encoder = cls._copy_keys(
                         args, EncDecBaseConfig, "encoder", seen)
             elif fld.name == "quant_noise":
                 # same but for quant_noise
                 if safe_hasattr(args, "quant_noise"):
                     seen.add("quant_noise")
                     config.quant_noise = QuantNoiseConfig(
                         **args.quant_noise)
                 else:
                     config.quant_noise = cls._copy_keys(
                         args, QuantNoiseConfig, "quant_noise", seen)
             elif safe_hasattr(args, fld.name):
                 # if it's not a structure field, it's just a normal field, copy it over
                 seen.add(fld.name)
                 setattr(config, fld.name, safe_getattr(args, fld.name))
         # we got all the fields defined in the dataclass, but
         # the argparse namespace might have extra args for two reasons:
         #   - we are in a legacy class so all the args are not declared in the dataclass. Ideally once everyone has defined a dataclass for their model, we won't need this
         #   - some places expect args to be there but never define them
         args_dict = (args._asdict() if safe_hasattr(args, "_asdict") else
                      vars(args) if safe_hasattr(args, "__dict__") else {}
                      )  # namedtuple doesn't have __dict__ :-/
         for key, value in args_dict.items():
             if key not in seen:
                 setattr(config, key, value)
         return config
     else:
         return args
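Hypothetical usage of from_namespace (the enclosing class is assumed here to be fairseq's TransformerConfig): a flat legacy Namespace is folded into the hierarchical dataclass, so prefixed keys land on the matching sub-config and everything else is copied over as-is.

from argparse import Namespace

legacy_args = Namespace(decoder_embed_dim=1024, encoder_layers=6, dropout=0.3)
cfg = TransformerConfig.from_namespace(legacy_args)   # assumed class name
print(cfg.decoder.embed_dim)  # 1024, routed via the "decoder" prefix by _copy_keys
print(cfg.dropout)            # 0.3, copied as a plain top-level field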
Code example #16
File: transformer_lm.py Project: sdadas/fairseq
def transformer_lm_gpt2_big(args):
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1600)
    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 6400)
    args.decoder_layers = safe_getattr(args, "decoder_layers", 48)
    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 25)
    args.dropout = safe_getattr(args, "dropout", 0.1)
    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
    base_lm_architecture(args)
Code example #17
File: transformer_lm.py Project: sdadas/fairseq
def transformer_lm_gpt2_medium(args):
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1280)
    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 5120)
    args.decoder_layers = safe_getattr(args, "decoder_layers", 36)
    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 20)
    args.dropout = safe_getattr(args, "dropout", 0.1)
    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
    base_lm_architecture(args)
Code example #18
 def build_self_attention_selection(self,
                                    embed_dim,
                                    args,
                                    self_attn_head_selector=None,
                                    add_bias_kv=False,
                                    add_zero_attn=False):
     return MultiheadAttentionSelection(
         embed_dim,
         args.total_decoder_attention_heads,
         args.decoder_attention_heads,
         dropout=args.attention_dropout,
         add_bias_kv=add_bias_kv,
         add_zero_attn=add_zero_attn,
         self_attention=not safe_getattr(args, "cross_self_attention"),
         q_noise=self.quant_noise,
         qn_block_size=self.quant_noise_block_size,
         layer_idx=self.layer_idx,
         attn_head_selector=self_attn_head_selector,
     )
Code example #19
 def __getattr__(self, name):
     match = re.match(_NAME_PARSER, name)
     if match:
         sub = safe_getattr(self, match[1])
         return safe_getattr(sub, match[2])
     raise AttributeError(f"invalid argument {name}.")
Code example #20
def base_architecture(args):
    args.encoder_layers = safe_getattr(args, "encoder_layers", 12)
    args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 768)
    args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim",
                                              3072)
    args.encoder_attention_heads = safe_getattr(args,
                                                "encoder_attention_heads", 12)

    args.dropout = safe_getattr(args, "dropout", 0.1)
    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
    args.activation_dropout = safe_getattr(args, "activation_dropout", 0.0)
    args.pooler_dropout = safe_getattr(args, "pooler_dropout", 0.0)

    args.max_source_positions = safe_getattr(args, "max_positions", 512)
    args.no_token_positional_embeddings = safe_getattr(
        args, "no_token_positional_embeddings", False)

    # BERT has a few structural differences compared to the original Transformer
    args.encoder_learned_pos = safe_getattr(args, "encoder_learned_pos", True)
    args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", True)
    args.no_scale_embedding = safe_getattr(args, "no_scale_embedding", True)
    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
    args.encoder_normalize_before = safe_getattr(args,
                                                 "encoder_normalize_before",
                                                 False)
    args.pooler_activation_fn = safe_getattr(args, "pooler_activation_fn",
                                             "tanh")
    args.untie_weights_roberta = safe_getattr(args, "untie_weights_roberta",
                                              False)

    # Adaptive input config
    args.adaptive_input = safe_getattr(args, "adaptive_input", False)

    # LayerDrop config
    args.encoder_layerdrop = safe_getattr(args, "encoder_layerdrop", 0.0)
    args.encoder_layers_to_keep = safe_getattr(args, "encoder_layers_to_keep",
                                               None)

    # Quantization noise config
    args.quant_noise_pq = safe_getattr(args, "quant_noise_pq", 0)
    args.quant_noise_pq_block_size = safe_getattr(args,
                                                  "quant_noise_pq_block_size",
                                                  8)
    args.quant_noise_scalar = safe_getattr(args, "quant_noise_scalar", 0)

    # R4F config
    args.spectral_norm_classification_head = safe_getattr(
        args, "spectral_norm_classification_head", False)
Code example #21
File: transformer_lm.py Project: sdadas/fairseq
def transformer_lm_baevski_wiki103(args):
    args.decoder_layers = safe_getattr(args, "decoder_layers", 16)
    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 8)
    args.dropout = safe_getattr(args, "dropout", 0.3)
    args.adaptive_input = safe_getattr(args, "adaptive_input", True)
    args.tie_adaptive_weights = safe_getattr(args, "tie_adaptive_weights", True)
    args.adaptive_input_cutoff = safe_getattr(args, "adaptive_input_cutoff", "20000,60000")
    args.adaptive_softmax_cutoff = safe_getattr(
        args, "adaptive_softmax_cutoff", "20000,60000"
    )
    args.adaptive_softmax_dropout = safe_getattr(args, "adaptive_softmax_dropout", 0.2)
    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
    args.activation_dropout = safe_getattr(args, "activation_dropout", 0.1)
    args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", True)
    args.tie_adaptive_proj = safe_getattr(args, "tie_adaptive_proj", True)
    transformer_lm_big(args)
Code example #22
File: transformer_lm.py Project: sdadas/fairseq
def base_lm_architecture(args):
    # backward compatibility for older model checkpoints
    if safe_hasattr(args, "no_tie_adaptive_proj"):
        # previous models defined --no-tie-adaptive-proj, so use the existence of
        # that option to determine if this is an "old" model checkpoint
        args.no_decoder_final_norm = True  # old models always set this to True
        if args.no_tie_adaptive_proj is False:
            args.tie_adaptive_proj = True
    if safe_hasattr(args, "decoder_final_norm"):
        args.no_decoder_final_norm = not args.decoder_final_norm

    args.dropout = safe_getattr(args, "dropout", 0.1)
    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.0)

    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 512)
    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 2048)
    args.decoder_layers = safe_getattr(args, "decoder_layers", 6)
    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 8)
    args.adaptive_softmax_cutoff = safe_getattr(args, "adaptive_softmax_cutoff", None)
    args.adaptive_softmax_dropout = safe_getattr(args, "adaptive_softmax_dropout", 0)
    args.adaptive_softmax_factor = safe_getattr(args, "adaptive_softmax_factor", 4)
    args.decoder_learned_pos = safe_getattr(args, "decoder_learned_pos", False)
    args.activation_fn = safe_getattr(args, "activation_fn", "relu")

    args.decoder_layerdrop = safe_getattr(args, "decoder_layerdrop", 0)
    args.decoder_layers_to_keep = safe_getattr(args, "decoder_layers_to_keep", None)
    args.quant_noise_pq = safe_getattr(args, "quant_noise_pq", 0)
    args.quant_noise_pq_block_size = safe_getattr(args, "quant_noise_pq_block_size", 8)
    args.quant_noise_scalar = safe_getattr(args, "quant_noise_scalar", 0)

    args.base_layers = safe_getattr(args, "base_layers", 0)
    args.base_sublayers = safe_getattr(args, "base_sublayers", 1)
    args.base_shuffle = safe_getattr(args, "base_shuffle", False)

    args.add_bos_token = safe_getattr(args, "add_bos_token", False)
    args.no_token_positional_embeddings = safe_getattr(
        args, "no_token_positional_embeddings", False
    )
    args.share_decoder_input_output_embed = safe_getattr(
        args, "share_decoder_input_output_embed", False
    )
    args.character_embeddings = safe_getattr(args, "character_embeddings", False)

    args.decoder_output_dim = safe_getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = safe_getattr(args, "decoder_input_dim", args.decoder_embed_dim)

    # Model training is not stable without this
    args.decoder_normalize_before = True
    args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", False)

    args.adaptive_input = safe_getattr(args, "adaptive_input", False)
    args.adaptive_input_factor = safe_getattr(args, "adaptive_input_factor", 4)
    args.adaptive_input_cutoff = safe_getattr(args, "adaptive_input_cutoff", None)

    args.tie_adaptive_weights = safe_getattr(args, "tie_adaptive_weights", False)
    args.tie_adaptive_proj = safe_getattr(args, "tie_adaptive_proj", False)

    args.no_scale_embedding = safe_getattr(args, "no_scale_embedding", False)
    args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", False)
    args.checkpoint_activations = safe_getattr(args, "checkpoint_activations", False)
    args.offload_activations = safe_getattr(args, "offload_activations", False)
    if args.offload_activations:
        args.checkpoint_activations = True
Code example #23
def roberta_prenorm_architecture(args):
    args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", False)
    args.encoder_normalize_before = safe_getattr(args,
                                                 "encoder_normalize_before",
                                                 True)
    base_architecture(args)
Code example #24
    def __init__(
        self, cfg, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
    ):
        super().__init__()
        self.embed_dim = cfg.decoder.embed_dim
        self.dropout_module = FairseqDropout(
            cfg.dropout, module_name=self.__class__.__name__
        )
        self.quant_noise = cfg.quant_noise.pq
        self.quant_noise_block_size = cfg.quant_noise.pq_block_size

        self.cross_self_attention = cfg.cross_self_attention

        self.self_attn = self.build_self_attention(
            self.embed_dim,
            cfg,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
        )
        self.attn_ln = (
            LayerNorm(self.embed_dim)
            if utils.safe_getattr(cfg, "scale_attn", False)
            else None
        )
        self.nh = self.self_attn.num_heads
        self.head_dim = self.self_attn.head_dim
        scale_heads = utils.safe_getattr(cfg, "scale_heads", False)
        self.c_attn = (
            nn.Parameter(torch.ones((self.nh,)), requires_grad=True)
            if scale_heads
            else None
        )

        self.activation_fn = utils.get_activation_fn(activation=cfg.activation_fn)
        activation_dropout_p = cfg.activation_dropout
        if activation_dropout_p == 0:
            # for backwards compatibility with models that use cfg.relu_dropout
            activation_dropout_p = cfg.relu_dropout or 0
        self.activation_dropout_module = FairseqDropout(
            float(activation_dropout_p), module_name=self.__class__.__name__
        )
        self.normalize_before = cfg.decoder.normalize_before

        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = self.build_encoder_attention(self.embed_dim, cfg)
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)

        self.ffn_layernorm = (
            LayerNorm(cfg.decoder.ffn_embed_dim)
            if utils.safe_getattr(cfg, "scale_fc", False)
            else None
        )
        self.w_resid = (
            nn.Parameter(
                torch.ones(
                    self.embed_dim,
                ),
                requires_grad=True,
            )
            if utils.safe_getattr(cfg, "scale_resids", False)
            else None
        )

        self.fc1 = self.build_fc1(
            self.embed_dim,
            cfg.decoder.ffn_embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )
        self.fc2 = self.build_fc2(
            cfg.decoder.ffn_embed_dim,
            self.embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )

        self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
        self.need_attn = True

        self.onnx_trace = False
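In this layer constructor, safe_getattr guards the optional scaling flags (scale_attn, scale_heads, scale_fc, scale_resids): when a config does not define them they fall back to False and the corresponding module stays None, which keeps older configs and checkpoints loadable unchanged. A minimal illustration of that pattern, assuming the safe_getattr sketch from earlier is in scope:

from types import SimpleNamespace
import torch.nn as nn

cfg = SimpleNamespace(decoder=SimpleNamespace(embed_dim=512))  # no scale_attn defined
attn_ln = (
    nn.LayerNorm(cfg.decoder.embed_dim)
    if safe_getattr(cfg, "scale_attn", False)
    else None
)
print(attn_ln)  # None: the flag was never set, so the optional LayerNorm is skipped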