Example #1
    def build_model(cls, args, task):
        """Build a new model instance."""
        if not has_megatron_submodule:
            raise ImportError('\n\nPlease install the megatron submodule:'
                              '\n\n  git submodule update --init '
                              'fairseq/model_parallel/megatron')

        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if args.decoder_layers_to_keep:
            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))

        if getattr(args, 'max_target_positions', None) is None:
            args.max_target_positions = getattr(args, 'tokens_per_sample',
                                                DEFAULT_MAX_TARGET_POSITIONS)

        if args.character_embeddings:
            raise NotImplementedError(
                "Character embeddings is not supported for model parallel")
        elif args.adaptive_input:
            raise NotImplementedError(
                "Adaptive input is not supported for model parallel")
        else:
            embed_tokens = cls.build_embedding(args, task.source_dictionary,
                                               args.decoder_input_dim)

        decoder = ModelParallelTransformerDecoder(
            args,
            task.target_dictionary,
            embed_tokens,
            no_encoder_attn=True,
        )
        return cls(decoder)
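The decoder_layers_to_keep handling above simply recounts layers from a comma-separated list of layer indices. A minimal stdlib sketch of that convention (the value "0,2,4" is only an illustration, not a fairseq default):

from argparse import Namespace

args = Namespace(decoder_layers_to_keep="0,2,4", decoder_layers=6)
if args.decoder_layers_to_keep:
    # keep only the listed decoder layers; decoder_layers becomes their count
    args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
print(args.decoder_layers)  # 3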
Example #2
def transformer_lm_megatron_11b(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 3072)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072 * 6)
    args.decoder_layers = getattr(args, 'decoder_layers', 72)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 32)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.activation_fn = getattr(args, 'activation_fn', 'gelu')
    base_lm_architecture(args)
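Architecture functions like transformer_lm_megatron_11b only fill in defaults via getattr, so any value already set on args (for example from the command line) is preserved. A minimal sketch of that pattern, using a hypothetical my_lm_architecture rather than the fairseq function itself:

from argparse import Namespace

def my_lm_architecture(args):
    # set a value only when the user has not already supplied one
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 3072)
    args.decoder_layers = getattr(args, 'decoder_layers', 72)

args = Namespace(decoder_layers=12)   # user override
my_lm_architecture(args)
print(args.decoder_embed_dim)  # 3072 (default filled in)
print(args.decoder_layers)     # 12 (override preserved)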
Example #3
    def __init__(self, dictionary):
        # Build a default argument set and fill it with the standard
        # transformer LM defaults before constructing the embedding.
        args = Namespace()
        base_lm_architecture(args)
        args.decoder_layerdrop = 0
        args.max_target_positions = getattr(args, 'tokens_per_sample',
                                            DEFAULT_MAX_TARGET_POSITIONS)

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        emb = Embedding(num_embeddings, args.decoder_embed_dim, padding_idx)
        super().__init__(args, dictionary, emb, False)
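Because base_lm_architecture assigns every missing field with a getattr default, it can be called on an empty Namespace exactly as in the constructor above. A hedged sketch, assuming fairseq is installed and base_lm_architecture is importable from fairseq.models.transformer_lm:

from argparse import Namespace
from fairseq.models.transformer_lm import base_lm_architecture

args = Namespace()
base_lm_architecture(args)      # populates all decoder defaults
print(args.decoder_embed_dim)   # e.g. 512 for the base architecture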
Example #4
def transformer_numlm_base(args):
    args.add_number_token_attention_mask = getattr(
        args, "add_number_token_attention_mask", False)
    base_lm_architecture(args)
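In fairseq, architecture functions like this are normally attached to a registered model with the register_model_architecture decorator, which the scraped snippets above have stripped. A hedged sketch of that registration; the model name 'transformer_lm' is an assumption, since the snippet does not show which model transformer_numlm_base belongs to:

from fairseq.models import register_model_architecture
from fairseq.models.transformer_lm import base_lm_architecture

# Registration makes the architecture selectable via --arch on the command line.
@register_model_architecture('transformer_lm', 'transformer_numlm_base')
def transformer_numlm_base(args):
    args.add_number_token_attention_mask = getattr(
        args, "add_number_token_attention_mask", False)
    base_lm_architecture(args)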