    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.self_attn = MultiheadAttention(self.embed_dim,
                                            args.encoder_attention_heads,
                                            dropout=args.attention_dropout,
                                            self_attention=True)
        self.self_attn_layer_norm = NormSelect(args.encoder_norm_self,
                                               self.embed_dim,
                                               args.encoder_attention_heads,
                                               args.warmup_updates)
        # self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, 'activation_fn', 'relu'))
        self.activation_dropout = getattr(args, 'activation_dropout', 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, 'relu_dropout', 0)
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        # self.final_layer_norm = LayerNorm(self.embed_dim)
        self.final_layer_norm = NormSelect(args.encoder_norm_ff,
                                           self.embed_dim,
                                           args.encoder_attention_heads,
                                           args.warmup_updates)

        if args.encoder_spec_norm:
            self.self_attn.q_proj = spectral_norm(self.self_attn.q_proj)
            self.self_attn.v_proj = spectral_norm(self.self_attn.v_proj)
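Every example in this collection routes its normalization through NormSelect (and some route dropout through DropoutSelect); neither helper is shown in these snippets. Purely as an illustration of the call sites above, not the repository's actual implementation, a factory of roughly this shape would satisfy them (the name norm_select_sketch and the use of nn.GroupNorm for 'group' are assumptions):

import torch.nn as nn


def norm_select_sketch(norm_type, embed_dim, num_heads, warmup_updates=None):
    """Hypothetical stand-in for NormSelect: map a string flag to a norm module.

    The real NormSelect may support more variants (Example #4 also checks for
    'ammlayer') and may actually make use of `num_heads` and `warmup_updates`.
    """
    if norm_type == 'layer':
        return nn.LayerNorm(embed_dim)
    if norm_type == 'group':
        # one group per attention head; an assumption, not the repository's choice
        return nn.GroupNorm(num_heads, embed_dim)
    raise ValueError('unknown norm type: {}'.format(norm_type))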
Example #2
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([3]))

        self.dropout = args.dropout
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = PositionalEmbedding(
            args.max_source_positions,
            embed_dim,
            self.padding_idx,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layer_wise_attention = getattr(args, 'layer_wise_attention',
                                            False)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerEncoderLayer(args) for i in range(args.encoder_layers)
        ])

        if args.encoder_normalize_before:
            # self.layer_norm = LayerNorm(embed_dim)
            self.layer_norm = NormSelect(args.encoder_norm_ff, embed_dim,
                                         args.encoder_attention_heads)
        else:
            self.layer_norm = None
        self.dp = DropoutSelect(args.dropout_type, args.dropout_gama)
Example #3
    def __init__(
        self,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = 'relu',
        add_bias_kv: bool = False,
        add_zero_attn: bool = False,
        export: bool = False,
        encoder_norm_self: str = 'layer',
        encoder_norm_ff: str = 'layer',
        encoder_normalize_before: bool = False,
    ) -> None:

        super().__init__()
        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout

        # Initialize blocks
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=True
        )

        # layer norm associated with the self attention layer
        # self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)
        self.self_attn_layer_norm = NormSelect(encoder_norm_self, self.embedding_dim, num_attention_heads)
        self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        # layer norm associated with the position wise feed-forward NN
        # self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
        self.final_layer_norm = NormSelect(encoder_norm_ff, self.embedding_dim, num_attention_heads)
        self.encoder_normalize_before = encoder_normalize_before
Example #4
    def __init__(self, embed_dim, output_dim, activation_fn, weight=None,
                 num_attention_heads=1, encoder_norm_ff='layer'):
        super().__init__()
        self.dense = nn.Linear(embed_dim, embed_dim)
        self.activation_fn = utils.get_activation_fn(activation_fn)
        if encoder_norm_ff not in ['layer', 'group', 'ammlayer']:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = NormSelect(encoder_norm_ff, embed_dim,
                                         num_attention_heads)
        if weight is None:
            weight = nn.Linear(embed_dim, output_dim, bias=False).weight
        self.weight = weight
        self.bias = nn.Parameter(torch.zeros(output_dim))
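Example #4 only shows the constructor of this output head (dense projection, activation, norm, and a tied or free output weight); its forward pass is not part of the snippet. A minimal sketch of how these attributes would typically be combined, under that assumption, is:

import torch.nn.functional as F


def head_forward_sketch(self, features):
    """Hypothetical forward for the head built in Example #4 (not shown there)."""
    x = self.dense(features)        # embed_dim -> embed_dim
    x = self.activation_fn(x)
    x = self.layer_norm(x)
    # project onto the output vocabulary with the (possibly shared) weight matrix
    return F.linear(x, self.weight) + self.bias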
Example #5
    def __init__(
        self,
        padding_idx: int,
        vocab_size: int,
        num_encoder_layers: int = 6,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        layerdrop: float = 0.0,
        max_seq_len: int = 256,
        num_segments: int = 2,
        use_position_embeddings: bool = True,
        offset_positions_by_padding: bool = True,
        encoder_normalize_before: bool = False,
        apply_bert_init: bool = False,
        activation_fn: str = "relu",
        learned_pos_embedding: bool = True,
        add_bias_kv: bool = False,
        add_zero_attn: bool = False,
        embed_scale: float = None,
        freeze_embeddings: bool = False,
        n_trans_layers_to_freeze: int = 0,
        export: bool = False,
        encoder_norm_self: str = 'layer',
        encoder_norm_ff: str = 'layer',
    ) -> None:

        super().__init__()
        self.padding_idx = padding_idx
        self.vocab_size = vocab_size
        self.dropout = dropout
        self.layerdrop = layerdrop
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim
        self.num_segments = num_segments
        self.use_position_embeddings = use_position_embeddings
        self.apply_bert_init = apply_bert_init
        self.learned_pos_embedding = learned_pos_embedding

        self.embed_tokens = nn.Embedding(self.vocab_size, self.embedding_dim,
                                         self.padding_idx)
        self.embed_scale = embed_scale

        self.segment_embeddings = (nn.Embedding(
            self.num_segments, self.embedding_dim, padding_idx=None)
                                   if self.num_segments > 0 else None)

        self.embed_positions = (PositionalEmbedding(
            self.max_seq_len,
            self.embedding_dim,
            padding_idx=(
                self.padding_idx if offset_positions_by_padding else None),
            learned=self.learned_pos_embedding,
        ) if self.use_position_embeddings else None)

        self.layers = nn.ModuleList([
            TransformerSentenceEncoderLayer(
                embedding_dim=self.embedding_dim,
                ffn_embedding_dim=ffn_embedding_dim,
                num_attention_heads=num_attention_heads,
                dropout=self.dropout,
                attention_dropout=attention_dropout,
                activation_dropout=activation_dropout,
                activation_fn=activation_fn,
                add_bias_kv=add_bias_kv,
                add_zero_attn=add_zero_attn,
                export=export,
                encoder_norm_self=encoder_norm_self,
                encoder_norm_ff=encoder_norm_ff,
                encoder_normalize_before=encoder_normalize_before)
            for _ in range(num_encoder_layers)
        ])

        if encoder_normalize_before:
            # self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
            self.emb_layer_norm = NormSelect(encoder_norm_ff,
                                             self.embedding_dim,
                                             num_attention_heads)
        else:
            self.emb_layer_norm = None

        # Apply initialization of model params after building the model
        if self.apply_bert_init:
            self.apply(init_bert_params)

        def freeze_module_params(m):
            if m is not None:
                for p in m.parameters():
                    p.requires_grad = False

        if freeze_embeddings:
            freeze_module_params(self.embed_tokens)
            freeze_module_params(self.segment_embeddings)
            freeze_module_params(self.embed_positions)
            freeze_module_params(self.emb_layer_norm)

        for layer in range(n_trans_layers_to_freeze):
            freeze_module_params(self.layers[layer])
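Because Example #5 takes explicit keyword arguments rather than an args namespace, it can be instantiated directly. The values below are placeholders chosen only to show which arguments drive the norm selection; they are not recommended settings:

# Placeholder hyperparameters; only the keyword names come from Example #5.
encoder = TransformerSentenceEncoder(
    padding_idx=1,
    vocab_size=32000,
    num_encoder_layers=6,
    embedding_dim=768,
    ffn_embedding_dim=3072,
    num_attention_heads=8,
    encoder_norm_self='layer',      # passed through to NormSelect in each layer
    encoder_norm_ff='layer',
    encoder_normalize_before=True,  # also enables the NormSelect emb_layer_norm
)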
Example #6
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([3]))

        self.dropout = args.dropout
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.cross_self_attention = getattr(args, 'cross_self_attention',
                                            False)
        self.layer_wise_attention = getattr(args, 'layer_wise_attention',
                                            False)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out,
                            mean=0,
                            std=self.output_embed_dim**-0.5)

        if args.decoder_normalize_before and not getattr(
                args, 'no_decoder_final_norm', False):
            # self.layer_norm = LayerNorm(embed_dim)
            self.layer_norm = NormSelect(args.decoder_norm_ff, embed_dim,
                                         args.decoder_attention_heads)
        else:
            self.layer_norm = None
        self.dp = DropoutSelect(args.dropout_type, args.dropout_gama)
Example #7
    def __init__(self,
                 args,
                 no_encoder_attn=False,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.cross_self_attention = getattr(args, 'cross_self_attention',
                                            False)
        self.self_attn = MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not self.cross_self_attention,
        )
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, 'activation_fn', 'relu'))
        self.activation_dropout = getattr(args, 'activation_dropout', 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, 'relu_dropout', 0)
        self.normalize_before = args.decoder_normalize_before

        # use LayerNorm rather than FusedLayerNorm for exporting.
        # char_inputs can be used to determine this.
        # TODO: remove this once we update apex with the fix
        export = getattr(args, 'char_inputs', False)
        # self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.self_attn_layer_norm = NormSelect(args.decoder_norm_self,
                                               self.embed_dim,
                                               args.decoder_attention_heads,
                                               args.warmup_updates)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                kdim=getattr(args, 'encoder_embed_dim', None),
                vdim=getattr(args, 'encoder_embed_dim', None),
                dropout=args.attention_dropout,
                encoder_decoder_attention=True,
            )
            # self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
            self.encoder_attn_layer_norm = NormSelect(
                args.decoder_norm_self, self.embed_dim,
                args.decoder_attention_heads, args.warmup_updates)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        # self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.final_layer_norm = NormSelect(args.decoder_norm_ff,
                                           self.embed_dim,
                                           args.decoder_attention_heads,
                                           args.warmup_updates)

        self.need_attn = True

        self.onnx_trace = False
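The decoder layer above (instantiated as TransformerDecoderLayer in Example #6) reads all of its hyperparameters from an args namespace. The field names below are taken from the snippet; the values are arbitrary placeholders, not the repository's defaults:

from argparse import Namespace

# Field names mirror the attributes read in the snippet; values are placeholders.
args = Namespace(
    decoder_embed_dim=512,
    decoder_ffn_embed_dim=2048,
    decoder_attention_heads=8,
    dropout=0.1,
    attention_dropout=0.1,
    activation_fn='relu',
    activation_dropout=0.0,
    decoder_normalize_before=False,
    decoder_norm_self='layer',
    decoder_norm_ff='layer',
    warmup_updates=4000,
    encoder_embed_dim=512,
    cross_self_attention=False,
    char_inputs=False,
)
layer = TransformerDecoderLayer(args, no_encoder_attn=False)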