Example #1
    def __init__(self, args):
        super().__init__()
        self.encoders = []
        for i in range(4):
            self.encoders.append({})
            self.encoders[i]['embed_dim'] = args.encoder_embed_dim
            self.encoders[i]['quant_noise'] = getattr(args, "quant_noise_pq", 0)
            self.encoders[i]['quant_noise_block_size'] = getattr(args, "quant_noise_pq_block_size", 8)

            self.encoders[i]['self_attn'] = self.build_self_attention(self.encoders[i]['embed_dim'], i, args)
            self.encoders[i]['self_attn_layer_norm'] = LayerNorm(self.encoders[i]['embed_dim'])
            self.encoders[i]['dropout_module'] = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
            self.encoders[i]['activation_fn'] = utils.get_activation_fn(
                activation=getattr(args, "activation_fn", "relu")
            )
            activation_dropout_p = getattr(args, "activation_dropout", 0)
            if activation_dropout_p == 0:
                # for backwards compatibility with models that use args.relu_dropout
                activation_dropout_p = getattr(args, "relu_dropout", 0)
            self.encoders[i]['activation_dropout_module'] = FairseqDropout(
                float(activation_dropout_p), module_name=self.__class__.__name__
            )
            self.encoders[i]['normalize_before'] = args.encoder_normalize_before
            self.encoders[i]['fc1'] = self.build_fc1(
                self.encoders[i]['embed_dim'], args.encoder_ffn_embed_dim, self.encoders[i]['quant_noise'], self.encoders[i]['quant_noise_block_size']
            )
            self.encoders[i]['fc2'] = self.build_fc2(
                args.encoder_ffn_embed_dim, self.encoders[i]['embed_dim'], self.encoders[i]['quant_noise'], self.encoders[i]['quant_noise_block_size']
            )

            self.encoders[i]['final_layer_norm'] = LayerNorm(self.encoders[i]['embed_dim'])
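Note that the snippet above keeps its sub-layers in a plain Python list of dicts, so nn.Module never registers them: their parameters do not appear in state_dict() or parameters(), and they are not moved by .to(device). A minimal, registration-safe sketch (hypothetical class name and dimensions) using nn.ModuleList and nn.ModuleDict:

import torch.nn as nn

class StackedEncoderBlocks(nn.Module):
    # Hypothetical sketch: nn.ModuleList/nn.ModuleDict register their children,
    # so parameters are tracked, saved, and moved along with the parent module.
    def __init__(self, embed_dim=512, ffn_dim=2048, num_blocks=4):
        super().__init__()
        self.encoders = nn.ModuleList(
            nn.ModuleDict({
                "self_attn_layer_norm": nn.LayerNorm(embed_dim),
                "fc1": nn.Linear(embed_dim, ffn_dim),
                "fc2": nn.Linear(ffn_dim, embed_dim),
                "final_layer_norm": nn.LayerNorm(embed_dim),
            })
            for _ in range(num_blocks)
        )

With this layout, sum(p.numel() for p in StackedEncoderBlocks().parameters()) counts the parameters of every block, which it would not with plain lists and dicts.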
Example #2
    def __init__(self,
                 cfg,
                 no_encoder_attn=False,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super().__init__()
        self.embed_dim = cfg.decoder.embed_dim
        self.dropout_module = FairseqDropout(
            cfg.dropout, module_name=self.__class__.__name__)
        self.quant_noise = cfg.quant_noise.pq
        self.quant_noise_block_size = cfg.quant_noise.pq_block_size

        self.cross_self_attention = cfg.cross_self_attention

        self.self_attn = self.build_self_attention(
            self.embed_dim,
            cfg,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
        )

        self.activation_fn = utils.get_activation_fn(
            activation=cfg.activation_fn)
        activation_dropout_p = cfg.activation_dropout
        if activation_dropout_p == 0:
            # for backwards compatibility with models that use cfg.relu_dropout
            activation_dropout_p = cfg.relu_dropout or 0
        self.activation_dropout_module = FairseqDropout(
            float(activation_dropout_p), module_name=self.__class__.__name__)
        self.normalize_before = cfg.decoder.normalize_before

        self.self_attn_layer_norm = LayerNorm(self.embed_dim,
                                              export=cfg.export)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = self.build_encoder_attention(
                self.embed_dim, cfg)
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim,
                                                     export=cfg.export)

        self.fc1 = self.build_fc1(
            self.embed_dim,
            cfg.decoder.ffn_embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )
        self.fc2 = self.build_fc2(
            cfg.decoder.ffn_embed_dim,
            self.embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )

        self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
        self.need_attn = True

        self.onnx_trace = False
Example #3
    def __init__(
        self,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = "relu",
        export: bool = False,
        q_noise: float = 0.0,
        qn_block_size: int = 8,
        init_fn: Callable = None,
    ) -> None:
        super().__init__()

        if init_fn is not None:
            init_fn()

        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.activation_dropout_module = FairseqDropout(
            activation_dropout, module_name=self.__class__.__name__
        )

        # Initialize blocks
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.self_attn = self.build_self_attention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            self_attention=True,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)

        self.fc1 = self.build_fc1(
            self.embedding_dim,
            ffn_embedding_dim,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )
        self.fc2 = self.build_fc2(
            ffn_embedding_dim,
            self.embedding_dim,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
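The constructor signature matches fairseq's TransformerSentenceEncoderLayer (the layer used by RoBERTa-style sentence encoders). Assuming that stock class rather than a modified fork, a minimal usage sketch with fairseq installed:

import torch
from fairseq.modules import TransformerSentenceEncoderLayer

layer = TransformerSentenceEncoderLayer(
    embedding_dim=768,
    ffn_embedding_dim=3072,
    num_attention_heads=8,
    dropout=0.1,
    attention_dropout=0.1,
    activation_dropout=0.1,
    activation_fn="gelu",
)

# fairseq layers expect time-first input: (seq_len, batch, embedding_dim)
x = torch.randn(128, 2, 768)
out, attn = layer(x, self_attn_padding_mask=None)
print(out.shape)  # torch.Size([128, 2, 768])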
Example #4
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False

        self.enable_fairseq_version = True
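This constructor mirrors fairseq's MultiheadAttention (the snippet may come from a fork that adds the enable_fairseq_version flag). Assuming the stock class, a short self-attention usage sketch:

import torch
from fairseq.modules import MultiheadAttention

attn = MultiheadAttention(
    embed_dim=512,
    num_heads=8,
    dropout=0.1,
    self_attention=True,
)

# fairseq attention also uses the time-first layout: (tgt_len, batch, embed_dim)
x = torch.randn(10, 2, 512)
out, attn_weights = attn(query=x, key=x, value=x, need_weights=True)
print(out.shape)  # torch.Size([10, 2, 512])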
Example #5
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim, args.encoder_attention_heads,
            dropout=args.attention_dropout, self_attention=True,
        )
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        # replace with gelu
        # self.activation_fn = utils.get_activation_fn(
        #    activation=getattr(args, 'activation_fn', 'relu') or "relu"
        # )
        # activation_dropout_p = getattr(args, "activation_dropout", 0) or 0
        # if activation_dropout_p == 0:
        #    # for backwards compatibility with models that use args.relu_dropout
        #    activation_dropout_p = getattr(args, "relu_dropout", 0) or 0
        # self.activation_dropout_module = FairseqDropout(
        #    float(activation_dropout_p), module_name=self.__class__.__name__
        # )

        self.normalize_before = args.encoder_normalize_before
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.fc1 = nn.Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = nn.Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        # self.layer_norms = nn.ModuleList([BertLayerNorm(self.embed_dim) for i in range(2)])
        self.final_layer_norm = LayerNorm(self.embed_dim)
Example #6
    def __init__(
        self,
        input_size,
        kernel_size=1,
        padding_l=None,
        weight_softmax=False,
        num_heads=1,
        weight_dropout=0.,
        bias=False,
        renorm_padding=False,
        conv_bias=False,
        query_size=None,
    ):

        super(DynamicconvLayer, self).__init__()
        self.input_size = input_size
        self.query_size = input_size if query_size is None else query_size
        self.kernel_size = kernel_size
        self.padding_l = padding_l
        self.num_heads = num_heads
        self.weight_softmax = weight_softmax
        self.weight_dropout_module = FairseqDropout(
            weight_dropout, module_name=self.__class__.__name__)
        self.renorm_padding = renorm_padding
        self.bias = bias

        self.weight_linear = nn.Linear(input_size, num_heads * kernel_size,
                                       bias)
        if conv_bias:
            self.conv_bias = nn.Parameter(torch.Tensor(input_size))
        else:
            self.conv_bias = None
        self.reset_parameters()
Example #7
    def __init__(self, args, input_dim, middle_dim, output_dim):
        super(ClassificationLayer, self).__init__()
        self.fc_1 = nn.Linear(input_dim, middle_dim)
        self.fc_2 = nn.Linear(middle_dim, output_dim)
        self.dropout = FairseqDropout(args.dropout,
                                      module_name=self.__class__.__name__)
        self.grad_reversal_scaling_factor = args.grad_reversal_scaling_factor
Example #8
    def __init__(self, args):
        super().__init__()
        self.blockatt = args.use_module_communication == "True" or args.use_module_communication == "true"
        self.embed_dim = args.encoder_embed_dim
        self.quant_noise = getattr(args, "quant_noise_pq", 0)
        self.quant_noise_block_size = getattr(args,
                                              "quant_noise_pq_block_size", 8)

        print('encoder embed_dim', self.embed_dim)

        self.nb = args.num_modules
        self.norm_blocks = args.num_modules

        self.self_attn = self.build_self_attention(
            self.embed_dim, args
        )  #should divide embed_dim by nb.  Then raise embed_dim in args
        self.self_attn_layer_norm = NormLayer(
            self.norm_blocks, self.embed_dim // self.norm_blocks)
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, "activation_fn", "relu"))

        print("SETUP TRANSFORMER LAYER", 'blocks', self.nb)

        activation_dropout_p = getattr(args, "activation_dropout", 0)
        if activation_dropout_p == 0:
            # for backwards compatibility with models that use args.relu_dropout
            activation_dropout_p = getattr(args, "relu_dropout", 0)
        self.activation_dropout_module = FairseqDropout(
            float(activation_dropout_p), module_name=self.__class__.__name__)
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = self.build_fc1(self.embed_dim, args.encoder_ffn_embed_dim,
                                  self.quant_noise,
                                  self.quant_noise_block_size)
        self.fc2 = self.build_fc2(args.encoder_ffn_embed_dim, self.embed_dim,
                                  self.quant_noise,
                                  self.quant_noise_block_size)

        self.final_layer_norm = NormLayer(self.norm_blocks,
                                          self.embed_dim // self.norm_blocks)

        if self.blockatt:
            self.comm = Attention(args.encoder_attention_heads, self.nb,
                                  self.embed_dim)
            self.comm_norm = NormLayer(self.norm_blocks,
                                       self.embed_dim // self.norm_blocks)
Example #9
    def __init__(self, args):
        super().__init__(args)
        self.domain_dropout = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.domain_projection = self.build_domain_projection(
            self.embed_dim,
            self.embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )
Example #10
    def __init__(
        self,
        embed_dim,
        ffn_embed_dim,
        nhead,
        dropout,
        attn_dropout,
        activation_dropout,
        normalize_before=True,
        activation_fn="relu",
        quant_noise=0,
        quant_noise_block_size=8,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.quant_noise = quant_noise
        self.quant_noise_block_size = quant_noise_block_size
        self.self_attn = self.build_self_attention(self.embed_dim, nhead, attn_dropout)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.activation_fn = utils.get_activation_fn(activation=activation_fn)
        activation_dropout_p = activation_dropout
        self.activation_dropout_module = FairseqDropout(
            float(activation_dropout_p), module_name=self.__class__.__name__
        )
        self.normalize_before = normalize_before
        self.fc1 = self.build_fc1(
            self.embed_dim,
            ffn_embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )
        self.fc2 = self.build_fc2(
            ffn_embed_dim,
            self.embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )

        self.final_layer_norm = LayerNorm(self.embed_dim)
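The normalize_before flag that recurs in these constructors selects between pre-norm and post-norm residual blocks. A self-contained sketch of the two orderings (my own helper, not code from any of the repositories above):

import torch
import torch.nn as nn

def residual_block(x, sublayer, layer_norm, dropout, normalize_before):
    # Pre-norm: normalize the input before the sublayer, add the residual afterwards.
    # Post-norm (original Transformer): normalize the residual sum after the sublayer.
    residual = x
    if normalize_before:
        x = layer_norm(x)
    x = dropout(sublayer(x))
    x = residual + x
    if not normalize_before:
        x = layer_norm(x)
    return x

ffn = nn.Sequential(nn.Linear(512, 2048), nn.ReLU(), nn.Linear(2048, 512))
x = torch.randn(10, 2, 512)
y = residual_block(x, ffn, nn.LayerNorm(512), nn.Dropout(0.1), normalize_before=True)
print(y.shape)  # torch.Size([10, 2, 512])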
Example #11
    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        tie_kv=True,
        q_noise=0.0,
        qn_block_size=8,
        parallel=True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.parallel = parallel
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.pq_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                   q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.pc_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                   q_noise, qn_block_size)
        if tie_kv:
            self.c_proj = quant_noise(
                nn.Linear(embed_dim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            self.k_proj = self.v_proj = None
        else:
            self.k_proj = quant_noise(
                nn.Linear(embed_dim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            self.v_proj = quant_noise(
                nn.Linear(embed_dim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            self.c_proj = None

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)
        self.reset_parameters()

        self.onnx_trace = False
        self.tpu = False
Example #12
    def __init__(self, args, no_encoder_attn=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.self_attn = MultiheadAttention(
            self.embed_dim, args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout

        '''
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, "activation_fn", "gelu")
        )
        self.activation_dropout = getattr(args, "activation_dropout", 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, "relu_dropout", 0)
        '''

        self.normalize_before = args.decoder_normalize_before

        export = getattr(args, "char_inputs", False)
        # self.self_attn_layer_norm = BertLayerNorm(self.embed_dim)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim, args.decoder_attention_heads,
                dropout=args.attention_dropout,
            )
            # self.encoder_attn_layer_norm = BertLayerNorm(self.embed_dim)
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        self.fc1 = nn.Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = nn.Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        # self.final_layer_norm = BertLayerNorm(self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.need_attn = True

        self.onnx_trace = False
Example #13
    def __init__(
        self,
        cfg,
        return_fc=False,
        positional_embedding: Optional[RelativePositionalEmbedding] = None,
    ):
        super().__init__()
        self.cfg = cfg
        self.return_fc = return_fc
        self.embed_dim = cfg.encoder.embed_dim
        self.quant_noise = cfg.quant_noise.pq
        self.quant_noise_block_size = cfg.quant_noise.pq_block_size

        self.ffn1 = FeedForwardModule(
            input_feat=self.embed_dim,
            hidden_units=cfg.encoder.ffn_embed_dim,
            dropout1=cfg.activation_dropout,
            dropout2=cfg.dropout,
            activation_fn="swish",
        )

        self.self_attn = self.build_self_attention(
            self.embed_dim, cfg, positional_embedding=positional_embedding)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim,
                                              export=cfg.export)
        self.dropout_module = FairseqDropout(
            cfg.dropout, module_name=self.__class__.__name__)

        self.conv_module = ConvolutionModule(
            embed_dim=self.embed_dim,
            channels=self.embed_dim,
            depthwise_kernel_size=cfg.encoder.depthwise_conv_kernel_size,
            dropout=cfg.dropout,
            activation_fn="swish",
        )

        self.ffn2 = FeedForwardModule(
            input_feat=self.embed_dim,
            hidden_units=cfg.encoder.ffn_embed_dim,
            dropout1=cfg.activation_dropout,
            dropout2=cfg.dropout,
            activation_fn="swish",
        )

        self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
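This is a Conformer-style block: two feed-forward modules sandwich the self-attention and convolution modules, and the Conformer paper applies the two FFNs as half-step ("macaron") residuals before a final LayerNorm. A sketch of that ordering with placeholder modules (not the repository's actual forward):

import torch
import torch.nn as nn

class ConformerBlockOrderSketch(nn.Module):
    # Placeholder modules stand in for ffn1, self_attn, conv_module and ffn2;
    # only the residual ordering and the 0.5 scaling of the FFN branches are shown.
    def __init__(self, embed_dim=256):
        super().__init__()
        self.ffn1 = nn.Identity()
        self.self_attn = nn.Identity()
        self.conv_module = nn.Identity()
        self.ffn2 = nn.Identity()
        self.final_layer_norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        x = x + 0.5 * self.ffn1(x)       # half-step feed-forward residual
        x = x + self.self_attn(x)        # multi-head self-attention residual
        x = x + self.conv_module(x)      # convolution module residual
        x = x + 0.5 * self.ffn2(x)       # second half-step feed-forward residual
        return self.final_layer_norm(x)

x = torch.randn(10, 2, 256)
print(ConformerBlockOrderSketch()(x).shape)  # torch.Size([10, 2, 256])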
Example #14
    def __init__(self, args, dictionary, embed_tokens, embed_scale=None, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions
        self.eos_idx = dictionary.eos()
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(args.encoder_embed_dim) if embed_scale is None else embed_scale
        self.embed_positions = PositionalEmbedding(
            args.max_source_positions, embed_dim, self.padding_idx,
            # left_pad=left_pad,
            learned=args.encoder_learned_pos,
        ) if not args.no_enc_token_positional_embeddings else None
        self.embed_lengths = nn.Embedding(args.max_target_positions, embed_dim)
        nn.init.normal_(self.embed_lengths.weight, mean=0, std=0.02)

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerEncoderLayer(args)
            for i in range(args.encoder_layers)
        ])
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.encoder_normalize_before
        self.layer_norm = None
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)