Example #1
    def __init__(
        self,
        d_model,
        d_ff,
        cov_kernel_size,
        n_heads,
        slf_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        conv_dropout=0.0,
        macaron_style=True,
        conv_first=False,
        ffn_scale=0.5,
        conv_bias=True,
        relative_positional=True,
        activation="glu",
    ):
        super(ConformerEncoderBlock, self).__init__()

        self.conv_first = conv_first
        self.macaron_style = macaron_style
        self.ffn_scale = ffn_scale
        self.relative_positional = relative_positional
        self.residual_dropout = residual_dropout

        if self.macaron_style:
            self.pre_ffn = PositionwiseFeedForward(d_model,
                                                   d_ff,
                                                   ffn_dropout,
                                                   activation=activation)
            self.macaron_ffn_norm = nn.LayerNorm(d_model)

        if self.relative_positional:
            self.mha = MultiHeadedSelfAttentionWithRelPos(
                n_heads, d_model, slf_attn_dropout)
        else:
            self.mha = MultiHeadedSelfAttention(n_heads, d_model,
                                                slf_attn_dropout)
        self.mha_norm = nn.LayerNorm(d_model)

        self.conv = ConformerConvolutionModule(d_model, cov_kernel_size,
                                               conv_bias, conv_dropout)
        self.conv_norm = nn.LayerNorm(d_model)

        self.post_ffn = PositionwiseFeedForward(d_model,
                                                d_ff,
                                                ffn_dropout,
                                                activation=activation)
        self.post_ffn_norm = nn.LayerNorm(d_model)

        self.final_norm = nn.LayerNorm(d_model)
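
The forward pass is not shown above. As a point of reference, the macaron-style half-step FFN residual that pre_ffn and ffn_scale=0.5 set up usually looks like the sketch below; this is an assumed pattern with made-up sizes, nn.SiLU standing in for the block's "glu" activation, and the self-attention and convolution sub-layers omitted.

import torch
import torch.nn as nn

# Half-step, pre-norm FFN residual: x = x + ffn_scale * Dropout(FFN(LayerNorm(x)))
d_model, d_ff, ffn_scale = 256, 1024, 0.5
ffn = nn.Sequential(nn.Linear(d_model, d_ff), nn.SiLU(), nn.Linear(d_ff, d_model))
norm = nn.LayerNorm(d_model)
dropout = nn.Dropout(0.1)

x = torch.randn(8, 100, d_model)           # (batch, time, d_model)
x = x + ffn_scale * dropout(ffn(norm(x)))  # scaled residual branch
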
Example #2
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight,
                        mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight,
                        mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight,
                        mean=0,
                        std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(
            temperature=np.power(d_k, 0.5), attn_dropout=dropout)
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)
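
The hand-rolled std above is the Xavier/Glorot normal formula std = sqrt(2 / (fan_in + fan_out)), computed with the per-head fan-out d_k rather than the fused n_head * d_k. A quick, purely illustrative check with made-up sizes:

import numpy as np
import torch.nn as nn

d_model, d_k = 512, 64
w = nn.Linear(d_model, d_k, bias=False)
nn.init.normal_(w.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
# Both values print as roughly 0.059.
print(round(w.weight.std().item(), 4), round(float(np.sqrt(2.0 / (d_model + d_k))), 4))
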
Example #3
    def __init__(
        self,
        vocab_size,
        d_model=256,
        n_heads=4,
        d_ff=2048,
        memory_dim=256,
        n_blocks=6,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        src_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        activation="relu",
        normalize_before=True,
        concat_after=False,
        share_embedding=False,
    ):
        super(TransformerDecoder, self).__init__()

        self.decoder_type = "transformer"
        self.normalize_before = normalize_before
        self.relative_positional = False

        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList(
            [
                TransformerDecoderLayer(
                    n_heads,
                    d_model,
                    d_ff,
                    memory_dim,
                    slf_attn_dropout,
                    src_attn_dropout,
                    ffn_dropout,
                    residual_dropout,
                    normalize_before=normalize_before,
                    concat_after=concat_after,
                    relative_positional=False,
                    activation=activation,
                )
                for _ in range(n_blocks)
            ]
        )

        if self.normalize_before:
            self.after_norm = nn.LayerNorm(d_model)

        self.output_layer = nn.Linear(d_model, vocab_size)

        if share_embedding:
            assert self.embedding.weight.size() == self.output_layer.weight.size()
            self.output_layer.weight = self.embedding.weight
            logger.info("Tie the weights between the embedding and output layer.")
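
The tying step works because nn.Embedding(vocab_size, d_model).weight and nn.Linear(d_model, vocab_size).weight share the shape (vocab_size, d_model), which is exactly what the assert checks. A self-contained sketch of the same idea, with made-up sizes:

import torch.nn as nn

vocab_size, d_model = 1000, 256
embedding = nn.Embedding(vocab_size, d_model)
output_layer = nn.Linear(d_model, vocab_size)
assert embedding.weight.shape == output_layer.weight.shape  # both (1000, 256)
output_layer.weight = embedding.weight  # one parameter serves both roles
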
Example #4
    def __init__(
        self,
        n_heads,
        d_model,
        d_ff,
        memory_dim,
        slf_attn_dropout=0.0,
        src_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        normalize_before=False,
        concat_after=False,
        relative_positional=False,
        activation="relu",
    ):
        super(TransformerDecoderLayer, self).__init__()

        self.relative_positional = relative_positional

        if self.relative_positional:
            self.slf_attn = MultiHeadedSelfAttentionWithRelPos(
                n_heads, d_model, slf_attn_dropout
            )
        else:
            self.slf_attn = MultiHeadedSelfAttention(n_heads, d_model, slf_attn_dropout)
        self.src_attn = MultiHeadedCrossAttention(
            n_heads, d_model, memory_dim, src_attn_dropout
        )
        self.feed_forward = PositionwiseFeedForward(
            d_model, d_ff, ffn_dropout, activation
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(residual_dropout)
        self.dropout2 = nn.Dropout(residual_dropout)
        self.dropout3 = nn.Dropout(residual_dropout)

        self.normalize_before = normalize_before
        self.concat_after = concat_after

        if self.concat_after:
            self.concat_linear1 = nn.Linear(d_model * 2, d_model)
            self.concat_linear2 = nn.Linear(d_model * 2, d_model)
Example #5
    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 layer_norm_eps=1e-5,
                 dropout=0):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        self.dropout = nn.Dropout(dropout)
Example #6
    def __init__(
        self,
        input_size,
        output_size,
        in_channel=1,
        mid_channel=32,
        out_channel=128,
        kernel_size=[[3, 3], [3, 3]],
        stride=[2, 2],
        dropout=0.0,
        act_func_type="relu",
        front_end_layer_norm=False,
    ):
        super(ConvFrontEnd, self).__init__()

        self.kernel_size = kernel_size
        self.stride = stride
        self.output_size = output_size

        self.act_func_type = act_func_type
        self.front_end_layer_norm = front_end_layer_norm

        assert isinstance(self.kernel_size, list) and len(self.kernel_size) == 2
        assert isinstance(self.stride, list) and len(self.stride) == 2

        self.conv1 = Conv2dLayer(
            input_size=input_size,
            in_channel=in_channel,
            out_channel=mid_channel,
            kernel_size=self.kernel_size[0],
            stride=self.stride[0],
            dropout=dropout,
            batch_norm=False,
            residual=False,
            act_func_type=act_func_type,
        )

        self.conv2 = Conv2dLayer(
            self.conv1.output_size,
            in_channel=mid_channel,
            out_channel=out_channel,
            kernel_size=self.kernel_size[1],
            stride=self.stride[1],
            dropout=dropout,
            batch_norm=False,
            residual=False,
            act_func_type=act_func_type,
        )

        self.conv_output_size = self.conv2.output_size * self.conv2.out_channel
        self.output_layer = nn.Linear(self.conv_output_size, self.output_size)

        if self.front_end_layer_norm:
            self.layer_norm = nn.LayerNorm(self.output_size)
Example #7
    def __init__(self, config: Callable[..., None]) -> None:
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,
                                             config.hidden_size,
                                             padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)

        self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
Example #8
    def __init__(
        self,
        vocab_size,
        type_vocab_size,
        max_position_embeddings,
        hidden_size,
        hidden_dropout_prob,
        seq_length,
    ):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(hidden_dropout_prob, inplace=True)
        self.register_buffer(
            "position_ids", flow.arange(max_position_embeddings).unsqueeze(0)
        )
        self.seq_length = seq_length
Example #9
    def __init__(
        self,
        d_model=256,
        n_heads=4,
        d_ff=2048,
        n_blocks=6,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        normalize_before=False,
        concat_after=False,
        relative_positional=False,
        activation="relu",
    ):
        super(TransformerEncoder, self).__init__()

        self.normalize_before = normalize_before
        self.relative_positional = relative_positional

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                n_heads,
                d_model,
                d_ff,
                slf_attn_dropout,
                ffn_dropout,
                residual_dropout=residual_dropout,
                normalize_before=normalize_before,
                concat_after=concat_after,
                relative_positional=relative_positional,
                activation=activation,
            ) for _ in range(n_blocks)
        ])

        if self.normalize_before:
            self.norm = nn.LayerNorm(d_model)
Example #10
    def __init__(self, params):
        super(TransformerLanguageModel, self).__init__(params)

        self.model_type = "transformer_lm"
        self.normalize_before = False
        self.smoothing = params["smoothing"]
        self.vocab_size = params["vocab_size"]
        self.num_blocks = params["num_blocks"]

        self.embedding = nn.Embedding(self.vocab_size, params["d_model"])
        self.pos_embedding = PositionalEncoding(params["d_model"], 0.0)

        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                params["n_heads"],
                params["d_model"],
                params["d_ff"],
                slf_attn_dropout=0.0,
                ffn_dropout=0.0,
                residual_dropout=params["residual_dropout"],
                normalize_before=False,
                concat_after=False,
                activation="glu",
            ) for _ in range(self.num_blocks)
        ])

        if self.normalize_before:
            self.after_norm = nn.LayerNorm(params["d_model"])

        self.output_project = nn.Linear(params["d_model"], self.vocab_size)

        if params["share_embedding"]:
            self.output_project.weight = self.embedding.weight
            print("Share the weight of embedding to the output project layer!")

        self.crit = LabelSmoothingLoss(size=self.vocab_size,
                                       smoothing=self.smoothing,
                                       padding_idx=PAD)
Example #11
    def __init__(
        self,
        vocab_size,
        max_position_embeddings,
        type_vocab_size,
        hidden_size,
        layer_norm_eps=1e-5,
        dropout=0,
        pad_token_id=0,
        position_embedding_type="absolute",
    ):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size,
                                            hidden_size,
                                            padding_idx=pad_token_id)
        self.position_embeddings = nn.Embedding(max_position_embeddings,
                                                hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        self.dropout = nn.Dropout(dropout)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = position_embedding_type
        self.register_buffer(
            "position_ids",
            flow.arange(max_position_embeddings).expand((1, -1)))
        self.register_buffer(
            "token_type_ids",
            flow.zeros(
                self.position_ids.size(),
                dtype=flow.int64,
                device=self.position_ids.device,
            ),
            persistent=False,
        )

        self.padding_idx = pad_token_id
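
Neither embedding module above shows its forward. The conventional BERT-style pattern is to sum the word, position, and token-type embeddings, then apply LayerNorm and dropout. The sketch below is an assumed illustration with made-up sizes, written in PyTorch so it is self-contained (the snippets above use the flow namespace, apparently OneFlow, whose API mirrors torch).

import torch
import torch.nn as nn

vocab_size, max_pos, type_vocab, hidden = 1000, 512, 2, 256
word_embeddings = nn.Embedding(vocab_size, hidden, padding_idx=0)
position_embeddings = nn.Embedding(max_pos, hidden)
token_type_embeddings = nn.Embedding(type_vocab, hidden)
layer_norm = nn.LayerNorm(hidden, eps=1e-5)
dropout = nn.Dropout(0.1)

input_ids = torch.randint(0, vocab_size, (4, 16))  # (batch, seq_len)
position_ids = torch.arange(16).unsqueeze(0)       # (1, seq_len), broadcasts
token_type_ids = torch.zeros_like(input_ids)
embeddings = (word_embeddings(input_ids)
              + position_embeddings(position_ids)
              + token_type_embeddings(token_type_ids))
embeddings = dropout(layer_norm(embeddings))       # (4, 16, hidden)
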
Example #12
    def __init__(
        self,
        d_input,
        n_layers,
        n_head,
        d_k,
        d_v,
        d_model,
        d_inner,
        dropout=0.1,
        pe_maxlen=5000,
    ):
        super(Encoder, self).__init__()
        # parameters
        self.d_input = d_input
        self.n_layers = n_layers
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.d_inner = d_inner
        self.dropout_rate = dropout
        self.pe_maxlen = pe_maxlen

        # use linear transformation with layer norm to replace input embedding
        self.linear_in = nn.Linear(d_input, d_model)
        self.layer_norm_in = nn.LayerNorm(d_model)
        self.positional_encoding = PositionalEncoding(d_model,
                                                      max_len=pe_maxlen)
        self.dropout = nn.Dropout(dropout)

        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)
        ])
Example #13
    def __init__(self, d_in, d_hid, dropout=0.1):
        super(PositionwiseFeedForwardUseConv, self).__init__()
        self.w_1 = nn.Conv1d(d_in, d_hid, 1)
        self.w_2 = nn.Conv1d(d_hid, d_in, 1)
        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)
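
nn.Conv1d with kernel size 1 expects input shaped (batch, channels, time), so the forward for a module like this normally transposes around the two convolutions and finishes with a post-norm residual. A minimal sketch of that assumed pattern, with made-up sizes:

import torch
import torch.nn as nn

d_in, d_hid = 256, 1024
w_1, w_2 = nn.Conv1d(d_in, d_hid, 1), nn.Conv1d(d_hid, d_in, 1)
layer_norm, dropout = nn.LayerNorm(d_in), nn.Dropout(0.1)

x = torch.randn(8, 50, d_in)                   # (batch, time, d_in)
residual = x
out = w_2(torch.relu(w_1(x.transpose(1, 2))))  # convolve over the time axis
out = dropout(out.transpose(1, 2))             # back to (batch, time, d_in)
out = layer_norm(out + residual)
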
Example #14
    def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob=0.1):
        super().__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(hidden_dropout_prob, inplace=True)
Example #15
    def __init__(self, config: Callable[..., None]) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.transform_act_fn = get_activation(config.hidden_act)
        self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=1e-12)
Example #16
    def __init__(self, hidden_size, hidden_act=nn.GELU()):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.transform_act_fn = hidden_act
        self.LayerNorm = nn.LayerNorm(hidden_size)
Example #17
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model)
Example #18
    def __init__(self,
                 hidden_size: int,
                 hidden_dropout_prob: float = 0.1) -> None:
        super().__init__()
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(hidden_dropout_prob)
Example #19
    def __init__(self, hidden_size, layer_norm_eps=1e-5, dropout=0):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        self.dropout = nn.Dropout(dropout)
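
Examples #5, #14, and #19 are variants of the same BERT output sub-layer. Their forward (not shown above) conventionally projects, applies dropout, adds the residual carried in from the previous sub-layer, and normalizes last (post-norm). A minimal sketch of that assumed pattern, with made-up sizes:

import torch
import torch.nn as nn

hidden_size = 256
dense = nn.Linear(hidden_size, hidden_size)
layer_norm = nn.LayerNorm(hidden_size, eps=1e-5)
dropout = nn.Dropout(0.1)

hidden_states = torch.randn(4, 16, hidden_size)  # sub-layer output
input_tensor = torch.randn(4, 16, hidden_size)   # residual branch
out = layer_norm(dropout(dense(hidden_states)) + input_tensor)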