Example #1
    def __init__(self, d_model, d_ff, n_heads, kernel_size,
                 dropout, dropout_att, dropout_layer,
                 layer_norm_eps, ffn_activation, param_init,
                 ffn_bottleneck_dim=0):
        super(ConformerEncoderBlock, self).__init__()

        self.n_heads = n_heads
        self.fc_factor = 0.5  # scale for the two half-step (macaron) FFN residuals

        # first half position-wise feed-forward
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward1 = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                                 ffn_bottleneck_dim)

        # conv module
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.conv = ConformerConvBlock(d_model, kernel_size, param_init)

        # self-attention
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.self_attn = RelMHA(kdim=d_model,
                                qdim=d_model,
                                adim=d_model,
                                odim=d_model,
                                n_heads=n_heads,
                                dropout=dropout_att,
                                param_init=param_init)

        # second half position-wise feed-forward
        self.norm4 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward2 = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                                 ffn_bottleneck_dim)

        self.dropout = nn.Dropout(dropout)
        self.dropout_layer = dropout_layer  # probability to skip this layer (stochastic depth)
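
A minimal sketch of how the submodules declared in Example #1 might be composed into a macaron-style Conformer block. This is an assumption-based illustration, not the library's forward(): pre-norm residual connections are assumed, the two half-step FFNs are scaled by fc_factor, the sub-modules are applied in the order the constructor declares them, and the self-attention call signature is simplified.

def _conformer_forward_sketch(block, xs):
    # first half position-wise feed-forward (half-step residual, macaron style)
    xs = xs + block.fc_factor * block.dropout(block.feed_forward1(block.norm1(xs)))
    # convolution module
    xs = xs + block.dropout(block.conv(block.norm2(xs)))
    # relative multi-head self-attention (call signature simplified here)
    xs = xs + block.dropout(block.self_attn(block.norm3(xs)))
    # second half position-wise feed-forward (half-step residual)
    xs = xs + block.fc_factor * block.dropout(block.feed_forward2(block.norm4(xs)))
    return xs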
Example #2
    def __init__(self,
                 d_model,
                 d_ff,
                 n_heads,
                 kernel_size,
                 dropout,
                 dropout_att,
                 dropout_layer,
                 layer_norm_eps,
                 ffn_activation,
                 param_init,
                 pe_type,
                 clamp_len,
                 ffn_bottleneck_dim,
                 unidirectional,
                 normalization='layer_norm'):
        super(ConformerEncoderBlock, self).__init__()

        self.n_heads = n_heads
        self.fc_factor = 0.5

        # first half position-wise feed-forward
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward_macaron = FFN(d_model, d_ff, dropout, ffn_activation,
                                        param_init, ffn_bottleneck_dim)

        # self-attention
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.self_attn = RelMHA(kdim=d_model,
                                qdim=d_model,
                                adim=d_model,
                                odim=d_model,
                                n_heads=n_heads,
                                dropout=dropout_att,
                                param_init=param_init,
                                xl_like=(pe_type == 'relative_xl'),
                                clamp_len=clamp_len)

        # conv module
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.conv = ConformerConvBlock(d_model,
                                       kernel_size,
                                       param_init,
                                       normalization,
                                       causal=unidirectional)
        self.conv_context = kernel_size

        # second half position-wise feed-forward
        self.norm4 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation,
                                param_init, ffn_bottleneck_dim)

        self.norm5 = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self.dropout = nn.Dropout(dropout)
        self.dropout_layer = dropout_layer  # probability to skip
        logger.info('Stochastic depth prob: %.3f' % dropout_layer)

        self.reset_visualization()
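
Example #2 stores dropout_layer as a probability to skip the block and logs it as a stochastic-depth probability. The sketch below illustrates how such a value is commonly used; it is a hedged illustration, not neural_sp's implementation.

import random

def _maybe_apply_block(block, xs, training=True):
    # LayerDrop / stochastic depth: during training, skip the whole block
    # with probability dropout_layer and pass the input through unchanged.
    if training and random.random() < block.dropout_layer:
        return xs
    return block(xs)  # otherwise run the block normally (call simplified)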
Example #3
    def __init__(self,
                 d_model,
                 d_ff,
                 atype,
                 n_heads,
                 dropout,
                 dropout_att,
                 dropout_layer,
                 layer_norm_eps,
                 ffn_activation,
                 param_init,
                 memory_transformer=False):
        super(TransformerEncoderBlock, self).__init__()

        self.n_heads = n_heads
        self.memory_transformer = memory_transformer

        # self-attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        mha = RelMHA if memory_transformer else MHA
        self.self_attn = mha(kdim=d_model,
                             qdim=d_model,
                             adim=d_model,
                             n_heads=n_heads,
                             dropout=dropout_att,
                             param_init=param_init)

        # feed-forward
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation,
                                param_init)

        self.dropout = nn.Dropout(dropout)
        self.dropout_layer = dropout_layer
Example #4
    def __init__(self, d_model, d_ff, n_heads, dropout, dropout_att,
                 dropout_layer, layer_norm_eps, ffn_activation, param_init):
        super(SyncBidirTransformerDecoderBlock, self).__init__()

        self.n_heads = n_heads

        # synchronous bidirectional attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        from neural_sp.models.modules.sync_bidir_multihead_attention import SyncBidirMultiheadAttentionMechanism as SyncBidirMHA
        self.self_attn = SyncBidirMHA(kdim=d_model,
                                      qdim=d_model,
                                      adim=d_model,
                                      n_heads=n_heads,
                                      dropout=dropout_att,
                                      param_init=param_init)

        # attention over encoder stacks
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.src_attn = MHA(kdim=d_model,
                            qdim=d_model,
                            adim=d_model,
                            n_heads=n_heads,
                            dropout=dropout_att,
                            param_init=param_init)

        # feed-forward
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation,
                                param_init)

        self.dropout = nn.Dropout(p=dropout)
Example #5
    def __init__(self, d_model, d_ff, n_heads,
                 dropout, dropout_att, dropout_layer,
                 layer_norm_eps, ffn_activation, param_init,
                 relative_attention=False, ffn_bottleneck_dim=0):
        super(TransformerEncoderBlock, self).__init__()

        self.n_heads = n_heads
        self.relative_attention = relative_attention

        # self-attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        mha = RelMHA if relative_attention else MHA
        self.self_attn = mha(kdim=d_model,
                             qdim=d_model,
                             adim=d_model,
                             odim=d_model,
                             n_heads=n_heads,
                             dropout=dropout_att,
                             param_init=param_init)

        # position-wise feed-forward
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                                ffn_bottleneck_dim)

        self.dropout = nn.Dropout(dropout)
        self.dropout_layer = dropout_layer

        self.reset_visualization()
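
The plain Transformer encoder block of Example #5 follows the pre-norm layout (norm -> sub-layer -> dropout -> residual). The sketch below shows that composition under assumptions; it is not the library's forward(), the attention call signature is simplified, and RelMHA would additionally take relative positional embeddings.

def _encoder_forward_sketch(block, xs, xx_mask=None):
    # self-attention sub-layer: norm -> attention -> dropout -> residual
    residual = xs
    xs = block.norm1(xs)
    xs = residual + block.dropout(block.self_attn(xs, xs, xs, mask=xx_mask))  # simplified call
    # position-wise feed-forward sub-layer
    residual = xs
    xs = block.norm2(xs)
    xs = residual + block.dropout(block.feed_forward(xs))
    return xs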
Example #6
    def __init__(self, d_model, d_ff, n_heads, dropout, dropout_att,
                 dropout_layer, layer_norm_eps, ffn_activation, param_init,
                 pe_type, clamp_len, ffn_bottleneck_dim):
        super(TransformerEncoderBlock, self).__init__()

        self.n_heads = n_heads
        self.rel_attn = pe_type in ['relative', 'relative_xl']

        # self-attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        mha = RelMHA if self.rel_attn else MHA
        self.self_attn = mha(kdim=d_model,
                             qdim=d_model,
                             adim=d_model,
                             odim=d_model,
                             n_heads=n_heads,
                             dropout=dropout_att,
                             param_init=param_init,
                             xl_like=(pe_type == 'relative_xl'),
                             clamp_len=clamp_len)

        # position-wise feed-forward
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation,
                                param_init, ffn_bottleneck_dim)

        self.dropout = nn.Dropout(dropout)
        self.dropout_layer = dropout_layer

        self.reset_visualization()
Example #7
    def __init__(self, d_model, d_ff, n_heads, dropout, dropout_att,
                 dropout_residual, layer_norm_eps, ffn_activation, param_init):
        super(SyncBidirTransformerDecoderBlock, self).__init__()

        self.n_heads = n_heads

        # synchronous bidirectional attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.self_attn = SyncBidirMHA(kdim=d_model,
                                      qdim=d_model,
                                      adim=d_model,
                                      n_heads=n_heads,
                                      dropout=dropout_att,
                                      param_init=param_init)

        # attention over encoder stacks
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.src_attn = MHA(kdim=d_model,
                            qdim=d_model,
                            adim=d_model,
                            n_heads=n_heads,
                            dropout=dropout_att,
                            param_init=param_init)

        # feed-forward
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation,
                                param_init)

        self.dropout = nn.Dropout(p=dropout)
        self.death_rate = dropout_residual
Example #8
    def __init__(self, d_model, d_ff, n_heads, kernel_size,
                 dropout, dropout_att, dropout_layer,
                 layer_norm_eps, ffn_activation, param_init,
                 pe_type, clamp_len, ffn_bottleneck_dim, unidirectional,
                 normalization='batch_norm'):
        super(ConformerEncoderBlock_v2, self).__init__()

        self.n_heads = n_heads
        self.fc_factor = 0.5

        # first half position-wise feed-forward
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward_macaron = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                                        ffn_bottleneck_dim)

        # conv module
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.conv = ConformerConvBlock(d_model, kernel_size, param_init, normalization,
                                       causal=unidirectional)
        self.conv_context = kernel_size

        # self-attention
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.self_attn = MHA(kdim=d_model,
                             qdim=d_model,
                             adim=d_model,
                             odim=d_model,
                             n_heads=n_heads,
                             dropout=dropout_att,
                             param_init=param_init)

        # second half position-wise feed-forward
        self.norm4 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                                ffn_bottleneck_dim)

        self.norm5 = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self.dropout = nn.Dropout(dropout)
        self.dropout_layer = dropout_layer

        self.reset_visualization()
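
To make the constructor arguments of Example #8 concrete, here is a hedged instantiation; the hyperparameter values are illustrative (typical small-Conformer settings), not values taken from the library's recipes.

block = ConformerEncoderBlock_v2(
    d_model=256, d_ff=1024, n_heads=4, kernel_size=31,
    dropout=0.1, dropout_att=0.1, dropout_layer=0.0,
    layer_norm_eps=1e-12, ffn_activation='swish', param_init='xavier_uniform',
    pe_type='relative', clamp_len=-1, ffn_bottleneck_dim=0,
    unidirectional=False, normalization='batch_norm')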
Example #9
    def __init__(self,
                 d_model,
                 d_ff,
                 atype,
                 n_heads,
                 dropout,
                 dropout_att,
                 dropout_layer,
                 layer_norm_eps,
                 ffn_activation,
                 param_init,
                 src_tgt_attention=True,
                 memory_transformer=False,
                 mocha_chunk_size=0,
                 mocha_n_heads_mono=1,
                 mocha_n_heads_chunk=1,
                 mocha_init_r=2,
                 mocha_eps=1e-6,
                 mocha_std=1.0,
                 mocha_no_denominator=False,
                 mocha_1dconv=False,
                 dropout_head=0,
                 lm_fusion=False):
        super(TransformerDecoderBlock, self).__init__()

        self.atype = atype
        self.n_heads = n_heads
        self.src_tgt_attention = src_tgt_attention
        self.memory_transformer = memory_transformer

        # self-attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        mha = RelMHA if memory_transformer else MHA
        self.self_attn = mha(kdim=d_model,
                             qdim=d_model,
                             adim=d_model,
                             n_heads=n_heads,
                             dropout=dropout_att,
                             param_init=param_init)

        # attention over encoder stacks
        if src_tgt_attention:
            self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if 'mocha' in atype:
                self.n_heads = mocha_n_heads_mono
                from neural_sp.models.modules.mocha import MoChA
                self.src_attn = MoChA(kdim=d_model,
                                      qdim=d_model,
                                      adim=d_model,
                                      atype='scaled_dot',
                                      chunk_size=mocha_chunk_size,
                                      n_heads_mono=mocha_n_heads_mono,
                                      n_heads_chunk=mocha_n_heads_chunk,
                                      init_r=mocha_init_r,
                                      eps=mocha_eps,
                                      noise_std=mocha_std,
                                      no_denominator=mocha_no_denominator,
                                      conv1d=mocha_1dconv,
                                      dropout=dropout_att,
                                      dropout_head=dropout_head,
                                      param_init=param_init)
            else:
                self.src_attn = MHA(kdim=d_model,
                                    qdim=d_model,
                                    adim=d_model,
                                    n_heads=n_heads,
                                    dropout=dropout_att,
                                    param_init=param_init)

        # feed-forward
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation,
                                param_init)

        self.dropout = nn.Dropout(p=dropout)
        self.dropout_layer = dropout_layer

        # LM fusion
        self.lm_fusion = lm_fusion
        if lm_fusion:
            self.norm_lm = nn.LayerNorm(d_model, eps=layer_norm_eps)
            # NOTE: LM should be projected to d_model in advance
            self.linear_lm_feat = nn.Linear(d_model, d_model)
            self.linear_lm_gate = nn.Linear(d_model * 2, d_model)
            self.linear_lm_fusion = nn.Linear(d_model * 2, d_model)
            if 'attention' in lm_fusion:
                self.lm_attn = MHA(kdim=d_model,
                                   qdim=d_model,
                                   adim=d_model,
                                   n_heads=n_heads,
                                   dropout=dropout_att,
                                   param_init=param_init)
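
The decoder block of Example #9 stacks three sub-layers. The sketch below shows one plausible pre-norm composition; it is an assumption-based illustration, not the library's forward(): call signatures are simplified, the target and source masks are treated opaquely, and the MoChA and LM-fusion branches are omitted.

def _decoder_forward_sketch(block, ys, eouts, tgt_mask=None, src_mask=None):
    # masked self-attention over previously generated target positions
    residual = ys
    ys = block.norm1(ys)
    ys = residual + block.dropout(block.self_attn(ys, ys, ys, mask=tgt_mask))  # simplified call
    # attention over the encoder outputs (skipped for LM-style decoders)
    if block.src_tgt_attention:
        residual = ys
        ys = block.norm2(ys)
        ys = residual + block.dropout(block.src_attn(eouts, eouts, ys, mask=src_mask))  # simplified call
    # position-wise feed-forward
    residual = ys
    ys = block.norm3(ys)
    ys = residual + block.dropout(block.feed_forward(ys))
    return ys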
Example #10
    def __init__(self, d_model, d_ff, atype, n_heads,
                 dropout, dropout_att, dropout_layer,
                 layer_norm_eps, ffn_activation, param_init,
                 src_tgt_attention=True, memory_transformer=False,
                 mma_chunk_size=0, mma_n_heads_mono=1, mma_n_heads_chunk=1,
                 mma_init_r=2, mma_eps=1e-6, mma_std=1.0,
                 mma_no_denominator=False, mma_1dconv=False,
                 dropout_head=0, share_chunkwise_attention=False,
                 lm_fusion='', ffn_bottleneck_dim=0):

        super().__init__()

        self.atype = atype
        self.n_heads = n_heads
        self.src_tgt_attention = src_tgt_attention
        self.memory_transformer = memory_transformer

        # self-attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        mha = RelMHA if memory_transformer else MHA
        self.self_attn = mha(kdim=d_model,
                             qdim=d_model,
                             adim=d_model,
                             odim=d_model,
                             n_heads=n_heads,
                             dropout=dropout_att,
                             dropout_head=dropout_head,
                             param_init=param_init,
                             xl_like=memory_transformer)

        # attention over encoder stacks
        if src_tgt_attention:
            self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if 'mocha' in atype:
                self.n_heads = mma_n_heads_mono
                self.src_attn = MoChA(kdim=d_model,
                                      qdim=d_model,
                                      adim=d_model,
                                      odim=d_model,
                                      atype='scaled_dot',
                                      chunk_size=mma_chunk_size,
                                      n_heads_mono=mma_n_heads_mono,
                                      n_heads_chunk=mma_n_heads_chunk,
                                      init_r=mma_init_r,
                                      eps=mma_eps,
                                      noise_std=mma_std,
                                      no_denominator=mma_no_denominator,
                                      conv1d=mma_1dconv,
                                      dropout=dropout_att,
                                      dropout_head=dropout_head,
                                      param_init=param_init,
                                      share_chunkwise_attention=share_chunkwise_attention)
            else:
                self.src_attn = MHA(kdim=d_model,
                                    qdim=d_model,
                                    adim=d_model,
                                    odim=d_model,
                                    n_heads=n_heads,
                                    dropout=dropout_att,
                                    param_init=param_init)

        # position-wise feed-forward
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                                ffn_bottleneck_dim)

        self.dropout = nn.Dropout(p=dropout)
        self.dropout_layer = dropout_layer

        # LM fusion
        self.lm_fusion = lm_fusion
        if lm_fusion:
            self.norm_lm = nn.LayerNorm(d_model, eps=layer_norm_eps)
            # NOTE: LM should be projected to d_model in advance
            self.linear_lm_feat = nn.Linear(d_model, d_model)
            self.linear_lm_gate = nn.Linear(d_model * 2, d_model)
            self.linear_lm_fusion = nn.Linear(d_model * 2, d_model)
            if 'attention' in lm_fusion:
                self.lm_attn = MHA(kdim=d_model,
                                   qdim=d_model,
                                   adim=d_model,
                                   odim=d_model,
                                   n_heads=n_heads,
                                   dropout=dropout_att,
                                   param_init=param_init)

        self.reset_visualization()
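
The LM-fusion layers declared in Examples #9 and #10 suggest a gated (cold-fusion style) combination of the decoder state with an LM feature that has already been projected to d_model. The sketch below is an assumption-based illustration: the placement of norm_lm is a guess, and the optional 'attention' variant over LM states (lm_attn) is omitted.

import torch

def _lm_fusion_sketch(block, dec_state, lm_feat):
    dec_state = block.norm_lm(dec_state)        # where norm_lm is applied is an assumption
    lm_feat = block.linear_lm_feat(lm_feat)     # LM feature, already (B, L, d_model)
    gate = torch.sigmoid(
        block.linear_lm_gate(torch.cat([dec_state, lm_feat], dim=-1)))
    gated_lm_feat = gate * lm_feat              # element-wise gating of the LM feature
    return block.linear_lm_fusion(torch.cat([dec_state, gated_lm_feat], dim=-1))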