def __init__(self, d_model, d_ff, n_heads, kernel_size,
             dropout, dropout_att, dropout_layer,
             layer_norm_eps, ffn_activation, param_init,
             ffn_bottleneck_dim=0):
    super(ConformerEncoderBlock, self).__init__()
    self.n_heads = n_heads
    self.fc_factor = 0.5

    # first half position-wise feed-forward
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward1 = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                             ffn_bottleneck_dim)

    # conv module
    self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.conv = ConformerConvBlock(d_model, kernel_size, param_init)

    # self-attention
    self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.self_attn = RelMHA(kdim=d_model,
                            qdim=d_model,
                            adim=d_model,
                            odim=d_model,
                            n_heads=n_heads,
                            dropout=dropout_att,
                            param_init=param_init)

    # second half position-wise feed-forward
    self.norm4 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward2 = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                             ffn_bottleneck_dim)

    self.dropout = nn.Dropout(dropout)
    self.dropout_layer = dropout_layer
def __init__(self, d_model, d_ff, n_heads, kernel_size,
             dropout, dropout_att, dropout_layer,
             layer_norm_eps, ffn_activation, param_init,
             pe_type, clamp_len, ffn_bottleneck_dim, unidirectional,
             normalization='layer_norm'):
    super(ConformerEncoderBlock, self).__init__()
    self.n_heads = n_heads
    self.fc_factor = 0.5

    # first half position-wise feed-forward
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward_macaron = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                                    ffn_bottleneck_dim)

    # self-attention
    self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.self_attn = RelMHA(kdim=d_model,
                            qdim=d_model,
                            adim=d_model,
                            odim=d_model,
                            n_heads=n_heads,
                            dropout=dropout_att,
                            param_init=param_init,
                            xl_like=pe_type == 'relative_xl',
                            clamp_len=clamp_len)

    # conv module
    self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.conv = ConformerConvBlock(d_model, kernel_size, param_init,
                                   normalization, causal=unidirectional)
    self.conv_context = kernel_size

    # second half position-wise feed-forward
    self.norm4 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                            ffn_bottleneck_dim)

    self.norm5 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.dropout = nn.Dropout(dropout)
    self.dropout_layer = dropout_layer  # probability to skip
    logger.info('Stochastic depth prob: %.3f' % dropout_layer)

    self.reset_visualization()
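# --- Illustrative sketch (not part of the excerpt above) ---
# The module names in the Conformer constructors (feed_forward_macaron, self_attn,
# conv, feed_forward, norm5, fc_factor = 0.5) follow the macaron ordering of the
# Conformer paper: half-step FFN -> MHSA -> conv module -> half-step FFN -> final
# LayerNorm. The forward pass is not included in this excerpt, so the self-contained
# toy block below only illustrates that ordering, using nn.MultiheadAttention and a
# depthwise Conv1d as stand-ins for RelMHA and ConformerConvBlock. All hyperparameter
# values (d_model=256, kernel_size=31, ...) are illustrative assumptions.
import torch
import torch.nn as nn


class TinyConformerBlock(nn.Module):
    """Simplified Conformer block: FFN/2 -> MHSA -> conv -> FFN/2 -> LayerNorm."""

    def __init__(self, d_model=256, d_ff=1024, n_heads=4, kernel_size=31, dropout=0.1):
        super().__init__()
        self.fc_factor = 0.5  # half-step residual weight for the two FFNs
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn1 = nn.Sequential(nn.Linear(d_model, d_ff), nn.SiLU(),
                                  nn.Dropout(dropout), nn.Linear(d_ff, d_model))
        self.norm2 = nn.LayerNorm(d_model)
        self.mhsa = nn.MultiheadAttention(d_model, n_heads, dropout=dropout,
                                          batch_first=True)
        self.norm3 = nn.LayerNorm(d_model)
        # pointwise conv -> GLU -> depthwise conv -> activation -> pointwise conv
        self.conv = nn.Sequential(
            nn.Conv1d(d_model, 2 * d_model, 1),
            nn.GLU(dim=1),
            nn.Conv1d(d_model, d_model, kernel_size,
                      padding=kernel_size // 2, groups=d_model),
            nn.SiLU(),
            nn.Conv1d(d_model, d_model, 1),
        )
        self.norm4 = nn.LayerNorm(d_model)
        self.ffn2 = nn.Sequential(nn.Linear(d_model, d_ff), nn.SiLU(),
                                  nn.Dropout(dropout), nn.Linear(d_ff, d_model))
        self.norm5 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):  # x: [B, T, d_model]
        x = x + self.fc_factor * self.dropout(self.ffn1(self.norm1(x)))
        h = self.norm2(x)
        x = x + self.dropout(self.mhsa(h, h, h, need_weights=False)[0])
        x = x + self.dropout(
            self.conv(self.norm3(x).transpose(1, 2)).transpose(1, 2))
        x = x + self.fc_factor * self.dropout(self.ffn2(self.norm4(x)))
        return self.norm5(x)


xs = torch.randn(2, 50, 256)
print(TinyConformerBlock()(xs).shape)  # torch.Size([2, 50, 256])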
def __init__(self, d_model, d_ff, atype, n_heads,
             dropout, dropout_att, dropout_layer,
             layer_norm_eps, ffn_activation, param_init,
             memory_transformer=False):
    super(TransformerEncoderBlock, self).__init__()
    self.n_heads = n_heads
    self.memory_transformer = memory_transformer

    # self-attention
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    mha = RelMHA if memory_transformer else MHA
    self.self_attn = mha(kdim=d_model,
                         qdim=d_model,
                         adim=d_model,
                         n_heads=n_heads,
                         dropout=dropout_att,
                         param_init=param_init)

    # feed-forward
    self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init)

    self.dropout = nn.Dropout(dropout)
    self.dropout_layer = dropout_layer
def __init__(self, d_model, d_ff, n_heads,
             dropout, dropout_att, dropout_layer,
             layer_norm_eps, ffn_activation, param_init):
    super(SyncBidirTransformerDecoderBlock, self).__init__()
    self.n_heads = n_heads

    # synchronous bidirectional attention
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    from neural_sp.models.modules.sync_bidir_multihead_attention import \
        SyncBidirMultiheadAttentionMechanism as SyncBidirMHA
    self.self_attn = SyncBidirMHA(kdim=d_model,
                                  qdim=d_model,
                                  adim=d_model,
                                  n_heads=n_heads,
                                  dropout=dropout_att,
                                  param_init=param_init)

    # attention over encoder stacks
    self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.src_attn = MHA(kdim=d_model,
                        qdim=d_model,
                        adim=d_model,
                        n_heads=n_heads,
                        dropout=dropout_att,
                        param_init=param_init)

    # feed-forward
    self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init)

    self.dropout = nn.Dropout(p=dropout)
def __init__(self, d_model, d_ff, n_heads,
             dropout, dropout_att, dropout_layer,
             layer_norm_eps, ffn_activation, param_init,
             relative_attention=False, ffn_bottleneck_dim=0):
    super(TransformerEncoderBlock, self).__init__()
    self.n_heads = n_heads
    self.relative_attention = relative_attention

    # self-attention
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    mha = RelMHA if relative_attention else MHA
    self.self_attn = mha(kdim=d_model,
                         qdim=d_model,
                         adim=d_model,
                         odim=d_model,
                         n_heads=n_heads,
                         dropout=dropout_att,
                         param_init=param_init)

    # position-wise feed-forward
    self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                            ffn_bottleneck_dim)

    self.dropout = nn.Dropout(dropout)
    self.dropout_layer = dropout_layer

    self.reset_visualization()
def __init__(self, d_model, d_ff, n_heads,
             dropout, dropout_att, dropout_layer,
             layer_norm_eps, ffn_activation, param_init,
             pe_type, clamp_len, ffn_bottleneck_dim):
    super(TransformerEncoderBlock, self).__init__()
    self.n_heads = n_heads
    self.rel_attn = pe_type in ['relative', 'relative_xl']

    # self-attention
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    mha = RelMHA if self.rel_attn else MHA
    self.self_attn = mha(kdim=d_model,
                         qdim=d_model,
                         adim=d_model,
                         odim=d_model,
                         n_heads=n_heads,
                         dropout=dropout_att,
                         param_init=param_init,
                         xl_like=pe_type == 'relative_xl',
                         clamp_len=clamp_len)

    # position-wise feed-forward
    self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                            ffn_bottleneck_dim)

    self.dropout = nn.Dropout(dropout)
    self.dropout_layer = dropout_layer

    self.reset_visualization()
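# --- Illustrative sketch (not part of the excerpt above) ---
# Almost every constructor stores dropout_layer, described above as the
# "probability to skip" a block (stochastic depth / LayerDrop); one variant also
# logs it as the stochastic depth prob. The corresponding forward passes are not
# in this excerpt, so the self-contained snippet below is only a generic sketch of
# how such a skip probability is commonly applied around a residual branch during
# training. The helper name stochastic_depth and the pre-LN feed-forward branch
# are illustrative assumptions; some implementations additionally rescale the
# surviving branch.
import torch
import torch.nn as nn


def stochastic_depth(residual_fn, x, p_skip, training):
    """Skip the residual branch with probability p_skip at training time.

    residual_fn computes the branch output (e.g. self-attention or FFN).
    At inference the branch is always executed.
    """
    if training and p_skip > 0 and torch.rand(1).item() < p_skip:
        return x  # identity: the whole sub-block is dropped for this mini-batch
    return x + residual_fn(x)


# toy usage with a pre-LN feed-forward branch
d_model, d_ff, p_skip = 8, 32, 0.1
ffn = nn.Sequential(nn.LayerNorm(d_model), nn.Linear(d_model, d_ff),
                    nn.ReLU(), nn.Linear(d_ff, d_model))
xs = torch.randn(4, 10, d_model)
print(stochastic_depth(ffn, xs, p_skip, training=True).shape)  # torch.Size([4, 10, 8])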
def __init__(self, d_model, d_ff, n_heads,
             dropout, dropout_att, dropout_residual,
             layer_norm_eps, ffn_activation, param_init):
    super(SyncBidirTransformerDecoderBlock, self).__init__()
    self.n_heads = n_heads

    # synchronous bidirectional attention
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.self_attn = SyncBidirMHA(kdim=d_model,
                                  qdim=d_model,
                                  adim=d_model,
                                  n_heads=n_heads,
                                  dropout=dropout_att,
                                  param_init=param_init)

    # attention over encoder stacks
    self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.src_attn = MHA(kdim=d_model,
                        qdim=d_model,
                        adim=d_model,
                        n_heads=n_heads,
                        dropout=dropout_att,
                        param_init=param_init)

    # feed-forward
    self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init)

    self.dropout = nn.Dropout(p=dropout)
    self.death_rate = dropout_residual
def __init__(self, d_model, d_ff, n_heads, kernel_size,
             dropout, dropout_att, dropout_layer,
             layer_norm_eps, ffn_activation, param_init,
             pe_type, clamp_len, ffn_bottleneck_dim, unidirectional,
             normalization='batch_norm'):
    super(ConformerEncoderBlock_v2, self).__init__()
    self.n_heads = n_heads
    self.fc_factor = 0.5

    # first half position-wise feed-forward
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward_macaron = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                                    ffn_bottleneck_dim)

    # conv module
    self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.conv = ConformerConvBlock(d_model, kernel_size, param_init,
                                   normalization, causal=unidirectional)
    self.conv_context = kernel_size

    # self-attention
    self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.self_attn = MHA(kdim=d_model,
                         qdim=d_model,
                         adim=d_model,
                         odim=d_model,
                         n_heads=n_heads,
                         dropout=dropout_att,
                         param_init=param_init)

    # second half position-wise feed-forward
    self.norm4 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                            ffn_bottleneck_dim)

    self.norm5 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.dropout = nn.Dropout(dropout)
    self.dropout_layer = dropout_layer

    self.reset_visualization()
def __init__(self, d_model, d_ff, atype, n_heads,
             dropout, dropout_att, dropout_layer,
             layer_norm_eps, ffn_activation, param_init,
             src_tgt_attention=True, memory_transformer=False,
             mocha_chunk_size=0, mocha_n_heads_mono=1, mocha_n_heads_chunk=1,
             mocha_init_r=2, mocha_eps=1e-6, mocha_std=1.0,
             mocha_no_denominator=False, mocha_1dconv=False,
             dropout_head=0, lm_fusion=False):
    super(TransformerDecoderBlock, self).__init__()
    self.atype = atype
    self.n_heads = n_heads
    self.src_tgt_attention = src_tgt_attention
    self.memory_transformer = memory_transformer

    # self-attention
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    mha = RelMHA if memory_transformer else MHA
    self.self_attn = mha(kdim=d_model,
                         qdim=d_model,
                         adim=d_model,
                         n_heads=n_heads,
                         dropout=dropout_att,
                         param_init=param_init)

    # attention over encoder stacks
    if src_tgt_attention:
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        if 'mocha' in atype:
            self.n_heads = mocha_n_heads_mono
            from neural_sp.models.modules.mocha import MoChA
            self.src_attn = MoChA(kdim=d_model,
                                  qdim=d_model,
                                  adim=d_model,
                                  atype='scaled_dot',
                                  chunk_size=mocha_chunk_size,
                                  n_heads_mono=mocha_n_heads_mono,
                                  n_heads_chunk=mocha_n_heads_chunk,
                                  init_r=mocha_init_r,
                                  eps=mocha_eps,
                                  noise_std=mocha_std,
                                  no_denominator=mocha_no_denominator,
                                  conv1d=mocha_1dconv,
                                  dropout=dropout_att,
                                  dropout_head=dropout_head,
                                  param_init=param_init)
        else:
            self.src_attn = MHA(kdim=d_model,
                                qdim=d_model,
                                adim=d_model,
                                n_heads=n_heads,
                                dropout=dropout_att,
                                param_init=param_init)

    # feed-forward
    self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init)

    self.dropout = nn.Dropout(p=dropout)
    self.dropout_layer = dropout_layer

    # LM fusion
    self.lm_fusion = lm_fusion
    if lm_fusion:
        self.norm_lm = nn.LayerNorm(d_model, eps=layer_norm_eps)
        # NOTE: LM should be projected to d_model in advance
        self.linear_lm_feat = nn.Linear(d_model, d_model)
        self.linear_lm_gate = nn.Linear(d_model * 2, d_model)
        self.linear_lm_fusion = nn.Linear(d_model * 2, d_model)
        if 'attention' in lm_fusion:
            self.lm_attn = MHA(kdim=d_model,
                               qdim=d_model,
                               adim=d_model,
                               n_heads=n_heads,
                               dropout=dropout_att,
                               param_init=param_init)
def __init__(self, d_model, d_ff, atype, n_heads,
             dropout, dropout_att, dropout_layer,
             layer_norm_eps, ffn_activation, param_init,
             src_tgt_attention=True, memory_transformer=False,
             mma_chunk_size=0, mma_n_heads_mono=1, mma_n_heads_chunk=1,
             mma_init_r=2, mma_eps=1e-6, mma_std=1.0,
             mma_no_denominator=False, mma_1dconv=False,
             dropout_head=0, share_chunkwise_attention=False,
             lm_fusion='', ffn_bottleneck_dim=0):
    super().__init__()
    self.atype = atype
    self.n_heads = n_heads
    self.src_tgt_attention = src_tgt_attention
    self.memory_transformer = memory_transformer

    # self-attention
    self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    mha = RelMHA if memory_transformer else MHA
    self.self_attn = mha(kdim=d_model,
                         qdim=d_model,
                         adim=d_model,
                         odim=d_model,
                         n_heads=n_heads,
                         dropout=dropout_att,
                         dropout_head=dropout_head,
                         param_init=param_init,
                         xl_like=memory_transformer)

    # attention over encoder stacks
    if src_tgt_attention:
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        if 'mocha' in atype:
            self.n_heads = mma_n_heads_mono
            self.src_attn = MoChA(kdim=d_model,
                                  qdim=d_model,
                                  adim=d_model,
                                  odim=d_model,
                                  atype='scaled_dot',
                                  chunk_size=mma_chunk_size,
                                  n_heads_mono=mma_n_heads_mono,
                                  n_heads_chunk=mma_n_heads_chunk,
                                  init_r=mma_init_r,
                                  eps=mma_eps,
                                  noise_std=mma_std,
                                  no_denominator=mma_no_denominator,
                                  conv1d=mma_1dconv,
                                  dropout=dropout_att,
                                  dropout_head=dropout_head,
                                  param_init=param_init,
                                  share_chunkwise_attention=share_chunkwise_attention)
        else:
            self.src_attn = MHA(kdim=d_model,
                                qdim=d_model,
                                adim=d_model,
                                odim=d_model,
                                n_heads=n_heads,
                                dropout=dropout_att,
                                param_init=param_init)

    # position-wise feed-forward
    self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation, param_init,
                            ffn_bottleneck_dim)

    self.dropout = nn.Dropout(p=dropout)
    self.dropout_layer = dropout_layer

    # LM fusion
    self.lm_fusion = lm_fusion
    if lm_fusion:
        self.norm_lm = nn.LayerNorm(d_model, eps=layer_norm_eps)
        # NOTE: LM should be projected to d_model in advance
        self.linear_lm_feat = nn.Linear(d_model, d_model)
        self.linear_lm_gate = nn.Linear(d_model * 2, d_model)
        self.linear_lm_fusion = nn.Linear(d_model * 2, d_model)
        if 'attention' in lm_fusion:
            self.lm_attn = MHA(kdim=d_model,
                               qdim=d_model,
                               adim=d_model,
                               odim=d_model,
                               n_heads=n_heads,
                               dropout=dropout_att,
                               param_init=param_init)

    self.reset_visualization()
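# --- Illustrative sketch (not part of the excerpt above) ---
# Both decoder variants allocate the same LM-fusion projections (norm_lm,
# linear_lm_feat, linear_lm_gate, linear_lm_fusion). The fusion forward is not in
# this excerpt; the self-contained snippet below shows one common gated
# (cold-fusion-style) combination that is merely consistent with those layer
# shapes (d_model*2 -> d_model). The wiring, including applying norm_lm to the LM
# output, is an assumption for illustration, not the repository's exact code.
import torch
import torch.nn as nn

d_model = 4
norm_lm = nn.LayerNorm(d_model)
linear_lm_feat = nn.Linear(d_model, d_model)
linear_lm_gate = nn.Linear(d_model * 2, d_model)
linear_lm_fusion = nn.Linear(d_model * 2, d_model)


def fuse_lm(dec_feat, lm_out):
    """Gated fusion of a decoder state with an LM state already sized d_model."""
    lm_feat = linear_lm_feat(norm_lm(lm_out))                                  # [B, L, d_model]
    gate = torch.sigmoid(linear_lm_gate(torch.cat([dec_feat, lm_feat], dim=-1)))
    return linear_lm_fusion(torch.cat([dec_feat, gate * lm_feat], dim=-1))


dec_feat = torch.randn(2, 5, d_model)
lm_out = torch.randn(2, 5, d_model)  # NOTE: LM projected to d_model in advance
print(fuse_lm(dec_feat, lm_out).shape)  # torch.Size([2, 5, 4])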