def __init__(
        self,
        d_model: int = 512,             # dimension of model
        num_heads: int = 8,             # number of attention heads
        d_ff: int = 2048,               # dimension of feed forward network
        dropout_p: float = 0.3,         # probability of dropout
) -> None:
    super(TransformerEncoderLayer, self).__init__()
    self.attention_prenorm = LayerNorm(d_model)
    self.feed_forward_prenorm = LayerNorm(d_model)
    self.self_attention = MultiHeadAttention(d_model, num_heads)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_p)
def __init__(
        self,
        d_model: int = 512,             # dimension of model
        input_dim: int = 80,            # dimension of feature vector
        d_ff: int = 2048,               # dimension of feed forward network
        num_layers: int = 6,            # number of encoder layers
        num_heads: int = 8,             # number of attention heads
        ffnet_style: str = 'ff',        # style of feed forward network [ff, conv]
        dropout_p: float = 0.3,         # probability of dropout
        pad_id: int = 0,                # identification of pad token
) -> None:
    super(SpeechTransformerEncoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.pad_id = pad_id
    self.input_proj = Linear(input_dim, d_model)
    self.input_norm = LayerNorm(d_model)
    self.input_dropout = nn.Dropout(p=dropout_p)
    self.positional_encoding = PositionalEncoding(d_model)
    self.layers = nn.ModuleList([
        SpeechTransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p, ffnet_style)
        for _ in range(num_layers)
    ])
def __init__(
        self,
        input_dim: int,
        num_classes: int,
        rnn_type='gru',
        num_rnn_layers: int = 5,
        rnn_hidden_dim: int = 512,
        dropout_p: float = 0.1,
        bidirectional: bool = True,
        activation: str = 'hardtanh',
        device: torch.device = 'cuda',
):
    super(DeepSpeech2, self).__init__()
    self.device = device
    self.conv = DeepSpeech2Extractor(input_dim, activation=activation)
    self.rnn_layers = nn.ModuleList()
    # bidirectional RNNs concatenate both directions, doubling the output size
    rnn_output_size = rnn_hidden_dim << 1 if bidirectional else rnn_hidden_dim

    for idx in range(num_rnn_layers):
        self.rnn_layers.append(
            BNReluRNN(
                input_size=self.conv.get_output_dim() if idx == 0 else rnn_output_size,
                hidden_state_dim=rnn_hidden_dim,
                rnn_type=rnn_type,
                bidirectional=bidirectional,
                dropout_p=dropout_p,
            )
        )

    self.fc = nn.Sequential(
        LayerNorm(rnn_output_size),
        Linear(rnn_output_size, num_classes, bias=False),
    )
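As a quick check on the sizes above, here is a minimal instantiation sketch; the feature dimension and class count are illustrative assumptions, not values defined by the class. With bidirectional=True and rnn_hidden_dim=512, each BNReluRNN after the first receives a 1024-dimensional input, since rnn_hidden_dim << 1 doubles the hidden size.

# Hypothetical usage sketch: input_dim=80 (e.g. log-mel features) and
# num_classes=10 are assumed example values.
model = DeepSpeech2(
    input_dim=80,
    num_classes=10,
    rnn_type='gru',
    num_rnn_layers=5,
    rnn_hidden_dim=512,
    bidirectional=True,     # rnn_output_size = 512 << 1 = 1024
)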
def __init__(
        self,
        input_dim: int,                         # dimension of feature vector
        extractor: str = 'vgg',                 # convolutional extractor
        d_model: int = 512,                     # dimension of model
        d_ff: int = 2048,                       # dimension of feed forward network
        num_layers: int = 6,                    # number of encoder layers
        num_heads: int = 8,                     # number of attention heads
        dropout_p: float = 0.3,                 # probability of dropout
        joint_ctc_attention: bool = False,      # use CTC Loss & Cross Entropy joint learning
        num_classes: int = None,                # number of classes
) -> None:
    super(TransformerEncoder, self).__init__(
        input_dim=input_dim,
        extractor=extractor,
        d_model=d_model,
        num_classes=num_classes,
        dropout_p=dropout_p,
        joint_ctc_attention=joint_ctc_attention,
    )
    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.input_proj = Linear(self.conv_output_dim, d_model)
    self.input_layer_norm = LayerNorm(d_model)
    self.input_dropout = nn.Dropout(p=dropout_p)
    self.positional_encoding = PositionalEncoding(d_model)
    self.layers = nn.ModuleList([
        TransformerEncoderLayer(
            d_model=d_model,
            num_heads=num_heads,
            d_ff=d_ff,
            dropout_p=dropout_p,
        ) for _ in range(num_layers)
    ])
def __init__(
        self,
        encoder_dim: int = 512,
        num_attention_heads: int = 8,
        feed_forward_expansion_factor: int = 4,
        conv_expansion_factor: int = 2,
        feed_forward_dropout_p: float = 0.1,
        attention_dropout_p: float = 0.1,
        conv_dropout_p: float = 0.1,
        conv_kernel_size: int = 31,
        half_step_residual: bool = True,
        device: torch.device = 'cuda',
):
    super(ConformerBlock, self).__init__()
    self.device = device

    if half_step_residual:
        self.feed_forward_residual_factor = 0.5
    else:
        self.feed_forward_residual_factor = 1

    self.sequential = nn.Sequential(
        ResidualConnectionModule(
            module=FeedForwardModule(
                encoder_dim=encoder_dim,
                expansion_factor=feed_forward_expansion_factor,
                dropout_p=feed_forward_dropout_p,
                device=device,
            ),
            module_factor=self.feed_forward_residual_factor,
        ),
        ResidualConnectionModule(
            module=MultiHeadedSelfAttentionModule(
                d_model=encoder_dim,
                num_heads=num_attention_heads,
                dropout_p=attention_dropout_p,
            ),
        ),
        ResidualConnectionModule(
            module=ConformerConvModule(
                in_channels=encoder_dim,
                kernel_size=conv_kernel_size,
                expansion_factor=conv_expansion_factor,
                dropout_p=conv_dropout_p,
            ),
        ),
        ResidualConnectionModule(
            module=FeedForwardModule(
                encoder_dim=encoder_dim,
                expansion_factor=feed_forward_expansion_factor,
                dropout_p=feed_forward_dropout_p,
            ),
            module_factor=self.feed_forward_residual_factor,
        ),
        LayerNorm(encoder_dim),
    )
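The block follows the Macaron-style layout visible in the nn.Sequential: a half-weighted feed-forward module, self-attention, convolution, a second half-weighted feed-forward module, and a final LayerNorm. A minimal instantiation sketch follows; the smaller encoder_dim is an assumed example value, not a default.

# half_step_residual=True sets feed_forward_residual_factor to 0.5,
# so each FeedForwardModule contributes half of a full residual step.
block = ConformerBlock(
    encoder_dim=256,            # assumed example value
    num_attention_heads=4,
    conv_kernel_size=31,
    half_step_residual=True,
)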
def __init__(self, d_model: int, num_heads: int, dropout_p: float = 0.1, device: torch.device = 'cuda'):
    super(MultiHeadedSelfAttentionModule, self).__init__()
    self.positional_encoding = PositionalEncoding(d_model)
    self.layer_norm = LayerNorm(d_model)
    self.attention = RelativeMultiHeadAttention(d_model, num_heads, dropout_p)
    self.dropout = nn.Dropout(p=dropout_p)
    self.device = device
def __init__(
        self,
        encoder_dim: int = 512,
        expansion_factor: int = 4,
        dropout_p: float = 0.1,
        device: torch.device = 'cuda',
) -> None:
    super(FeedForwardModule, self).__init__()
    self.device = device
    self.sequential = nn.Sequential(
        LayerNorm(encoder_dim),
        Linear(encoder_dim, encoder_dim * expansion_factor, bias=True),
        Swish(),
        nn.Dropout(p=dropout_p),
        Linear(encoder_dim * expansion_factor, encoder_dim, bias=True),
        nn.Dropout(p=dropout_p),
    )
def __init__(
        self,
        d_model: int = 512,
        input_dim: int = 80,
        d_ff: int = 2048,
        num_layers: int = 6,
        num_heads: int = 8,
        ffnet_style: str = 'ff',
        dropout_p: float = 0.3,
        pad_id: int = 0,
) -> None:
    super(TransformerEncoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.pad_id = pad_id
    self.input_proj = Linear(input_dim, d_model)
    self.input_layer_norm = LayerNorm(d_model)
    self.input_dropout = nn.Dropout(p=dropout_p)
    self.pos_encoding = PositionalEncoding(d_model)
    self.layers = nn.ModuleList([
        TransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p, ffnet_style)
        for _ in range(num_layers)
    ])
def __init__(
        self,
        in_channels: int,
        kernel_size: int = 31,
        expansion_factor: int = 2,
        dropout_p: float = 0.1,
        device: torch.device = 'cuda',
) -> None:
    super(ConformerConvModule, self).__init__()
    assert (kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
    assert expansion_factor == 2, "Currently, only supports expansion_factor 2"

    self.device = device
    self.sequential = nn.Sequential(
        LayerNorm(in_channels),
        Transpose(shape=(1, 2)),
        PointwiseConv1d(in_channels, in_channels * expansion_factor, stride=1, padding=0, bias=True),
        GLU(dim=1),
        DepthwiseConv1d(in_channels, in_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2),
        nn.BatchNorm1d(in_channels),
        Swish(),
        PointwiseConv1d(in_channels, in_channels, stride=1, padding=0, bias=True),
        nn.Dropout(p=dropout_p),
    )
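The two asserts pin down the shape bookkeeping: with stride 1 and padding = (kernel_size - 1) // 2, the depthwise convolution preserves the sequence length, and expansion_factor = 2 means the first pointwise convolution doubles the channels so that GLU(dim=1) can halve them back. A small arithmetic sketch, assuming the standard Conv1d output-length formula and an illustrative sequence length:

# For stride=1 and dilation=1: L_out = L_in + 2 * padding - (kernel_size - 1)
kernel_size = 31
padding = (kernel_size - 1) // 2        # = 15
seq_len = 100                           # assumed example length
out_len = seq_len + 2 * padding - (kernel_size - 1)
assert out_len == seq_len               # 'SAME' padding keeps the length unchanged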
def __init__(
        self,
        num_classes: int,               # number of classes
        d_model: int = 512,             # dimension of model
        d_ff: int = 512,                # dimension of feed forward network
        num_layers: int = 6,            # number of decoder layers
        num_heads: int = 8,             # number of attention heads
        dropout_p: float = 0.3,         # probability of dropout
        pad_id: int = 0,                # identification of pad token
        sos_id: int = 1,                # identification of start of sentence token
        eos_id: int = 2,                # identification of end of sentence token
        max_length: int = 400,          # max length of decoding
) -> None:
    super(TransformerDecoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.max_length = max_length
    self.pad_id = pad_id
    self.sos_id = sos_id
    self.eos_id = eos_id
    self.embedding = Embedding(num_classes, pad_id, d_model)
    self.positional_encoding = PositionalEncoding(d_model)
    self.input_dropout = nn.Dropout(p=dropout_p)
    self.layers = nn.ModuleList([
        TransformerDecoderLayer(
            d_model=d_model,
            num_heads=num_heads,
            d_ff=d_ff,
            dropout_p=dropout_p,
        ) for _ in range(num_layers)
    ])
    self.fc = nn.Sequential(
        LayerNorm(d_model),
        Linear(d_model, num_classes, bias=False),
    )
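Note that d_ff defaults to 512 here, unlike the 2048 used by the encoder classes above. A minimal construction sketch, assuming an illustrative vocabulary size of 2000 classes and the default pad/sos/eos ids of 0/1/2:

# num_classes=2000 is an assumed example vocabulary size; the final fc
# projects d_model-dimensional decoder states to num_classes logits.
decoder = TransformerDecoder(
    num_classes=2000,
    d_model=512,
    d_ff=512,
    num_layers=6,
    num_heads=8,
    pad_id=0,
    sos_id=1,
    eos_id=2,
    max_length=400,
)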
def __init__(self, sublayer: nn.Module, d_model: int = 512) -> None:
    super(AddNorm, self).__init__()
    self.sublayer = sublayer
    self.layer_norm = LayerNorm(d_model)
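A usage sketch for this wrapper, assuming only what the constructor shows (it holds a sublayer plus a LayerNorm; the residual-add behavior is implied by the class name, since only __init__ is listed here). PositionwiseFeedForward is reused from the encoder layer above, called positionally as it is there:

# Wrap a feed-forward sublayer in AddNorm; the forward pass (residual add
# followed by layer norm) is assumed from the class name, not shown above.
feed_forward = PositionwiseFeedForward(512, 2048, 0.3)
add_norm = AddNorm(sublayer=feed_forward, d_model=512)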