Example 1
 def __init__(
         self,
         d_model: int = 512,  # dimension of model
         num_heads: int = 8,  # number of attention heads
         d_ff: int = 2048,  # dimension of feed forward network
         dropout_p: float = 0.3,  # probability of dropout
 ) -> None:
     super(TransformerEncoderLayer, self).__init__()
     self.attention_prenorm = LayerNorm(d_model)
     self.feed_forward_prenorm = LayerNorm(d_model)
     self.self_attention = MultiHeadAttention(d_model, num_heads)
     self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_p)
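Example 1 only shows how the layer is constructed. The sketch below illustrates the pre-norm residual wiring such a layer typically performs in its forward pass, using plain torch.nn modules; MultiHeadAttention and PositionwiseFeedForward above are library classes whose forward signatures are not shown here, so this is an assumption rather than the library's own code.

import torch
import torch.nn as nn

class PreNormEncoderLayerSketch(nn.Module):
    """Illustrative stand-in for the TransformerEncoderLayer above (pre-norm residual wiring)."""

    def __init__(self, d_model: int = 512, num_heads: int = 8, d_ff: int = 2048, dropout_p: float = 0.3) -> None:
        super().__init__()
        self.attention_prenorm = nn.LayerNorm(d_model)
        self.feed_forward_prenorm = nn.LayerNorm(d_model)
        self.self_attention = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(d_ff, d_model),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = self.attention_prenorm(x)               # normalize before attention (pre-norm)
        x, _ = self.self_attention(x, x, x)
        x = x + residual                            # residual connection around attention
        residual = x
        x = self.feed_forward_prenorm(x)            # normalize before the feed-forward block
        return self.feed_forward(x) + residual      # residual connection around the feed-forward block

x = torch.randn(2, 50, 512)                         # (batch, time, d_model)
print(PreNormEncoderLayerSketch()(x).shape)         # torch.Size([2, 50, 512])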
Example 2
 def __init__(
         self,
         d_model: int = 512,  # dimension of model
         input_dim: int = 80,  # dimension of feature vector
         d_ff: int = 2048,  # dimension of feed forward network
         num_layers: int = 6,  # number of encoder layers
         num_heads: int = 8,  # number of attention heads
         ffnet_style: str = 'ff',  # style of feed forward network [ff, conv]
         dropout_p: float = 0.3,  # probability of dropout
         pad_id: int = 0,  # identification of pad token
 ) -> None:
     super(SpeechTransformerEncoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.pad_id = pad_id
     self.input_proj = Linear(input_dim, d_model)
     self.input_norm = LayerNorm(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.positional_encoding = PositionalEncoding(d_model)
     self.layers = nn.ModuleList([
         SpeechTransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p,
                                       ffnet_style)
         for _ in range(num_layers)
     ])
Example 3
    def __init__(
        self,
        input_dim: int,
        num_classes: int,
        rnn_type: str = 'gru',
        num_rnn_layers: int = 5,
        rnn_hidden_dim: int = 512,
        dropout_p: float = 0.1,
        bidirectional: bool = True,
        activation: str = 'hardtanh',
        device: torch.device = 'cuda',
    ):
        super(DeepSpeech2, self).__init__()
        self.device = device
        self.conv = DeepSpeech2Extractor(input_dim, activation=activation)
        self.rnn_layers = nn.ModuleList()
        # bidirectional RNNs concatenate forward and backward states, doubling the output size
        rnn_output_size = rnn_hidden_dim << 1 if bidirectional else rnn_hidden_dim

        for idx in range(num_rnn_layers):
            self.rnn_layers.append(
                BNReluRNN(
                    input_size=self.conv.get_output_dim()
                    if idx == 0 else rnn_output_size,
                    hidden_state_dim=rnn_hidden_dim,
                    rnn_type=rnn_type,
                    bidirectional=bidirectional,
                    dropout_p=dropout_p,
                ))

        self.fc = nn.Sequential(
            LayerNorm(rnn_output_size),
            Linear(rnn_output_size, num_classes, bias=False),
        )
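The BNReluRNN layers stacked in Example 3 are library classes. The sketch below rebuilds that pattern (BatchNorm over features, an activation, then a bidirectional recurrent layer) with plain torch.nn modules; names and arguments mirror the call above, but the implementation is an assumption, not the library's own.

import torch
import torch.nn as nn

class BNReluRNNSketch(nn.Module):
    """Illustrative BatchNorm -> ReLU -> (bi)directional RNN layer, as stacked in DeepSpeech2 above."""

    def __init__(self, input_size: int, hidden_state_dim: int = 512,
                 rnn_type: str = 'gru', bidirectional: bool = True, dropout_p: float = 0.1) -> None:
        super().__init__()
        rnn_cls = {'gru': nn.GRU, 'lstm': nn.LSTM, 'rnn': nn.RNN}[rnn_type]
        self.batch_norm = nn.BatchNorm1d(input_size)
        self.rnn = rnn_cls(input_size, hidden_state_dim, batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, features); BatchNorm1d expects (batch, features, time)
        x = torch.relu(self.batch_norm(x.transpose(1, 2)).transpose(1, 2))
        outputs, _ = self.rnn(x)
        return self.dropout(outputs)

x = torch.randn(4, 120, 640)
print(BNReluRNNSketch(input_size=640)(x).shape)   # torch.Size([4, 120, 1024]) when bidirectional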
Example 4
 def __init__(
         self,
         input_dim: int,  # dimension of feature vector
         extractor: str = 'vgg',  # convolutional extractor
         d_model: int = 512,  # dimension of model
         d_ff: int = 2048,  # dimension of feed forward network
         num_layers: int = 6,  # number of encoder layers
         num_heads: int = 8,  # number of attention heads
         dropout_p: float = 0.3,  # probability of dropout
         joint_ctc_attention: bool = False,  # use CTC loss & cross entropy joint learning
         num_classes: int = None,  # number of classes
 ) -> None:
     super(TransformerEncoder,
           self).__init__(input_dim=input_dim,
                          extractor=extractor,
                          d_model=d_model,
                          num_classes=num_classes,
                          dropout_p=dropout_p,
                          joint_ctc_attention=joint_ctc_attention)
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.input_proj = Linear(self.conv_output_dim, d_model)
     self.input_layer_norm = LayerNorm(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.positional_encoding = PositionalEncoding(d_model)
     self.layers = nn.ModuleList([
         TransformerEncoderLayer(
             d_model=d_model,
             num_heads=num_heads,
             d_ff=d_ff,
             dropout_p=dropout_p,
         ) for _ in range(num_layers)
     ])
Example 5
    def __init__(
            self,
            encoder_dim: int = 512,
            num_attention_heads: int = 8,
            feed_forward_expansion_factor: int = 4,
            conv_expansion_factor: int = 2,
            feed_forward_dropout_p: float = 0.1,
            attention_dropout_p: float = 0.1,
            conv_dropout_p: float = 0.1,
            conv_kernel_size: int = 31,
            half_step_residual: bool = True,
            device: torch.device = 'cuda',
    ):
        super(ConformerBlock, self).__init__()
        self.device = device
        # Macaron-style half-step residual: each feed-forward module contributes half of its output
        if half_step_residual:
            self.feed_forward_residual_factor = 0.5
        else:
            self.feed_forward_residual_factor = 1

        self.sequential = nn.Sequential(
            ResidualConnectionModule(
                module=FeedForwardModule(
                    encoder_dim=encoder_dim,
                    expansion_factor=feed_forward_expansion_factor,
                    dropout_p=feed_forward_dropout_p,
                    device=device,
                ),
                module_factor=self.feed_forward_residual_factor,
            ),
            ResidualConnectionModule(
                module=MultiHeadedSelfAttentionModule(
                    d_model=encoder_dim,
                    num_heads=num_attention_heads,
                    dropout_p=attention_dropout_p,
                ),
            ),
            ResidualConnectionModule(
                module=ConformerConvModule(
                    in_channels=encoder_dim,
                    kernel_size=conv_kernel_size,
                    expansion_factor=conv_expansion_factor,
                    dropout_p=conv_dropout_p,
                ),
            ),
            ResidualConnectionModule(
                module=FeedForwardModule(
                    encoder_dim=encoder_dim,
                    expansion_factor=feed_forward_expansion_factor,
                    dropout_p=feed_forward_dropout_p,
                ),
                module_factor=self.feed_forward_residual_factor,
            ),
            LayerNorm(encoder_dim),
        )
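The ResidualConnectionModule used throughout Example 5 scales the wrapped module's output before adding the input back; with module_factor=0.5 this gives the Macaron-style half-step residual for the two feed-forward modules. The sketch below shows that behavior with plain torch.nn layers; it follows the standard Conformer formulation and is an assumption about the library class, not its confirmed source.

import torch
import torch.nn as nn

class ResidualConnectionSketch(nn.Module):
    """output = module(x) * module_factor + x * input_factor"""

    def __init__(self, module: nn.Module, module_factor: float = 1.0, input_factor: float = 1.0) -> None:
        super().__init__()
        self.module = module
        self.module_factor = module_factor
        self.input_factor = input_factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.module(x) * self.module_factor + x * self.input_factor

# Half-step residual as used for the two feed-forward modules in the Conformer block
ffn = nn.Sequential(nn.LayerNorm(512), nn.Linear(512, 2048), nn.SiLU(), nn.Linear(2048, 512))
block = ResidualConnectionSketch(ffn, module_factor=0.5)
print(block(torch.randn(2, 100, 512)).shape)      # torch.Size([2, 100, 512])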
Example 6
 def __init__(self,
              d_model: int,
              num_heads: int,
              dropout_p: float = 0.1,
              device: torch.device = 'cuda'):
     super(MultiHeadedSelfAttentionModule, self).__init__()
     self.positional_encoding = PositionalEncoding(d_model)
     self.layer_norm = LayerNorm(d_model)
     self.attention = RelativeMultiHeadAttention(d_model, num_heads,
                                                 dropout_p)
     self.dropout = nn.Dropout(p=dropout_p)
     self.device = device
Example 7
 def __init__(self,
              encoder_dim: int = 512,
              expansion_factor: int = 4,
              dropout_p: float = 0.1,
              device: torch.device = 'cuda') -> None:
     super(FeedForwardModule, self).__init__()
     self.device = device
     self.sequential = nn.Sequential(
         LayerNorm(encoder_dim),
         Linear(encoder_dim, encoder_dim * expansion_factor, bias=True),
         Swish(),
         nn.Dropout(p=dropout_p),
         Linear(encoder_dim * expansion_factor, encoder_dim, bias=True),
         nn.Dropout(p=dropout_p),
     )
Example 8
 def __init__(self, d_model: int = 512, input_dim: int = 80, d_ff: int = 2048,
              num_layers: int = 6, num_heads: int = 8, ffnet_style: str = 'ff',
              dropout_p: float = 0.3, pad_id: int = 0) -> None:
     super(TransformerEncoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.pad_id = pad_id
     self.input_proj = Linear(input_dim, d_model)
     self.input_layer_norm = LayerNorm(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.pos_encoding = PositionalEncoding(d_model)
     self.layers = nn.ModuleList(
         [TransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p, ffnet_style) for _ in range(num_layers)]
     )
Example 9
    def __init__(
        self,
        in_channels: int,
        kernel_size: int = 31,
        expansion_factor: int = 2,
        dropout_p: float = 0.1,
        device: torch.device = 'cuda',
    ) -> None:
        super(ConformerConvModule, self).__init__()
        assert (kernel_size - 1) % 2 == 0, \
            "kernel_size should be an odd number for 'SAME' padding"
        assert expansion_factor == 2, "Currently, only supports expansion_factor 2"

        self.device = device
        self.sequential = nn.Sequential(
            LayerNorm(in_channels),
            Transpose(shape=(1, 2)),
            PointwiseConv1d(in_channels,
                            in_channels * expansion_factor,
                            stride=1,
                            padding=0,
                            bias=True),
            GLU(dim=1),
            DepthwiseConv1d(in_channels,
                            in_channels,
                            kernel_size,
                            stride=1,
                            padding=(kernel_size - 1) // 2),
            nn.BatchNorm1d(in_channels),
            Swish(),
            PointwiseConv1d(in_channels,
                            in_channels,
                            stride=1,
                            padding=0,
                            bias=True),
            nn.Dropout(p=dropout_p),
        )
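PointwiseConv1d, DepthwiseConv1d, Swish, and Transpose in Example 9 are library helpers. The sketch below rebuilds the same channel flow with plain torch.nn layers (pointwise conv as Conv1d with kernel_size=1, depthwise conv as Conv1d with groups=channels, Swish as SiLU), starting from a channels-first tensor so the LayerNorm/Transpose step is omitted; it is an illustrative assumption, not the library's implementation.

import torch
import torch.nn as nn

def conformer_conv_sketch(channels: int = 512, kernel_size: int = 31, dropout_p: float = 0.1) -> nn.Sequential:
    return nn.Sequential(
        nn.Conv1d(channels, channels * 2, kernel_size=1),               # pointwise expansion (x2 channels)
        nn.GLU(dim=1),                                                  # gating halves channels back to `channels`
        nn.Conv1d(channels, channels, kernel_size,
                  padding=(kernel_size - 1) // 2, groups=channels),     # depthwise conv, 'SAME' output length
        nn.BatchNorm1d(channels),
        nn.SiLU(),                                                      # Swish activation
        nn.Conv1d(channels, channels, kernel_size=1),                   # pointwise projection
        nn.Dropout(p=dropout_p),
    )

x = torch.randn(2, 512, 100)                  # (batch, channels, time): channels-first layout for Conv1d
print(conformer_conv_sketch()(x).shape)       # torch.Size([2, 512, 100])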
Example 10
    def __init__(
            self,
            num_classes: int,  # number of classes
            d_model: int = 512,  # dimension of model
            d_ff: int = 512,  # dimension of feed forward network
            num_layers: int = 6,  # number of decoder layers
            num_heads: int = 8,  # number of attention heads
            dropout_p: float = 0.3,  # probability of dropout
            pad_id: int = 0,  # identification of pad token
            sos_id: int = 1,  # identification of start of sentence token
            eos_id: int = 2,  # identification of end of sentence token
            max_length: int = 400,  # max length of decoding
    ) -> None:
        super(TransformerDecoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_length = max_length
        self.pad_id = pad_id
        self.sos_id = sos_id
        self.eos_id = eos_id

        self.embedding = Embedding(num_classes, pad_id, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.input_dropout = nn.Dropout(p=dropout_p)
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(
                d_model=d_model,
                num_heads=num_heads,
                d_ff=d_ff,
                dropout_p=dropout_p,
            ) for _ in range(num_layers)
        ])
        self.fc = nn.Sequential(
            LayerNorm(d_model),
            Linear(d_model, num_classes, bias=False),
        )
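A decoder configured as in Example 10 needs a padding mask built from pad_id and a causal mask that blocks attention to future positions. The helpers below sketch those two masks; the True/False polarity and shapes are assumptions about how the surrounding library combines them, shown only to make the role of pad_id concrete.

import torch

def get_pad_mask(targets: torch.Tensor, pad_id: int = 0) -> torch.Tensor:
    # (batch, seq_len) -> (batch, 1, seq_len); True where the position is padding
    return (targets == pad_id).unsqueeze(1)

def get_subsequent_mask(targets: torch.Tensor) -> torch.Tensor:
    # (batch, seq_len) -> (seq_len, seq_len); True above the diagonal, i.e. future positions
    seq_len = targets.size(1)
    return torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)

targets = torch.tensor([[1, 5, 9, 2, 0, 0]])          # <sos> ... <eos> <pad> <pad>
self_attn_mask = get_pad_mask(targets) | get_subsequent_mask(targets)
print(self_attn_mask.shape)                           # torch.Size([1, 6, 6])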
Example 11
 def __init__(self, sublayer: nn.Module, d_model: int = 512) -> None:
     super(AddNorm, self).__init__()
     self.sublayer = sublayer
     self.layer_norm = LayerNorm(d_model)
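Example 11 again shows only the constructor. For a post-norm wrapper like AddNorm, the forward pass typically applies the sublayer, adds the original input back, and then layer-normalizes the sum; the sketch below illustrates that pattern and is an assumption, including the handling of sublayers (such as attention) that return a tuple.

import torch
import torch.nn as nn

class AddNormSketch(nn.Module):
    """Illustrative residual-then-LayerNorm wrapper matching the constructor above."""

    def __init__(self, sublayer: nn.Module, d_model: int = 512) -> None:
        super().__init__()
        self.sublayer = sublayer
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, *args: torch.Tensor):
        residual = args[0]
        output = self.sublayer(*args)
        if isinstance(output, tuple):               # e.g. attention modules returning (context, attn)
            return self.layer_norm(output[0] + residual), output[1]
        return self.layer_norm(output + residual)

x = torch.randn(2, 50, 512)
print(AddNormSketch(nn.Linear(512, 512))(x).shape)   # torch.Size([2, 50, 512])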