Example 1
 def __init__(
     self,
     num_classes: int,  # number of classes
     d_model: int = 512,  # dimension of model
     d_ff: int = 512,  # dimension of feed forward network
     num_layers: int = 6,  # number of decoder layers
     num_heads: int = 8,  # number of attention heads
     ffnet_style: str = 'ff',  # style of feed forward network
     dropout_p: float = 0.3,  # probability of dropout
     pad_id: int = 0,  # identification of pad token
     eos_id: int = 2  # identification of end of sentence token
 ) -> None:
     super(SpeechTransformerDecoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.embedding = Embedding(num_classes, pad_id, d_model)
     self.positional_encoding = PositionalEncoding(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.layers = nn.ModuleList([
         SpeechTransformerDecoderLayer(d_model, num_heads, d_ff, dropout_p,
                                       ffnet_style)
         for _ in range(num_layers)
     ])
     self.pad_id = pad_id
     self.eos_id = eos_id
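
A minimal usage sketch for the constructor above. It only exercises the parameters shown; the class and its helpers (Embedding, PositionalEncoding, SpeechTransformerDecoderLayer) are assumed to be importable from the surrounding project, and the vocabulary size of 2000 is an illustrative value, not taken from the source.

    # Hypothetical instantiation; assumes SpeechTransformerDecoder is in scope.
    decoder = SpeechTransformerDecoder(
        num_classes=2000,   # illustrative vocabulary size (assumption)
        d_model=512,
        d_ff=512,
        num_layers=6,
        num_heads=8,
        ffnet_style='ff',
        dropout_p=0.3,
        pad_id=0,
        eos_id=2,
    )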
Example 2
 def __init__(
         self,
         d_model: int = 512,  # dimension of model
         input_dim: int = 80,  # dimension of feature vector
         d_ff: int = 2048,  # dimension of feed forward network
         num_layers: int = 6,  # number of encoder layers
         num_heads: int = 8,  # number of attention heads
         ffnet_style: str = 'ff',  # style of feed forward network [ff, conv]
         dropout_p: float = 0.3,  # probability of dropout
         pad_id: int = 0,  # identification of pad token
 ) -> None:
     super(SpeechTransformerEncoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.pad_id = pad_id
     self.input_proj = Linear(input_dim, d_model)
     self.input_norm = LayerNorm(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.positional_encoding = PositionalEncoding(d_model)
     self.layers = nn.ModuleList([
         SpeechTransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p,
                                       ffnet_style)
         for _ in range(num_layers)
     ])
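
The constructor above places a Linear projection (input_dim → d_model), a LayerNorm and a Dropout in front of the encoder stack. The sketch below replays that front end with plain torch.nn modules on a dummy filter-bank batch, purely to show the tensor shapes; it is not the class's actual forward pass, and the project's PositionalEncoding (whose call signature is not shown here) would also be applied.

    import torch
    import torch.nn as nn

    batch, time, input_dim, d_model = 4, 100, 80, 512   # illustrative shapes
    features = torch.randn(batch, time, input_dim)       # dummy acoustic features

    proj = nn.Linear(input_dim, d_model)   # mirrors self.input_proj
    norm = nn.LayerNorm(d_model)           # mirrors self.input_norm
    drop = nn.Dropout(p=0.3)               # mirrors self.input_dropout

    x = drop(norm(proj(features)))
    print(x.shape)                         # torch.Size([4, 100, 512])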
Example 3
 def __init__(
         self,
         input_dim: int,  # dimension of feature vector
         extractor: str = 'vgg',  # convolutional extractor
         d_model: int = 512,  # dimension of model
         d_ff: int = 2048,  # dimension of feed forward network
         num_layers: int = 6,  # number of encoder layers
         num_heads: int = 8,  # number of attention heads
         dropout_p: float = 0.3,  # probability of dropout
         joint_ctc_attention: bool = False,  # use CTC Loss & Cross Entropy Joint Learning
         num_classes: int = None,  # number of classes
 ) -> None:
     super(TransformerEncoder,
           self).__init__(input_dim=input_dim,
                          extractor=extractor,
                          d_model=d_model,
                          num_classes=num_classes,
                          dropout_p=dropout_p,
                          joint_ctc_attention=joint_ctc_attention)
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.input_proj = Linear(self.conv_output_dim, d_model)
     self.input_layer_norm = LayerNorm(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.positional_encoding = PositionalEncoding(d_model)
     self.layers = nn.ModuleList([
         TransformerEncoderLayer(
             d_model=d_model,
             num_heads=num_heads,
             d_ff=d_ff,
             dropout_p=dropout_p,
         ) for _ in range(num_layers)
     ])
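
An instantiation sketch for this encoder. The convolutional front end (extractor, self.conv_output_dim) lives in the base class, which is not part of the snippet, so the import path and base-class behaviour are assumptions; only the parameters listed above are used. num_classes defaults to None and is presumably only needed when joint_ctc_attention is enabled, for the auxiliary CTC projection.

    # Hypothetical instantiation; assumes TransformerEncoder is importable from the project.
    encoder = TransformerEncoder(
        input_dim=80,               # e.g. 80-dim filter-bank features
        extractor='vgg',            # convolutional extractor named in the signature
        d_model=512,
        d_ff=2048,
        num_layers=6,
        num_heads=8,
        dropout_p=0.3,
        joint_ctc_attention=True,   # joint CTC / cross-entropy training
        num_classes=2000,           # illustrative vocabulary size (assumption)
    )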
Example 4
 def __init__(self,
              d_model: int,  # dimension of model
              num_heads: int,  # number of attention heads
              dropout_p: float = 0.1,  # probability of dropout
              device: torch.device = 'cuda'):  # device on which the module runs
     super(MultiHeadedSelfAttentionModule, self).__init__()
     self.positional_encoding = PositionalEncoding(d_model)
     self.layer_norm = LayerNorm(d_model)
     self.attention = RelativeMultiHeadAttention(d_model, num_heads,
                                                 dropout_p)
     self.dropout = nn.Dropout(p=dropout_p)
     self.device = device
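
A constructor-only sketch for the attention module above. The forward signature of RelativeMultiHeadAttention is not part of the snippet, so only instantiation is shown; the device selection is an illustrative choice.

    import torch

    # Hypothetical instantiation; assumes MultiHeadedSelfAttentionModule is in scope.
    mhsa = MultiHeadedSelfAttentionModule(
        d_model=512,
        num_heads=8,
        dropout_p=0.1,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    )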
Example 5
 def __init__(self, d_model: int = 512, input_dim: int = 80, d_ff: int = 2048,
              num_layers: int = 6, num_heads: int = 8, ffnet_style: str = 'ff',
              dropout_p: float = 0.3, pad_id: int = 0) -> None:
     super(TransformerEncoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.pad_id = pad_id
     self.input_proj = Linear(input_dim, d_model)
     self.input_layer_norm = LayerNorm(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.pos_encoding = PositionalEncoding(d_model)
     self.layers = nn.ModuleList(
         [TransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p, ffnet_style) for _ in range(num_layers)]
     )
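
This constructor, like the others, keeps its stacked layers in an nn.ModuleList rather than a plain Python list. That detail matters: only nn.ModuleList registers the sub-layers' parameters with the parent module, so they appear in .parameters() and follow .to()/.cuda() calls. The sketch below demonstrates the difference with stand-in nn.Linear layers; it is a general PyTorch illustration, not code from this project.

    import torch.nn as nn

    class GoodStack(nn.Module):
        def __init__(self):
            super().__init__()
            # Registered: visible to optimizers and device moves.
            self.layers = nn.ModuleList([nn.Linear(512, 512) for _ in range(6)])

    class BadStack(nn.Module):
        def __init__(self):
            super().__init__()
            # NOT registered: a plain list hides these parameters.
            self.layers = [nn.Linear(512, 512) for _ in range(6)]

    print(sum(p.numel() for p in GoodStack().parameters()))  # 6 * (512*512 + 512) = 1575936
    print(sum(p.numel() for p in BadStack().parameters()))   # 0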
Example 6
 def __init__(self, num_classes: int, d_model: int = 512, d_ff: int = 512,
              num_layers: int = 6, num_heads: int = 8, ffnet_style: str = 'ff',
              dropout_p: float = 0.3, pad_id: int = 0) -> None:
     super(TransformerDecoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.embedding = Embedding(num_classes, pad_id, d_model)
     self.pos_encoding = PositionalEncoding(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.layers = nn.ModuleList(
          [TransformerDecoderLayer(d_model, num_heads, d_ff, dropout_p, ffnet_style) for _ in range(num_layers)]
     )
     self.pad_id = pad_id
     self.logit_scale = (d_model ** -0.5)
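
Beyond the fields shared with the other decoder examples, this constructor stores logit_scale = d_model ** -0.5, i.e. 1 / sqrt(d_model). The attribute name suggests it is applied to the output logits, though that step is not visible in the snippet; for the default d_model = 512 the value works out as follows.

    d_model = 512
    logit_scale = d_model ** -0.5   # 1 / sqrt(512)
    print(logit_scale)              # ≈ 0.0441941738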
Example 7
    def __init__(
            self,
            num_classes: int,  # number of classes
            d_model: int = 512,  # dimension of model
            d_ff: int = 512,  # dimension of feed forward network
            num_layers: int = 6,  # number of decoder layers
            num_heads: int = 8,  # number of attention heads
            dropout_p: float = 0.3,  # probability of dropout
            pad_id: int = 0,  # identification of pad token
            sos_id: int = 1,  # identification of start of sentence token
            eos_id: int = 2,  # identification of end of sentence token
            max_length: int = 400,  # max length of decoding
    ) -> None:
        super(TransformerDecoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_length = max_length
        self.pad_id = pad_id
        self.sos_id = sos_id
        self.eos_id = eos_id

        self.embedding = Embedding(num_classes, pad_id, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.input_dropout = nn.Dropout(p=dropout_p)
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(
                d_model=d_model,
                num_heads=num_heads,
                d_ff=d_ff,
                dropout_p=dropout_p,
            ) for _ in range(num_layers)
        ])
        self.fc = nn.Sequential(
            nn.LayerNorm(d_model),
            Linear(d_model, num_classes, bias=False),
        )
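
The final self.fc pairs a LayerNorm with a bias-free Linear that maps decoder states of width d_model to num_classes logits. The sketch below runs an equivalent head built from plain torch.nn modules on a dummy batch to show the shape change; the batch size, target length and vocabulary size are illustrative only.

    import torch
    import torch.nn as nn

    d_model, num_classes = 512, 2000                  # vocabulary size is illustrative
    head = nn.Sequential(
        nn.LayerNorm(d_model),
        nn.Linear(d_model, num_classes, bias=False),  # plain nn.Linear standing in for the project's Linear
    )

    decoder_states = torch.randn(4, 50, d_model)      # (batch, target_length, d_model)
    logits = head(decoder_states)
    print(logits.shape)                               # torch.Size([4, 50, 2000])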