Example no. 1
 def __init__(
     self,
     num_classes: int,  # number of classes
     d_model: int = 512,  # dimension of model
     d_ff: int = 512,  # dimension of feed forward network
     num_layers: int = 6,  # number of decoder layers
     num_heads: int = 8,  # number of attention heads
     ffnet_style: str = 'ff',  # style of feed forward network
     dropout_p: float = 0.3,  # probability of dropout
     pad_id: int = 0,  # identification of pad token
     eos_id: int = 2  # identification of end of sentence token
 ) -> None:
     super(SpeechTransformerDecoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.embedding = Embedding(num_classes, pad_id, d_model)
     self.positional_encoding = PositionalEncoding(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.layers = nn.ModuleList([
         SpeechTransformerDecoderLayer(d_model, num_heads, d_ff, dropout_p,
                                       ffnet_style)
         for _ in range(num_layers)
     ])
     self.pad_id = pad_id
     self.eos_id = eos_id
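
The constructor above relies on two helper modules, Embedding and PositionalEncoding, that are not shown here. The sketch below is a minimal, hypothetical version of both, assuming the standard "Attention Is All You Need" formulation (token embeddings scaled by sqrt(d_model), fixed sinusoidal positions); the project's own implementations may differ in detail.

 import math

 import torch
 import torch.nn as nn


 class Embedding(nn.Module):
     # Token embedding scaled by sqrt(d_model), with a padding index.
     def __init__(self, num_classes: int, pad_id: int, d_model: int = 512) -> None:
         super().__init__()
         self.sqrt_dim = math.sqrt(d_model)
         self.embedding = nn.Embedding(num_classes, d_model, padding_idx=pad_id)

     def forward(self, inputs: torch.Tensor) -> torch.Tensor:
         return self.embedding(inputs) * self.sqrt_dim


 class PositionalEncoding(nn.Module):
     # Fixed sinusoidal positional encoding; forward returns the first
     # `length` positions so they can be added to the embedded inputs.
     def __init__(self, d_model: int = 512, max_len: int = 5000) -> None:
         super().__init__()
         pe = torch.zeros(max_len, d_model, requires_grad=False)
         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
         div_term = torch.exp(
             torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)
         )
         pe[:, 0::2] = torch.sin(position * div_term)
         pe[:, 1::2] = torch.cos(position * div_term)
         self.register_buffer("pe", pe.unsqueeze(0))

     def forward(self, length: int) -> torch.Tensor:
         return self.pe[:, :length]
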
Example no. 2
 def __init__(self, num_classes: int, d_model: int = 512, d_ff: int = 512,
              num_layers: int = 6, num_heads: int = 8, ffnet_style: str = 'ff',
              dropout_p: float = 0.3, pad_id: int = 0) -> None:
     super(TransformerDecoder, self).__init__()
     self.d_model = d_model
     self.num_layers = num_layers
     self.num_heads = num_heads
     self.embedding = Embedding(num_classes, pad_id, d_model)
     self.pos_encoding = PositionalEncoding(d_model)
     self.input_dropout = nn.Dropout(p=dropout_p)
     self.layers = nn.ModuleList([
         TransformerDecoderLayer(d_model, num_heads, d_ff, dropout_p, ffnet_style)
         for _ in range(num_layers)
     ])
     self.pad_id = pad_id
     self.logit_scale = (d_model ** -0.5)
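
The logit_scale attribute (d_model ** -0.5) suggests this decoder ties its output projection to the embedding weights, as in the original Transformer. The helper below, project_to_vocab, is a hypothetical sketch of how such a scale could be applied when computing output log-probabilities; it is not taken from the source.

 import torch
 import torch.nn.functional as F

 def project_to_vocab(decoder_output: torch.Tensor,
                      embedding_weight: torch.Tensor,
                      logit_scale: float) -> torch.Tensor:
     # decoder_output:   (batch, target_len, d_model)
     # embedding_weight: (num_classes, d_model), shared with the input embedding
     # The d_model ** -0.5 factor offsets the sqrt(d_model) scaling applied to
     # the input embeddings, keeping tied-weight logits in a reasonable range.
     logits = F.linear(decoder_output, embedding_weight) * logit_scale
     return F.log_softmax(logits, dim=-1)
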
Example no. 3
    def __init__(
            self,
            num_classes: int,  # number of classes
            d_model: int = 512,  # dimension of model
            d_ff: int = 512,  # dimension of feed forward network
            num_layers: int = 6,  # number of decoder layers
            num_heads: int = 8,  # number of attention heads
            dropout_p: float = 0.3,  # probability of dropout
            pad_id: int = 0,  # identification of pad token
            sos_id: int = 1,  # identification of start of sentence token
            eos_id: int = 2,  # identification of end of sentence token
            max_length: int = 400,  # max length of decoding
    ) -> None:
        super(TransformerDecoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_length = max_length
        self.pad_id = pad_id
        self.sos_id = sos_id
        self.eos_id = eos_id

        self.embedding = Embedding(num_classes, pad_id, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.input_dropout = nn.Dropout(p=dropout_p)
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(
                d_model=d_model,
                num_heads=num_heads,
                d_ff=d_ff,
                dropout_p=dropout_p,
            ) for _ in range(num_layers)
        ])
        self.fc = nn.Sequential(
            nn.LayerNorm(d_model),
            Linear(d_model, num_classes, bias=False),
        )
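
To show how the modules initialized above would compose at decode time, here is a hedged sketch of a single decoding step over target token ids and encoder outputs. The name decode_step, the mask arguments, and the assumed (outputs, self_attn, cross_attn) return signature of each layer are illustrative assumptions, not the project's actual forward method.

 import torch

 def decode_step(decoder, targets: torch.Tensor, encoder_outputs: torch.Tensor,
                 self_attn_mask=None, encoder_attn_mask=None) -> torch.Tensor:
     # targets:         (batch, target_len) token ids
     # encoder_outputs: (batch, source_len, d_model)
     outputs = decoder.embedding(targets) + decoder.positional_encoding(targets.size(1))
     outputs = decoder.input_dropout(outputs)
     for layer in decoder.layers:
         # Assumed layer interface; the real TransformerDecoderLayer may differ.
         outputs, _, _ = layer(outputs, encoder_outputs, self_attn_mask, encoder_attn_mask)
     # LayerNorm + linear projection to the vocabulary, then log-probabilities.
     return decoder.fc(outputs).log_softmax(dim=-1)
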