def __init__(
        self,
        num_classes: int,                    # number of classes
        max_length: int = 120,               # maximum allowed length of the sequence to be processed
        hidden_dim: int = 1024,               # dimension of the RNN's hidden state vector
        sos_id: int = 1,                      # start-of-sentence token id
        eos_id: int = 2,                      # end-of-sentence token id
        attn_mechanism: str = 'multi-head',   # type of attention mechanism
        num_heads: int = 4,                   # number of attention heads
        num_layers: int = 2,                  # number of RNN layers
        rnn_type: str = 'lstm',               # type of RNN cell
        dropout_p: float = 0.3,               # dropout probability
        device: str = 'cuda',                 # device - 'cuda' or 'cpu'
) -> None:
    super(LanguageDecoderRNN, self).__init__(hidden_dim, hidden_dim, num_layers, rnn_type, dropout_p, False, device)
    self.num_classes = num_classes
    self.num_heads = num_heads
    self.max_length = max_length
    self.eos_id = eos_id
    self.sos_id = sos_id
    self.attn_mechanism = attn_mechanism.lower()
    self.embedding = nn.Embedding(num_classes, hidden_dim)
    self.input_dropout = nn.Dropout(dropout_p)
    self.attention = AddNorm(MultiHeadAttention(hidden_dim), hidden_dim)
    self.projection = AddNorm(Linear(hidden_dim, hidden_dim, bias=True), hidden_dim)
    self.generator = Linear(hidden_dim, num_classes, bias=False)
def __init__(
        self,
        dim: int = 512,
        num_heads: int = 16,
        dropout_p: float = 0.1,
) -> None:
    super(RelativeMultiHeadAttention, self).__init__()

    assert dim % num_heads == 0, "dim % num_heads should be zero."

    self.dim = dim
    self.d_head = int(dim / num_heads)
    self.num_heads = num_heads
    self.sqrt_dim = math.sqrt(dim)

    self.query_proj = Linear(dim, dim)
    self.key_proj = Linear(dim, dim)
    self.value_proj = Linear(dim, dim)
    self.pos_proj = Linear(dim, dim, bias=False)

    self.dropout = nn.Dropout(p=dropout_p)
    self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
    self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
    torch.nn.init.xavier_uniform_(self.u_bias)
    torch.nn.init.xavier_uniform_(self.v_bias)

    self.out_proj = Linear(dim, dim)
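# Hedged sketch (assumption: the forward pass is not shown here). The u_bias /
# v_bias parameters above are the content- and position-bias terms used in
# Transformer-XL style relative attention, where the score is typically built
# from (q + u_bias)·kᵀ and (q + v_bias)·posᵀ, scaled by sqrt(dim). Minimal
# shape check of the head split implied by d_head = dim / num_heads:
import torch
import torch.nn as nn

dim, num_heads = 512, 16
d_head = dim // num_heads                                  # 32 features per head
x = torch.randn(2, 10, dim)                                # (batch, seq_len, dim)
q = nn.Linear(dim, dim)(x).view(2, 10, num_heads, d_head)  # split into heads
print(q.shape)                                             # torch.Size([2, 10, 16, 32])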
def __init__(
        self,
        num_classes: int,                    # number of classes
        max_length: int = 120,               # maximum allowed length of the sequence to be processed
        hidden_dim: int = 1024,               # dimension of the RNN's hidden state vector
        sos_id: int = 1,                      # start-of-sentence token id
        eos_id: int = 2,                      # end-of-sentence token id
        attn_mechanism: str = 'multi-head',   # type of attention mechanism
        num_heads: int = 4,                   # number of attention heads
        num_layers: int = 2,                  # number of RNN layers
        rnn_type: str = 'lstm',               # type of RNN cell
        dropout_p: float = 0.3,               # dropout probability
        device: str = 'cuda',                 # device - 'cuda' or 'cpu'
) -> None:
    super(SpeechDecoderRNN, self).__init__(hidden_dim, hidden_dim, num_layers, rnn_type, dropout_p, False, device)
    self.num_classes = num_classes
    self.num_heads = num_heads
    self.max_length = max_length
    self.eos_id = eos_id
    self.sos_id = sos_id
    self.attn_mechanism = attn_mechanism.lower()
    self.embedding = nn.Embedding(num_classes, hidden_dim)
    self.input_dropout = nn.Dropout(dropout_p)

    if self.attn_mechanism == 'loc':
        self.attention = AddNorm(LocationAwareAttention(hidden_dim, smoothing=True), hidden_dim)
    elif self.attn_mechanism == 'multi-head':
        self.attention = AddNorm(MultiHeadAttention(hidden_dim, num_heads), hidden_dim)
    elif self.attn_mechanism == 'additive':
        self.attention = AddNorm(AdditiveAttention(hidden_dim), hidden_dim)
    elif self.attn_mechanism == 'scaled-dot':
        self.attention = AddNorm(ScaledDotProductAttention(hidden_dim), hidden_dim)
    else:
        raise ValueError("Unsupported attention: {0}".format(attn_mechanism))

    self.projection = AddNorm(Linear(hidden_dim, hidden_dim, bias=True), hidden_dim)
    self.generator = Linear(hidden_dim, num_classes, bias=False)
def __init__(self, d_model: int = 512, d_ff: int = 2048, dropout_p: float = 0.3, ffnet_style: str = 'ff') -> None:
    super(PositionWiseFeedForwardNet, self).__init__()
    self.ffnet_style = ffnet_style.lower()

    if self.ffnet_style == 'ff':
        self.feed_forward = nn.Sequential(
            Linear(d_model, d_ff),
            nn.Dropout(dropout_p),
            nn.ReLU(),
            Linear(d_ff, d_model),
            nn.Dropout(dropout_p),
        )
    elif self.ffnet_style == 'conv':
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
    else:
        raise ValueError("Unsupported ffnet_style: {0}".format(self.ffnet_style))
def __init__(self, dim: int = 1024, attn_dim: int = 1024, smoothing: bool = False) -> None:
    super(LocationAwareAttention, self).__init__()
    self.location_conv = nn.Conv1d(in_channels=1, out_channels=attn_dim, kernel_size=3, padding=1)
    self.query_proj = Linear(dim, attn_dim, bias=False)
    self.value_proj = Linear(dim, attn_dim, bias=False)
    self.bias = nn.Parameter(torch.rand(attn_dim).uniform_(-0.1, 0.1))
    self.fc = Linear(attn_dim, 1, bias=True)
    self.smoothing = smoothing
def __init__(
        self,
        num_classes: int,
        max_length: int = 150,
        hidden_state_dim: int = 1024,
        pad_id: int = 0,
        sos_id: int = 1,
        eos_id: int = 2,
        attn_mechanism: str = 'multi-head',
        num_heads: int = 4,
        num_layers: int = 2,
        rnn_type: str = 'lstm',
        dropout_p: float = 0.3,
) -> None:
    super(DecoderRNN, self).__init__()
    self.hidden_state_dim = hidden_state_dim
    self.num_classes = num_classes
    self.num_heads = num_heads
    self.num_layers = num_layers
    self.max_length = max_length
    self.eos_id = eos_id
    self.sos_id = sos_id
    self.pad_id = pad_id
    self.attn_mechanism = attn_mechanism.lower()
    self.embedding = nn.Embedding(num_classes, hidden_state_dim)
    self.input_dropout = nn.Dropout(dropout_p)

    rnn_cell = self.supported_rnns[rnn_type.lower()]
    self.rnn = rnn_cell(
        input_size=hidden_state_dim,
        hidden_size=hidden_state_dim,
        num_layers=num_layers,
        bias=True,
        batch_first=True,
        dropout=dropout_p,
        bidirectional=False,
    )

    if self.attn_mechanism == 'loc':
        self.attention = LocationAwareAttention(hidden_state_dim, attn_dim=hidden_state_dim, smoothing=False)
    elif self.attn_mechanism == 'multi-head':
        self.attention = MultiHeadAttention(hidden_state_dim, num_heads=num_heads)
    elif self.attn_mechanism == 'additive':
        self.attention = AdditiveAttention(hidden_state_dim)
    elif self.attn_mechanism == 'scaled-dot':
        self.attention = ScaledDotProductAttention(dim=hidden_state_dim)
    else:
        raise ValueError("Unsupported attention: {0}".format(attn_mechanism))

    self.fc = nn.Sequential(
        Linear(hidden_state_dim << 1, hidden_state_dim),
        nn.Tanh(),
        View(shape=(-1, self.hidden_state_dim), contiguous=True),
        Linear(hidden_state_dim, num_classes),
    )
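# Hedged sketch of one decoding step implied by the layers above (the forward
# pass is not shown, so this is an assumption): the RNN output and the attention
# context are concatenated, which is why `fc` takes `hidden_state_dim << 1`
# input features before projecting to `num_classes`.
import torch

batch, hidden_state_dim = 4, 1024
rnn_output = torch.randn(batch, 1, hidden_state_dim)    # one decoder step
context = torch.randn(batch, 1, hidden_state_dim)       # attention output
step_input = torch.cat((rnn_output, context), dim=-1)   # (4, 1, 2048)
print(step_input.size(-1))                              # 2048 == hidden_state_dim << 1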
def __init__(self, dim: int = 512, num_heads: int = 8) -> None:
    super(MultiHeadAttention, self).__init__()

    assert dim % num_heads == 0, "dim % num_heads should be zero."

    self.d_head = int(dim / num_heads)
    self.num_heads = num_heads
    self.query_proj = Linear(dim, self.d_head * num_heads)
    self.key_proj = Linear(dim, self.d_head * num_heads)
    self.value_proj = Linear(dim, self.d_head * num_heads)
    self.scaled_dot_attn = ScaledDotProductAttention(self.d_head, scale=True)
def __init__(self, d_model: int = 512, num_heads: int = 8) -> None:
    super(MultiHeadAttention, self).__init__()

    assert d_model % num_heads == 0, "d_model % num_heads should be zero."

    self.d_head = int(d_model / num_heads)
    self.num_heads = num_heads
    self.query_proj = Linear(d_model, self.d_head * num_heads)
    self.key_proj = Linear(d_model, self.d_head * num_heads)
    self.value_proj = Linear(d_model, self.d_head * num_heads)
    self.sqrt_dim = np.sqrt(d_model)
def __init__(self, d_model: int = 512, num_heads: int = 8) -> None:
    super(MultiHeadAttention, self).__init__()

    assert d_model % num_heads == 0, "d_model % num_heads should be zero."

    self.d_head = int(d_model / num_heads)
    self.num_heads = num_heads
    self.scaled_dot_attn = ScaledDotProductAttention(d_model)
    self.query_proj = Linear(d_model, self.d_head * num_heads)
    self.key_proj = Linear(d_model, self.d_head * num_heads)
    self.value_proj = Linear(d_model, self.d_head * num_heads)
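# Hedged usage sketch (assumption: only the constructors are shown above). The
# three projections map (batch, time, d_model) inputs into num_heads slices of
# d_head features each, and scaled dot-product attention is then applied per head:
import torch
import torch.nn as nn
import torch.nn.functional as F

batch, time, d_model, num_heads = 2, 50, 512, 8
d_head = d_model // num_heads
proj = nn.Linear(d_model, d_head * num_heads)
x = torch.randn(batch, time, d_model)
q = proj(x).view(batch, time, num_heads, d_head).transpose(1, 2)  # (2, 8, 50, 64)
score = torch.matmul(q, q.transpose(-2, -1)) / (d_head ** 0.5)    # scaled dot product
attn = F.softmax(score, dim=-1)
print(attn.shape)                                                 # torch.Size([2, 8, 50, 50])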
def __init__(self, d_model: int = 512, smoothing: bool = True) -> None:
    super(LocationAwareAttention, self).__init__()
    self.d_model = d_model
    self.conv1d = nn.Conv1d(in_channels=1, out_channels=d_model, kernel_size=3, padding=1)
    self.query_proj = Linear(d_model, d_model, bias=False)
    self.value_proj = Linear(d_model, d_model, bias=False)
    self.bias = nn.Parameter(torch.rand(d_model).uniform_(-0.1, 0.1))
    self.score_proj = Linear(d_model, 1, bias=True)
    self.smoothing = smoothing
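# Hedged sketch (assumption; the forward pass is not shown): location-aware
# attention feeds the previous attention distribution through the conv1d above
# so the scorer can see where it has already attended, which discourages
# skipping or repeating encoder frames.
import torch
import torch.nn as nn

batch, enc_len, d_model = 2, 100, 512
last_alignment = torch.zeros(batch, 1, enc_len)        # previous attention weights
conv1d = nn.Conv1d(in_channels=1, out_channels=d_model, kernel_size=3, padding=1)
loc_features = conv1d(last_alignment).transpose(1, 2)  # (2, 100, 512), added into the score
print(loc_features.shape)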
def __init__(self, d_model: int = 512, d_ff: int = 2048, dropout_p: float = 0.3) -> None:
    super(PositionwiseFeedForward, self).__init__()
    self.feed_forward = nn.Sequential(
        Linear(d_model, d_ff),
        nn.Dropout(dropout_p),
        nn.ReLU(),
        Linear(d_ff, d_model),
        nn.Dropout(dropout_p),
    )
def __init__(
        self,
        encoder_dim: int = 512,
        expansion_factor: int = 4,
        dropout_p: float = 0.1,
        device: torch.device = 'cuda',
) -> None:
    super(FeedForwardModule, self).__init__()
    self.device = device
    self.sequential = nn.Sequential(
        LayerNorm(encoder_dim),
        Linear(encoder_dim, encoder_dim * expansion_factor, bias=True),
        Swish(),
        nn.Dropout(p=dropout_p),
        Linear(encoder_dim * expansion_factor, encoder_dim, bias=True),
        nn.Dropout(p=dropout_p),
    )
def __init__(
        self,
        encoder: TransducerEncoder,
        decoder: TransducerDecoder,
        d_model: int,
        num_classes: int,
) -> None:
    super(TransducerModel, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.fc = nn.Sequential(
        Linear(d_model << 1, d_model),
        nn.Tanh(),
        Linear(d_model, num_classes, bias=False),
    )
def __init__(
        self,
        input_dim: int,                      # dimension of feature vector
        extractor: str = 'vgg',              # convolutional extractor
        d_model: int = 512,                  # dimension of model
        d_ff: int = 2048,                    # dimension of feed forward network
        num_layers: int = 6,                 # number of encoder layers
        num_heads: int = 8,                  # number of attention heads
        dropout_p: float = 0.3,              # probability of dropout
        joint_ctc_attention: bool = False,   # use CTC Loss & Cross Entropy Joint Learning
        num_classes: int = None,             # number of classes
) -> None:
    super(TransformerEncoder, self).__init__(
        input_dim=input_dim,
        extractor=extractor,
        d_model=d_model,
        num_classes=num_classes,
        dropout_p=dropout_p,
        joint_ctc_attention=joint_ctc_attention,
    )
    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.input_proj = Linear(self.conv_output_dim, d_model)
    self.input_layer_norm = LayerNorm(d_model)
    self.input_dropout = nn.Dropout(p=dropout_p)
    self.positional_encoding = PositionalEncoding(d_model)
    self.layers = nn.ModuleList([
        TransformerEncoderLayer(
            d_model=d_model,
            num_heads=num_heads,
            d_ff=d_ff,
            dropout_p=dropout_p,
        ) for _ in range(num_layers)
    ])
def __init__(
        self,
        num_classes: int,
        d_model: int = 512,
        input_dim: int = 80,
        pad_id: int = 0,
        eos_id: int = 2,
        d_ff: int = 2048,
        num_heads: int = 8,
        num_encoder_layers: int = 6,
        num_decoder_layers: int = 6,
        dropout_p: float = 0.3,
        ffnet_style: str = 'ff',
) -> None:
    super(Transformer, self).__init__()

    assert d_model % num_heads == 0, "d_model % num_heads should be zero."

    self.eos_id = eos_id
    self.pad_id = pad_id
    self.encoder = TransformerEncoder(d_model, input_dim, d_ff, num_encoder_layers, num_heads, ffnet_style, dropout_p, pad_id)
    self.decoder = TransformerDecoder(num_classes, d_model, d_ff, num_decoder_layers, num_heads, ffnet_style, dropout_p, pad_id)
    self.generator = Linear(d_model, num_classes)
def __init__(
        self,
        input_dim: int,
        hidden_state_dim: int,
        output_dim: int,
        num_layers: int,
        rnn_type: str = 'lstm',
        dropout_p: float = 0.2,
        bidirectional: bool = True,
):
    super(EncoderRNNT, self).__init__()
    self.hidden_state_dim = hidden_state_dim
    rnn_cell = self.supported_rnns[rnn_type.lower()]
    self.rnn = rnn_cell(
        input_size=input_dim,
        hidden_size=hidden_state_dim,
        num_layers=num_layers,
        bias=True,
        batch_first=True,
        dropout=dropout_p,
        bidirectional=bidirectional,
    )
    self.out_proj = Linear(hidden_state_dim << 1 if bidirectional else hidden_state_dim, output_dim)
def __init__(
        self,
        input_dim: int,
        extractor: str = 'vgg',
        d_model: int = None,
        num_classes: int = None,
        dropout_p: float = None,
        activation: str = 'hardtanh',
        joint_ctc_attention: bool = False,
) -> None:
    super(BaseEncoder, self).__init__()

    if joint_ctc_attention:
        assert num_classes, "If `joint_ctc_attention` is True, `num_classes` should not be None"
        assert dropout_p, "If `joint_ctc_attention` is True, `dropout_p` should not be None"
        assert d_model, "If `joint_ctc_attention` is True, `d_model` should not be None"

    if extractor is not None:
        extractor = self.supported_extractors[extractor.lower()]
        self.conv = extractor(input_dim=input_dim, activation=activation)
        self.conv_output_dim = self.conv.get_output_dim()

    self.num_classes = num_classes
    self.joint_ctc_attention = joint_ctc_attention

    if self.joint_ctc_attention:
        self.fc = nn.Sequential(
            nn.BatchNorm1d(d_model),
            Transpose(shape=(1, 2)),
            nn.Dropout(dropout_p),
            Linear(d_model, num_classes, bias=False),
        )
def __init__(
        self,
        input_dim: int,
        num_classes: int,
        rnn_type='gru',
        num_rnn_layers: int = 5,
        rnn_hidden_dim: int = 512,
        dropout_p: float = 0.1,
        bidirectional: bool = True,
        activation: str = 'hardtanh',
        device: torch.device = 'cuda',
):
    super(DeepSpeech2, self).__init__()
    self.device = device
    self.conv = DeepSpeech2Extractor(input_dim, activation=activation)
    self.rnn_layers = nn.ModuleList()
    rnn_output_size = rnn_hidden_dim << 1 if bidirectional else rnn_hidden_dim

    for idx in range(num_rnn_layers):
        self.rnn_layers.append(
            BNReluRNN(
                input_size=self.conv.get_output_dim() if idx == 0 else rnn_output_size,
                hidden_state_dim=rnn_hidden_dim,
                rnn_type=rnn_type,
                bidirectional=bidirectional,
                dropout_p=dropout_p,
            )
        )

    self.fc = nn.Sequential(
        LayerNorm(rnn_output_size),
        Linear(rnn_output_size, num_classes, bias=False),
    )
def __init__(
        self,
        d_model: int = 512,          # dimension of model
        input_dim: int = 80,         # dimension of feature vector
        d_ff: int = 2048,            # dimension of feed forward network
        num_layers: int = 6,         # number of encoder layers
        num_heads: int = 8,          # number of attention heads
        ffnet_style: str = 'ff',     # style of feed forward network [ff, conv]
        dropout_p: float = 0.3,      # probability of dropout
        pad_id: int = 0,             # identification of pad token
) -> None:
    super(SpeechTransformerEncoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.pad_id = pad_id
    self.input_proj = Linear(input_dim, d_model)
    self.input_norm = LayerNorm(d_model)
    self.input_dropout = nn.Dropout(p=dropout_p)
    self.positional_encoding = PositionalEncoding(d_model)
    self.layers = nn.ModuleList([
        SpeechTransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p, ffnet_style) for _ in range(num_layers)
    ])
def __init__(
        self,
        num_classes: int,
        hidden_state_dim: int,
        output_dim: int,
        num_layers: int,
        rnn_type: str = 'lstm',
        sos_id: int = 1,
        eos_id: int = 2,
        dropout_p: float = 0.2,
):
    super(DecoderRNNT, self).__init__()
    self.hidden_state_dim = hidden_state_dim
    self.sos_id = sos_id
    self.eos_id = eos_id
    self.embedding = nn.Embedding(num_classes, hidden_state_dim)
    rnn_cell = self.supported_rnns[rnn_type.lower()]
    self.rnn = rnn_cell(
        input_size=hidden_state_dim,
        hidden_size=hidden_state_dim,
        num_layers=num_layers,
        bias=True,
        batch_first=True,
        dropout=dropout_p,
        bidirectional=False,
    )
    self.out_proj = Linear(hidden_state_dim, output_dim)
def __init__(
        self,
        encoder: TransducerEncoder,
        decoder: TransducerDecoder,
        d_model: int,
        num_classes: int,
) -> None:
    super(TransducerModel, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.fc = Linear(d_model << 1, num_classes, bias=False)
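# Hedged sketch of the RNN-T joint step implied by `Linear(d_model << 1, ...)`
# above (the joint/forward method is not shown, so this is an assumption): each
# encoder frame is paired with each prediction-network step, and the
# concatenated pair is projected to the output vocabulary.
import torch

batch, T, U, d_model = 2, 80, 12, 512
enc = torch.randn(batch, T, d_model).unsqueeze(2).expand(batch, T, U, d_model)
dec = torch.randn(batch, U, d_model).unsqueeze(1).expand(batch, T, U, d_model)
joint_input = torch.cat((enc, dec), dim=-1)   # (2, 80, 12, 1024), i.e. d_model << 1
print(joint_input.shape)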
def __init__(
        self,
        d_model: int = 512,
        input_dim: int = 80,
        d_ff: int = 2048,
        num_layers: int = 6,
        num_heads: int = 8,
        ffnet_style: str = 'ff',
        dropout_p: float = 0.3,
        pad_id: int = 0,
) -> None:
    super(TransformerEncoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.pad_id = pad_id
    self.input_proj = Linear(input_dim, d_model)
    self.input_layer_norm = LayerNorm(d_model)
    self.input_dropout = nn.Dropout(p=dropout_p)
    self.pos_encoding = PositionalEncoding(d_model)
    self.layers = nn.ModuleList(
        [TransformerEncoderLayer(d_model, num_heads, d_ff, dropout_p, ffnet_style) for _ in range(num_layers)]
    )
def __init__(
        self,
        input_size: int,                 # size of input
        num_classes: int,                # number of classes
        rnn_type='gru',                  # type of RNN cell
        num_rnn_layers: int = 5,         # number of RNN layers
        rnn_hidden_dim: int = 512,       # dimension of the RNN's hidden state
        dropout_p: float = 0.1,          # dropout probability
        bidirectional: bool = True,      # if True, becomes a bidirectional rnn
        activation: str = 'hardtanh',    # type of activation function
        device: torch.device = 'cuda',   # device - 'cuda' or 'cpu'
):
    super(DeepSpeech2, self).__init__()
    self.rnn_layers = list()
    self.device = device

    input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)
    input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)
    input_size <<= 5
    rnn_output_size = rnn_hidden_dim << 1 if bidirectional else rnn_hidden_dim

    self.conv = DeepSpeech2Extractor(activation, mask_conv=True)

    for idx in range(num_rnn_layers):
        self.rnn_layers.append(
            BNReluRNN(
                input_size=input_size if idx == 0 else rnn_output_size,
                hidden_dim=rnn_hidden_dim,
                rnn_type=rnn_type,
                bidirectional=bidirectional,
                dropout_p=dropout_p,
                device=device,
            )
        )

    self.fc = nn.Sequential(
        Linear(rnn_output_size, rnn_hidden_dim),
        nn.ReLU(),
        Linear(rnn_hidden_dim, num_classes, bias=False),
    )
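# Hedged note on the input_size arithmetic above: it is the usual conv output
# length formula, floor((L + 2 * padding - kernel_size) / stride) + 1, applied
# to two convolutions along the feature axis (kernel 41 / pad 20, then kernel
# 21 / pad 10, both stride 2), and the final shift multiplies by the assumed
# number of conv output channels (here << 5, i.e. 32). For an 80-dim input:
import math

input_size = 80
input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)   # 40
input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)   # 20
print(input_size << 5)                                           # 640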
def __init__(
        self,
        input_size: int,                      # size of input
        num_classes: int,                     # number of classes
        hidden_dim: int = 512,                # dimension of the RNN's hidden state
        device: str = 'cuda',                 # device - 'cuda' or 'cpu'
        dropout_p: float = 0.3,               # dropout probability
        num_layers: int = 3,                  # number of RNN layers
        bidirectional: bool = True,           # if True, becomes a bidirectional encoder
        rnn_type: str = 'lstm',               # type of RNN cell
        extractor: str = 'vgg',               # type of CNN extractor
        activation: str = 'hardtanh',         # type of activation function
        mask_conv: bool = False,              # flag indicating whether to apply masked convolution
        joint_ctc_attention: bool = False,    # use CTC Loss & Cross Entropy Joint Learning
) -> None:
    self.mask_conv = mask_conv
    self.extractor = extractor.lower()
    self.joint_ctc_attention = joint_ctc_attention

    if self.extractor == 'vgg':
        input_size = (input_size - 1) << 5 if input_size % 2 else input_size << 5
        super(Listener, self).__init__(input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device)
        self.conv = VGGExtractor(activation, mask_conv)
    elif self.extractor == 'ds2':
        input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)
        input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)
        input_size <<= 6
        super(Listener, self).__init__(input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device)
        self.conv = DeepSpeech2Extractor(activation, mask_conv)
    else:
        raise ValueError("Unsupported Extractor : {0}".format(extractor))

    if self.joint_ctc_attention:
        assert self.mask_conv, "If joint_ctc_attention training is used, mask_conv should be True"
        self.fc = nn.Sequential(
            nn.BatchNorm1d(self.hidden_dim << 1),
            Transpose(shape=(1, 2)),
            nn.Dropout(dropout_p),
            Linear(self.hidden_dim << 1, num_classes, bias=False),
        )
def __init__(
        self,
        num_classes: int,                # number of classes
        d_model: int = 512,              # dimension of model
        input_dim: int = 80,             # dimension of input
        pad_id: int = 0,                 # identification of <PAD_token>
        eos_id: int = 2,                 # identification of <EOS_token>
        d_ff: int = 2048,                # dimension of feed forward network
        num_heads: int = 8,              # number of attention heads
        num_encoder_layers: int = 6,     # number of encoder layers
        num_decoder_layers: int = 6,     # number of decoder layers
        dropout_p: float = 0.3,          # dropout probability
        ffnet_style: str = 'ff',         # feed forward network style 'ff' or 'conv'
) -> None:
    super(Transformer, self).__init__()

    assert d_model % num_heads == 0, "d_model % num_heads should be zero."

    self.eos_id = eos_id
    self.pad_id = pad_id
    self.encoder = TransformerEncoder(d_model, input_dim, d_ff, num_encoder_layers, num_heads, ffnet_style, dropout_p, pad_id)
    self.decoder = TransformerDecoder(num_classes, d_model, d_ff, num_decoder_layers, num_heads, ffnet_style, dropout_p, pad_id)
    self.generator = Linear(d_model, num_classes)
def __init__(
        self,
        input_dim: int = 80,
        encoder_dim: int = 512,
        num_layers: int = 17,
        num_attention_heads: int = 8,
        feed_forward_expansion_factor: int = 4,
        conv_expansion_factor: int = 2,
        input_dropout_p: float = 0.1,
        feed_forward_dropout_p: float = 0.1,
        attention_dropout_p: float = 0.1,
        conv_dropout_p: float = 0.1,
        conv_kernel_size: int = 31,
        half_step_residual: bool = True,
        device: torch.device = 'cuda',
):
    super(ConformerEncoder, self).__init__()
    self.conv_subsample = Conv2dSubsampling(input_dim, in_channels=1, out_channels=encoder_dim)
    self.input_projection = nn.Sequential(
        Linear(self.conv_subsample.get_output_dim(), encoder_dim),
        nn.Dropout(p=input_dropout_p),
    )
    self.layers = nn.ModuleList([
        ConformerBlock(
            encoder_dim=encoder_dim,
            num_attention_heads=num_attention_heads,
            feed_forward_expansion_factor=feed_forward_expansion_factor,
            conv_expansion_factor=conv_expansion_factor,
            feed_forward_dropout_p=feed_forward_dropout_p,
            attention_dropout_p=attention_dropout_p,
            conv_dropout_p=conv_dropout_p,
            conv_kernel_size=conv_kernel_size,
            half_step_residual=half_step_residual,
            device=device,
        ).to(device) for _ in range(num_layers)
    ])
def __init__(
        self,
        vocab_size: int,                 # size of vocab
        hidden_dim: int = 512,           # dimension of the RNN's hidden state
        device: str = 'cuda',            # device - 'cuda' or 'cpu'
        dropout_p: float = 0.3,          # dropout probability
        num_layers: int = 3,             # number of RNN layers
        bidirectional: bool = True,      # if True, becomes a bidirectional encoder
        rnn_type: str = 'lstm',          # type of RNN cell
) -> None:
    super(SpellingCorrectorEncoder, self).__init__()
    self.embedding = nn.Embedding(vocab_size, hidden_dim)
    self.layers = nn.ModuleList([
        SpellingCorrectorEncoderLayer(
            hidden_dim=hidden_dim,
            device=device,
            dropout_p=dropout_p,
            num_layers=1,
            bidirectional=bidirectional,
            rnn_type=rnn_type,
        ) for _ in range(num_layers)
    ])
    self.dropout = nn.Dropout(p=dropout_p)
    self.fc = Linear(hidden_dim, hidden_dim, bias=True)
def __init__(
        self,
        num_classes: int,            # number of classes
        d_model: int = 512,          # dimension of model
        d_ff: int = 512,             # dimension of feed forward network
        num_layers: int = 6,         # number of decoder layers
        num_heads: int = 8,          # number of attention heads
        dropout_p: float = 0.3,      # probability of dropout
        pad_id: int = 0,             # identification of pad token
        sos_id: int = 1,             # identification of start of sentence token
        eos_id: int = 2,             # identification of end of sentence token
        max_length: int = 400,       # max length of decoding
) -> None:
    super(TransformerDecoder, self).__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.max_length = max_length
    self.pad_id = pad_id
    self.sos_id = sos_id
    self.eos_id = eos_id

    self.embedding = Embedding(num_classes, pad_id, d_model)
    self.positional_encoding = PositionalEncoding(d_model)
    self.input_dropout = nn.Dropout(p=dropout_p)
    self.layers = nn.ModuleList([
        TransformerDecoderLayer(
            d_model=d_model,
            num_heads=num_heads,
            d_ff=d_ff,
            dropout_p=dropout_p,
        ) for _ in range(num_layers)
    ])
    self.fc = nn.Sequential(
        nn.LayerNorm(d_model),
        Linear(d_model, num_classes, bias=False),
    )
def __init__(
        self,
        num_classes: int,                # number of classes
        d_model: int = 512,              # dimension of model
        input_dim: int = 80,             # dimension of input
        pad_id: int = 0,                 # identification of <PAD_token>
        eos_id: int = 2,                 # identification of <EOS_token>
        d_ff: int = 2048,                # dimension of feed forward network
        num_heads: int = 8,              # number of attention heads
        num_encoder_layers: int = 6,     # number of encoder layers
        num_decoder_layers: int = 6,     # number of decoder layers
        dropout_p: float = 0.3,          # dropout probability
        ffnet_style: str = 'ff',         # feed forward network style 'ff' or 'conv'
        extractor: str = 'vgg',          # CNN extractor [vgg, ds2]
) -> None:
    super(SpeechTransformer, self).__init__()

    assert d_model % num_heads == 0, "d_model % num_heads should be zero."

    if extractor.lower() == 'vgg':
        input_dim = (input_dim - 1) << 5 if input_dim % 2 else input_dim << 5
        self.conv = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(num_features=64),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(num_features=64),
            nn.Hardtanh(0, 20, inplace=True),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(num_features=128),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(num_features=128),
            nn.Hardtanh(0, 20, inplace=True),
            nn.MaxPool2d(2, stride=2),
        )
    elif extractor.lower() == 'ds2':
        input_dim = int(math.floor(input_dim + 2 * 20 - 41) / 2 + 1)
        input_dim = int(math.floor(input_dim + 2 * 10 - 21) / 2 + 1)
        input_dim <<= 5
        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5), bias=False),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5), bias=False),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
        )
    else:
        raise ValueError("Unsupported Extractor : {0}".format(extractor))

    self.encoder = SpeechTransformerEncoder(
        d_model=d_model,
        input_dim=input_dim,
        d_ff=d_ff,
        num_layers=num_encoder_layers,
        num_heads=num_heads,
        ffnet_style=ffnet_style,
        dropout_p=dropout_p,
        pad_id=pad_id,
    )
    self.decoder = SpeechTransformerDecoder(
        num_classes=num_classes,
        d_model=d_model,
        d_ff=d_ff,
        num_layers=num_decoder_layers,
        num_heads=num_heads,
        ffnet_style=ffnet_style,
        dropout_p=dropout_p,
        pad_id=pad_id,
        eos_id=eos_id,
    )
    self.eos_id = eos_id
    self.pad_id = pad_id
    self.generator = Linear(d_model, num_classes)
def __init__(self, d_model: int) -> None:
    super(AdditiveAttention, self).__init__()
    self.query_proj = Linear(d_model, d_model, bias=False)
    self.key_proj = Linear(d_model, d_model, bias=False)
    self.bias = nn.Parameter(torch.rand(d_model).uniform_(-0.1, 0.1))
    self.score_proj = Linear(d_model, 1)
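# Hedged sketch of the additive (Bahdanau-style) score the layers above imply
# (the forward pass is not shown, so this is an assumption):
#   score = score_proj(tanh(query_proj(query) + key_proj(key) + bias))
import torch
import torch.nn as nn

batch, k_len, d_model = 2, 100, 512
query_proj = nn.Linear(d_model, d_model, bias=False)
key_proj = nn.Linear(d_model, d_model, bias=False)
score_proj = nn.Linear(d_model, 1)
bias = torch.zeros(d_model)

query = torch.randn(batch, 1, d_model)        # single decoder step
key = torch.randn(batch, k_len, d_model)      # encoder outputs
score = score_proj(torch.tanh(query_proj(query) + key_proj(key) + bias))
print(score.shape)                            # torch.Size([2, 100, 1])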