Example #1
    def __init__(
            self,
            input_size: int,                       # size of input
            hidden_dim: int = 512,                 # dimension of RNN's hidden state
            device: torch.device = 'cuda',         # device - 'cuda' or 'cpu'
            dropout_p: float = 0.3,                # dropout probability
            num_layers: int = 3,                   # number of RNN layers
            bidirectional: bool = True,            # if True, becomes a bidirectional encoder
            rnn_type: str = 'lstm',                # type of RNN cell
            extractor: str = 'vgg',                # type of CNN extractor
            activation: str = 'hardtanh',          # type of activation function
            mask_conv: bool = False                # flag indicating whether to apply mask convolution
    ) -> None:
        self.mask_conv = mask_conv
        self.extractor = extractor.lower()
        if self.extractor == 'vgg':
            input_size = (input_size - 1) << 5 if input_size % 2 else input_size << 5
            super(Listener, self).__init__(
                input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device
            )
            self.conv = VGGExtractor(activation, mask_conv)

        elif self.extractor == 'ds2':
            input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)  # conv 1: kernel 41, stride 2, padding 20
            input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)  # conv 2: kernel 21, stride 2, padding 10
            input_size <<= 5  # x32: fold the conv channels into the feature dimension
            super(Listener, self).__init__(
                input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device
            )
            self.conv = DeepSpeech2Extractor(activation, mask_conv)

        else:
            raise ValueError("Unsupported Extractor : {0}".format(extractor))
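
A quick sanity check on the input-size arithmetic above: << 5 is a multiply by 32, and the 'ds2' branch is the standard convolution output-length formula floor((n + 2 * padding - kernel) / stride) + 1 applied twice. A minimal, self-contained sketch; the channel count folded into the shift (32 here) is an assumption about the extractor internals:

    def conv_out(n: int, kernel: int, stride: int, padding: int) -> int:
        # standard convolution output-length formula
        return (n + 2 * padding - kernel) // stride + 1

    def ds2_feature_dim(input_size: int, out_channels: int = 32) -> int:
        # two stride-2 convolutions along the frequency axis, then flatten channels
        freq = conv_out(input_size, kernel=41, stride=2, padding=20)
        freq = conv_out(freq, kernel=21, stride=2, padding=10)
        return freq * out_channels

    assert ds2_feature_dim(80) == 640  # 80 -> 40 -> 20 frequency bins, x 32 channels
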
Example #2
    def __init__(
            self,
            input_size: int,  # size of input
            num_classes: int,  # number of classes
            hidden_dim: int = 512,  # dimension of RNN's hidden state
            device: str = 'cuda',  # device - 'cuda' or 'cpu'
            dropout_p: float = 0.3,  # dropout probability
            num_layers: int = 3,  # number of RNN layers
            bidirectional: bool = True,  # if True, becomes a bidirectional encoder
            rnn_type: str = 'lstm',  # type of RNN cell
            extractor: str = 'vgg',  # type of CNN extractor
            activation: str = 'hardtanh',  # type of activation function
            mask_conv: bool = False,  # flag indicating whether to apply mask convolution
            joint_ctc_attention: bool = False,  # use CTC loss & cross-entropy joint learning
    ) -> None:
        self.mask_conv = mask_conv
        self.extractor = extractor.lower()
        self.joint_ctc_attention = joint_ctc_attention

        if self.extractor == 'vgg':
            input_size = (input_size - 1) << 5 if input_size % 2 else input_size << 5
            super(Listener, self).__init__(
                input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device
            )
            self.conv = VGGExtractor(activation, mask_conv)

        elif self.extractor == 'ds2':
            input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)
            input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)
            input_size <<= 6
            super(Listener, self).__init__(
                input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device
            )
            self.conv = DeepSpeech2Extractor(activation, mask_conv)

        else:
            raise ValueError("Unsupported Extractor : {0}".format(extractor))

        if self.joint_ctc_attention:
            assert self.mask_conv, "joint CTC-attention training requires mask_conv to be True"
            self.fc = nn.Sequential(
                nn.BatchNorm1d(self.hidden_dim << 1),
                Transpose(shape=(1, 2)),
                nn.Dropout(dropout_p),
                Linear(self.hidden_dim << 1, num_classes, bias=False),
            )
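
The joint CTC-attention head above maps the (batch, features, time) encoder output to per-frame class logits. A shape-flow sketch in plain PyTorch, assuming the repo's Transpose and Linear helpers are thin wrappers around torch.transpose and nn.Linear (the Transpose stand-in below is hypothetical):

    import torch
    import torch.nn as nn

    class Transpose(nn.Module):
        # minimal stand-in for the repo's Transpose helper (assumption)
        def __init__(self, shape):
            super().__init__()
            self.shape = shape

        def forward(self, x):
            return x.transpose(*self.shape)

    batch, time, hidden_dim, num_classes = 4, 100, 512, 2000
    encoder_outputs = torch.randn(batch, hidden_dim * 2, time)  # (B, D, T)

    fc = nn.Sequential(
        nn.BatchNorm1d(hidden_dim * 2),                      # normalize over the feature dim
        Transpose(shape=(1, 2)),                             # (B, D, T) -> (B, T, D)
        nn.Dropout(0.3),
        nn.Linear(hidden_dim * 2, num_classes, bias=False),  # per-frame logits
    )

    log_probs = fc(encoder_outputs).log_softmax(dim=-1)  # (B, T, num_classes), suitable for nn.CTCLoss
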
Example #3
    def __init__(
        self,
        input_size: int,  # size of input
        num_classes: int,  # number of classes
        rnn_type: str = 'gru',  # type of RNN cell
        num_rnn_layers: int = 5,  # number of RNN layers
        rnn_hidden_dim: int = 512,  # dimension of RNN's hidden state
        dropout_p: float = 0.1,  # dropout probability
        bidirectional: bool = True,  # if True, becomes a bidirectional rnn
        activation: str = 'hardtanh',  # type of activation function
        device: torch.device = 'cuda'  # device - 'cuda' or 'cpu'
    ):
        super(DeepSpeech2, self).__init__()
        self.rnn_layers = nn.ModuleList()  # ModuleList registers the RNN layers as submodules
        self.device = device

        input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)
        input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)
        input_size <<= 5
        rnn_output_size = rnn_hidden_dim << 1 if bidirectional else rnn_hidden_dim

        self.conv = DeepSpeech2Extractor(activation, mask_conv=True)

        for idx in range(num_rnn_layers):
            self.rnn_layers.append(
                BNReluRNN(
                    input_size=input_size if idx == 0 else rnn_output_size,
                    hidden_dim=rnn_hidden_dim,
                    rnn_type=rnn_type,
                    bidirectional=bidirectional,
                    dropout_p=dropout_p,
                    device=device,
                )
            )

        self.fc = nn.Sequential(
            Linear(rnn_output_size, rnn_hidden_dim),
            nn.ReLU(),
            Linear(rnn_hidden_dim, num_classes, bias=False),
        )
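
The snippet uses nn.ModuleList rather than a plain Python list for a reason: layers held in a plain list are invisible to nn.Module, so .parameters(), .to(device), and state_dict() all miss them. The toy comparison below (Linear layers standing in for BNReluRNN) shows the difference:

    import torch.nn as nn

    class PlainList(nn.Module):
        def __init__(self):
            super().__init__()
            self.layers = [nn.Linear(8, 8) for _ in range(3)]  # hidden from nn.Module

    class Registered(nn.Module):
        def __init__(self):
            super().__init__()
            self.layers = nn.ModuleList(nn.Linear(8, 8) for _ in range(3))

    print(sum(p.numel() for p in PlainList().parameters()))   # 0 - nothing registered
    print(sum(p.numel() for p in Registered().parameters()))  # 216 = 3 * (8 * 8 + 8)
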
Example #4
    def __init__(
            self,
            num_classes: int,                       # number of classes
            d_model: int = 512,                     # dimension of model
            input_dim: int = 80,                    # dimension of input
            pad_id: int = 0,                        # id of the <PAD> token
            sos_id: int = 1,                        # id of the <SOS> token
            eos_id: int = 2,                        # id of the <EOS> token
            d_ff: int = 2048,                       # dimension of feed forward network
            num_heads: int = 8,                     # number of attention heads
            num_encoder_layers: int = 6,            # number of encoder layers
            num_decoder_layers: int = 6,            # number of decoder layers
            dropout_p: float = 0.3,                 # dropout probability
            ffnet_style: str = 'ff',                # feed forward network style 'ff' or 'conv'
            extractor: str = 'vgg',                 # CNN extractor [vgg, ds2]
            joint_ctc_attention: bool = False,      # flag indicating whether to apply joint CTC-attention
            max_length: int = 400                   # maximum allowed length of the sequence to be processed
    ) -> None:
        super(SpeechTransformer, self).__init__()

        assert d_model % num_heads == 0, "d_model % num_heads should be zero."

        self.num_classes = num_classes
        self.extractor = extractor
        self.joint_ctc_attention = joint_ctc_attention
        self.sos_id = sos_id
        self.eos_id = eos_id
        self.pad_id = pad_id
        self.max_length = max_length

        if self.extractor == 'vgg':
            input_dim = (input_dim - 1) << 5 if input_dim % 2 else input_dim << 5
            self.conv = VGGExtractor(mask_conv=False)

        elif self.extractor == 'ds2':
            input_dim = int(math.floor(input_dim + 2 * 20 - 41) / 2 + 1)
            input_dim = int(math.floor(input_dim + 2 * 10 - 21) / 2 + 1)
            input_dim <<= 6
            self.conv = DeepSpeech2Extractor(mask_conv=False)

        else:
            raise ValueError("Unsupported Extractor : {0}".format(extractor))

        self.encoder = SpeechTransformerEncoder(
            d_model=d_model,
            input_dim=input_dim,
            d_ff=d_ff,
            num_layers=num_encoder_layers,
            num_heads=num_heads,
            ffnet_style=ffnet_style,
            dropout_p=dropout_p,
            pad_id=pad_id,
        )

        if self.joint_ctc_attention:
            self.encoder_fc = nn.Sequential(
                nn.BatchNorm1d(d_model),
                Transpose(shape=(1, 2)),
                nn.Dropout(dropout_p),
                Linear(d_model, num_classes, bias=False),
            )

        self.decoder = SpeechTransformerDecoder(
            num_classes=num_classes,
            d_model=d_model,
            d_ff=d_ff,
            num_layers=num_decoder_layers,
            num_heads=num_heads,
            ffnet_style=ffnet_style,
            dropout_p=dropout_p,
            pad_id=pad_id,
            eos_id=eos_id,
        )
        self.decoder_fc = Linear(d_model, num_classes)
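
As a sanity check on the front-end bookkeeping, the helper below mirrors the dimension arithmetic in SpeechTransformer.__init__ for the default input_dim of 80. The channel counts noted in the comments (128 for VGG, 64 for DS2) are assumptions inferred from the shifts, not confirmed extractor internals:

    def encoder_input_dim(input_dim: int = 80, extractor: str = 'vgg') -> int:
        # mirrors the dimension arithmetic in SpeechTransformer.__init__
        if extractor == 'vgg':
            return (input_dim - 1) << 5 if input_dim % 2 else input_dim << 5
        if extractor == 'ds2':
            input_dim = (input_dim + 2 * 20 - 41) // 2 + 1  # conv 1: kernel 41, stride 2, padding 20
            input_dim = (input_dim + 2 * 10 - 21) // 2 + 1  # conv 2: kernel 21, stride 2, padding 10
            return input_dim << 6
        raise ValueError("Unsupported Extractor : {0}".format(extractor))

    assert encoder_input_dim(80, 'vgg') == 2560  # presumably (80 / 4) * 128 channels
    assert encoder_input_dim(80, 'ds2') == 1280  # 20 frequency bins x 64 channels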