Example #1
    def __init__(
        self,
        input_dim: int,
        extractor: str = 'vgg',
        d_model: Optional[int] = None,
        num_classes: Optional[int] = None,
        dropout_p: Optional[float] = None,
        activation: str = 'hardtanh',
        joint_ctc_attention: bool = False,
    ) -> None:
        super(BaseEncoder, self).__init__()
        if joint_ctc_attention:
            assert num_classes is not None, "If `joint_ctc_attention` is True, `num_classes` must not be None"
            assert dropout_p is not None, "If `joint_ctc_attention` is True, `dropout_p` must not be None"
            assert d_model is not None, "If `joint_ctc_attention` is True, `d_model` must not be None"

        if extractor is not None:
            extractor = self.supported_extractors[extractor.lower()]
            self.conv = extractor(input_dim=input_dim, activation=activation)
            self.conv_output_dim = self.conv.get_output_dim()
        else:
            self.conv = None
            self.conv_output_dim = input_dim  # no conv front-end: features pass through unchanged

        self.num_classes = num_classes
        self.joint_ctc_attention = joint_ctc_attention

        if self.joint_ctc_attention:
            # auxiliary CTC head: project encoder features to vocabulary logits
            self.fc = nn.Sequential(
                nn.BatchNorm1d(d_model),
                Transpose(shape=(1, 2)),
                nn.Dropout(dropout_p),
                Linear(d_model, num_classes, bias=False),
            )
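When joint_ctc_attention is enabled, fc turns encoder features into vocabulary logits for an auxiliary CTC loss. Below is a minimal shape check of that head, substituting plain torch.nn layers for this repo's Transpose and Linear wrappers (an assumption made purely for illustration):

    import torch
    import torch.nn as nn

    B, T, d_model, num_classes = 4, 50, 512, 100
    x = torch.randn(B, d_model, T)            # nn.BatchNorm1d normalizes (B, C, T) over C
    x = nn.BatchNorm1d(d_model)(x)
    x = x.transpose(1, 2)                     # -> (B, T, d_model), as Transpose(shape=(1, 2))
    logits = nn.Linear(d_model, num_classes, bias=False)(x)
    print(logits.shape)                       # torch.Size([4, 50, 100])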
Example #2
    def __init__(
            self,
            input_size: int,                    # size of input features
            num_classes: int,                   # number of classes
            hidden_dim: int = 512,              # dimension of RNN's hidden state
            device: str = 'cuda',               # device - 'cuda' or 'cpu'
            dropout_p: float = 0.3,             # dropout probability
            num_layers: int = 3,                # number of RNN layers
            bidirectional: bool = True,         # if True, becomes a bidirectional encoder
            rnn_type: str = 'lstm',             # type of RNN cell
            extractor: str = 'vgg',             # type of CNN extractor
            activation: str = 'hardtanh',       # type of activation function
            mask_conv: bool = False,            # whether to apply masked convolution
            joint_ctc_attention: bool = False,  # use CTC loss & cross-entropy joint learning
    ) -> None:
        self.mask_conv = mask_conv
        self.extractor = extractor.lower()
        self.joint_ctc_attention = joint_ctc_attention

        if self.extractor == 'vgg':
            # VGG front-end scales the feature dim by 32 overall (hence << 5)
            input_size = (input_size - 1) << 5 if input_size % 2 else input_size << 5
            super(Listener, self).__init__(input_size, hidden_dim, num_layers,
                                           rnn_type, dropout_p, bidirectional, device)
            self.conv = VGGExtractor(activation, mask_conv)

        elif self.extractor == 'ds2':
            # DS2 front-end: per-conv output size floor((n + 2 * pad - kernel) / stride) + 1,
            # then scaled by the channel count (<< 6)
            input_size = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)
            input_size = int(math.floor(input_size + 2 * 10 - 21) / 2 + 1)
            input_size <<= 6
            super(Listener, self).__init__(input_size, hidden_dim, num_layers,
                                           rnn_type, dropout_p, bidirectional, device)
            self.conv = DeepSpeech2Extractor(activation, mask_conv)

        else:
            raise ValueError("Unsupported extractor: {0}".format(extractor))

        if self.joint_ctc_attention:
            assert self.mask_conv, "joint_ctc_attention training requires mask_conv=True"
            # auxiliary CTC head over the (bidirectional) encoder outputs
            self.fc = nn.Sequential(
                nn.BatchNorm1d(self.hidden_dim << 1),
                Transpose(shape=(1, 2)),
                nn.Dropout(dropout_p),
                Linear(self.hidden_dim << 1, num_classes, bias=False),
            )
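The bit-shift formulas above precompute each extractor's output size. A worked check for input_size = 80 (the channel multipliers, x32 for VGG and x64 for DS2, are inferred from the shifts rather than stated in this snippet):

    import math

    input_size = 80
    # VGG: x32 on the feature dim
    vgg_dim = (input_size - 1) << 5 if input_size % 2 else input_size << 5
    print(vgg_dim)    # 2560

    # DS2: two stride-2 convs (kernel 41, pad 20; kernel 21, pad 10), then x64 channels
    ds2_dim = int(math.floor(input_size + 2 * 20 - 41) / 2 + 1)   # -> 40
    ds2_dim = int(math.floor(ds2_dim + 2 * 10 - 21) / 2 + 1)      # -> 20
    ds2_dim <<= 6
    print(ds2_dim)    # 1280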
Example #3
    def __init__(
        self,
        in_channels: int,
        kernel_size: int = 31,
        expansion_factor: int = 2,
        dropout_p: float = 0.1,
        device: torch.device = torch.device('cuda'),
    ) -> None:
        super(ConformerConvModule, self).__init__()
        assert (kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
        assert expansion_factor == 2, "Currently, only expansion_factor == 2 is supported"

        self.device = device
        self.sequential = nn.Sequential(
            LayerNorm(in_channels),
            Transpose(shape=(1, 2)),  # (batch, time, dim) -> (batch, dim, time) for Conv1d
            PointwiseConv1d(in_channels, in_channels * expansion_factor,
                            stride=1, padding=0, bias=True),
            GLU(dim=1),  # halves the channel dim back to in_channels
            DepthwiseConv1d(in_channels, in_channels, kernel_size,
                            stride=1, padding=(kernel_size - 1) // 2),
            nn.BatchNorm1d(in_channels),
            Swish(),
            PointwiseConv1d(in_channels, in_channels,
                            stride=1, padding=0, bias=True),
            nn.Dropout(p=dropout_p),
        )
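The expansion_factor == 2 restriction pairs with GLU(dim=1): the gated linear unit uses half the channels to gate the other half, so a 2x pointwise expansion followed by GLU lands back on in_channels. A standalone check with torch.nn.GLU:

    import torch
    import torch.nn as nn

    in_channels, time = 64, 100
    x = torch.randn(4, in_channels * 2, time)  # shape after the 2x pointwise expansion
    y = nn.GLU(dim=1)(x)                       # gating halves the channel dim
    print(y.shape)                             # torch.Size([4, 64, 100])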
Example #4
    def __init__(
            self,
            num_classes: int,                       # number of classes
            d_model: int = 512,                     # dimension of model
            input_dim: int = 80,                    # dimension of input
            pad_id: int = 0,                        # id of the <PAD> token
            sos_id: int = 1,                        # id of the <SOS> token
            eos_id: int = 2,                        # id of the <EOS> token
            d_ff: int = 2048,                       # dimension of feed-forward network
            num_heads: int = 8,                     # number of attention heads
            num_encoder_layers: int = 6,            # number of encoder layers
            num_decoder_layers: int = 6,            # number of decoder layers
            dropout_p: float = 0.3,                 # dropout probability
            ffnet_style: str = 'ff',                # feed-forward network style: 'ff' or 'conv'
            extractor: str = 'vgg',                 # CNN extractor [vgg, ds2]
            joint_ctc_attention: bool = False,      # whether to apply joint CTC-attention
            max_length: int = 400,                  # maximum allowed length of a processed sequence
    ) -> None:
        super(SpeechTransformer, self).__init__()

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.num_classes = num_classes
        self.extractor = extractor
        self.joint_ctc_attention = joint_ctc_attention
        self.sos_id = sos_id
        self.eos_id = eos_id
        self.pad_id = pad_id
        self.max_length = max_length

        if self.extractor == 'vgg':
            # VGG front-end scales the feature dim by 32 overall (hence << 5)
            input_dim = (input_dim - 1) << 5 if input_dim % 2 else input_dim << 5
            self.conv = VGGExtractor(mask_conv=False)

        elif self.extractor == 'ds2':
            # DS2 front-end: per-conv output size floor((n + 2 * pad - kernel) / stride) + 1,
            # then scaled by the channel count (<< 6)
            input_dim = int(math.floor(input_dim + 2 * 20 - 41) / 2 + 1)
            input_dim = int(math.floor(input_dim + 2 * 10 - 21) / 2 + 1)
            input_dim <<= 6
            self.conv = DeepSpeech2Extractor(mask_conv=False)

        else:
            raise ValueError("Unsupported extractor: {0}".format(extractor))

        self.encoder = SpeechTransformerEncoder(
            d_model=d_model,
            input_dim=input_dim,
            d_ff=d_ff,
            num_layers=num_encoder_layers,
            num_heads=num_heads,
            ffnet_style=ffnet_style,
            dropout_p=dropout_p,
            pad_id=pad_id,
        )

        if self.joint_ctc_attention:
            # auxiliary CTC head: project encoder outputs to vocabulary logits
            self.encoder_fc = nn.Sequential(
                nn.BatchNorm1d(d_model),
                Transpose(shape=(1, 2)),
                nn.Dropout(dropout_p),
                Linear(d_model, num_classes, bias=False),
            )

        self.decoder = SpeechTransformerDecoder(
            num_classes=num_classes,
            d_model=d_model,
            d_ff=d_ff,
            num_layers=num_decoder_layers,
            num_heads=num_heads,
            ffnet_style=ffnet_style,
            dropout_p=dropout_p,
            pad_id=pad_id,
            eos_id=eos_id,
        )
        self.decoder_fc = Linear(d_model, num_classes)
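The divisibility assert reflects how multi-head attention splits d_model evenly across heads; with the defaults above, each of the 8 heads attends over 64 dimensions:

    d_model, num_heads = 512, 8
    assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
    print(d_model // num_heads)   # 64 dimensions per attention head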