Example #1
    def __init__(self,
                 encoder: tf.keras.Model,
                 num_classes: int,
                 speech_config,
                 name="ctc_model",

                 **kwargs):
        super(CtcModel, self).__init__(name=name, **kwargs)
        self.encoder = encoder
        self.speech_config = speech_config
        self.mel_layer = None
        if speech_config['use_mel_layer']:
            if speech_config['mel_layer_type'] == 'Melspectrogram':
                self.mel_layer = Melspectrogram(
                    sr=speech_config['sample_rate'],
                    n_mels=speech_config['num_feature_bins'],
                    n_hop=int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000),
                    n_dft=1024,
                    trainable_fb=speech_config['trainable_kernel'])
            else:
                self.mel_layer = Spectrogram(
                    n_hop=int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000),
                    n_dft=1024,
                    trainable_kernel=speech_config['trainable_kernel'])
            self.mel_layer.trainable = speech_config['trainable_kernel']
        self.wav_info = speech_config['add_wav_info']
        if self.wav_info:
            assert speech_config['use_mel_layer'], 'should set use_mel_layer to True'

        # Fully connected projection onto the CTC output classes
        self.fc = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=num_classes, activation="linear", use_bias=True),
            name="fully_connected")
        self.recognize_pb = None
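
The constructor above reads all of its feature-extraction settings from speech_config. Below is a minimal sketch of such a dictionary, with hypothetical values chosen only for illustration, together with the hop length it implies:

# Hypothetical speech_config covering the keys read by the constructors in these examples.
speech_config = {
    'use_mel_layer': True,               # feed raw audio through an in-graph feature layer
    'mel_layer_type': 'Melspectrogram',  # or 'Spectrogram'
    'sample_rate': 16000,                # Hz
    'num_feature_bins': 80,              # n_mels
    'stride_ms': 10,                     # frame hop in milliseconds
    'trainable_kernel': False,           # whether the filterbank kernels are trainable
    'add_wav_info': False,               # only valid together with use_mel_layer=True
}

# Hop length in samples, as computed inside the constructors:
n_hop = int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000)  # 10 * 16000 // 1000 = 160

# Typical usage (encoder and num_classes are placeholders):
# model = CtcModel(encoder=encoder, num_classes=num_classes, speech_config=speech_config)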
Example #2
    def __init__(self,
                 encoder,
                 config,
                 training,
                 enable_tflite_convertible=False,
                 speech_config=dict,
                 **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder_cell = DecoderCell(
            config, training=training, name="decoder_cell",
            enable_tflite_convertible=enable_tflite_convertible
        )
        self.decoder = LASDecoder(
            self.decoder_cell,
            TrainingSampler(config) if training is True else TestingSampler(config),
            enable_tflite_convertible=enable_tflite_convertible
        )
        self.config = config
        self.speech_config = speech_config
        self.mel_layer = None
        if speech_config['use_mel_layer']:
            if speech_config['mel_layer_type'] == 'Melspectrogram':
                self.mel_layer = Melspectrogram(sr=speech_config['sample_rate'],
                                                n_mels=speech_config['num_feature_bins'],
                                                n_hop=int(speech_config['stride_ms'] * speech_config['sample_rate']//1000),
                                                n_dft=1024,
                                                trainable_fb=speech_config['trainable_kernel']
                                                )
            else:
                self.mel_layer = Spectrogram(
                                             n_hop=int(speech_config['stride_ms'] * speech_config['sample_rate']//1000),
                                             n_dft=1024,
                                             trainable_kernel=speech_config['trainable_kernel']
                                             )

        self.use_window_mask = False
        self.maximum_iterations = 1000 if training else 50
        self.enable_tflite_convertible = enable_tflite_convertible
Example #3
    def __init__(self,
                 encoder1,
                 encoder2,
                 encoder3,
                 classes1,
                 classes2,
                 classes3,
                 dmodel,
                 speech_config=dict,
                 **kwargs):
        super().__init__(**kwargs)
        self.encoder1 = encoder1
        self.encoder2 = encoder2
        self.encoder3 = encoder3
        self.speech_config = speech_config
        self.mel_layer = None
        if speech_config['use_mel_layer']:
            if speech_config['mel_layer_type'] == 'Melspectrogram':
                self.mel_layer = Melspectrogram(
                    sr=speech_config['sample_rate'],
                    n_mels=speech_config['num_feature_bins'],
                    n_hop=int(speech_config['stride_ms'] *
                              speech_config['sample_rate'] // 1000),
                    n_dft=1024,
                    trainable_fb=speech_config['trainable_kernel'])
            else:
                self.mel_layer = Spectrogram(
                    n_hop=int(speech_config['stride_ms'] *
                              speech_config['sample_rate'] // 1000),
                    n_dft=1024,
                    trainable_kernel=speech_config['trainable_kernel'])
            self.mel_layer.trainable = speech_config['trainable_kernel']
        self.wav_info = speech_config['add_wav_info']
        if self.wav_info:
            assert speech_config['use_mel_layer'], 'should set use_mel_layer to True'
        self.fc1 = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=classes1, activation="linear", use_bias=True),
            name="fully_connected1")

        self.fc2 = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=classes2, activation="linear", use_bias=True),
            name="fully_connected2")

        self.fc3 = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=classes3, activation="linear", use_bias=True),
            name="fully_connected3")

        self.fc_to_project_1 = tf.keras.layers.Dense(
            dmodel, name='word_prob_projector')
        self.fc_to_project_2 = tf.keras.layers.Dense(
            dmodel, name='phone_prob_projector')
        self.fc_to_project_3 = tf.keras.layers.Dense(dmodel,
                                                     name='py_prob_projector')
        self.fc_final_class = tf.keras.layers.Conv1D(classes3,
                                                     32,
                                                     padding='same',
                                                     name="cnn_final_class")
Example #4
    def __init__(self,
                 encoder: tf.keras.Model,
                 vocabulary_size: int,
                 embed_dim: int = 512,
                 embed_dropout: float = 0,
                 num_lstms: int = 1,
                 lstm_units: int = 320,
                 joint_dim: int = 1024,
                 name="transducer",
                 speech_config=dict,
                 **kwargs):
        super(Transducer, self).__init__(name=name, **kwargs)
        self.encoder = encoder
        self.predict_net = TransducerPrediction(
            vocabulary_size=vocabulary_size,
            embed_dim=embed_dim,
            embed_dropout=embed_dropout,
            num_lstms=num_lstms,
            lstm_units=lstm_units,
            name=f"{name}_prediction")
        self.joint_net = TransducerJoint(vocabulary_size=vocabulary_size,
                                         joint_dim=joint_dim,
                                         name=f"{name}_joint")
        self.speech_config = speech_config
        self.mel_layer = None
        if speech_config['use_mel_layer']:
            if speech_config['mel_layer_type'] == 'Melspectrogram':
                self.mel_layer = Melspectrogram(
                    sr=speech_config['sample_rate'],
                    n_mels=speech_config['num_feature_bins'],
                    n_hop=int(speech_config['stride_ms'] *
                              speech_config['sample_rate'] // 1000),
                    n_dft=1024,
                    trainable_fb=speech_config['trainable_kernel'])
            else:
                self.mel_layer = Spectrogram(
                    n_hop=int(speech_config['stride_ms'] *
                              speech_config['sample_rate'] // 1000),
                    n_dft=1024,
                    trainable_kernel=speech_config['trainable_kernel'])
            self.mel_layer.trainable = speech_config['trainable_kernel']

        self.ctc_classes = tf.keras.layers.Dense(vocabulary_size,
                                                 name='ctc_classes')
        self.wav_info = speech_config['add_wav_info']
        if self.wav_info:
            assert speech_config['use_mel_layer'], 'should set use_mel_layer to True'
        self.kept_decode = None
        self.startid = 0
        self.endid = 1
        self.max_iter = 10
Example #5
    def __init__(self,
                 encoder: tf.keras.Model,
                 vocabulary_size: int,
                 embed_dim: int = 512,
                 embed_dropout: float = 0,
                 num_lstms: int = 1,
                 lstm_units: int = 320,
                 joint_dim: int = 1024,
                 name="transducer", speech_config=dict,
                 **kwargs):
        super(Transducer, self).__init__(name=name, **kwargs)
        self.encoder = encoder
        self.num_lstms = num_lstms
        self.predict_net = TransducerPrediction(
            vocabulary_size=vocabulary_size,
            embed_dim=embed_dim,
            embed_dropout=embed_dropout,
            num_lstms=num_lstms,
            lstm_units=lstm_units,
            name=f"{name}_prediction"
        )
        self.joint_net = TransducerJoint(
            vocabulary_size=vocabulary_size,
            joint_dim=joint_dim,
            name=f"{name}_joint"
        )

        self.speech_config = speech_config
        self.mel_layer = None
        if speech_config['use_mel_layer']:
            if speech_config['mel_layer_type'] == 'Melspectrogram':
                self.mel_layer = Melspectrogram(sr=speech_config['sample_rate'],
                                                n_mels=speech_config['num_feature_bins'],
                                                n_hop=int(
                                                    speech_config['stride_ms'] * speech_config['sample_rate'] // 1000),
                                                n_dft=1024,
                                                trainable_fb=speech_config['trainable_kernel']
                                                )
            else:
                self.mel_layer = Spectrogram(
                    n_hop=int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000),
                    n_dft=1024,
                    trainable_kernel=speech_config['trainable_kernel']
                )
            self.mel_layer.trainable = speech_config['trainable_kernel']

        self.ctc_classes = tf.keras.layers.Dense(vocabulary_size, name='ctc_classes')

        self.wav_info = speech_config['add_wav_info']
        if self.wav_info:
            assert speech_config['use_mel_layer'], 'should set use_mel_layer to True'

        self.dmodel = encoder.dmodel

        self.chunk_size = int(self.speech_config['sample_rate'] * self.speech_config['streaming_bucket'])
        self.decode_layer = ConformerBlock(self.dmodel, self.encoder.dropout, self.encoder.fc_factor,
                                           self.encoder.head_size,
                                           self.encoder.num_heads, name='decode_conformer_block')
        self.recognize_pb = None
        self.encoder.add_chunk_size(
            self.chunk_size,
            speech_config['num_feature_bins'],
            int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000))
        self.streaming = self.speech_config['streaming']
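
Example #5 additionally derives a fixed streaming chunk size, in audio samples, from the config so the encoder can consume fixed-length buckets. A short sketch of that arithmetic, using hypothetical values for sample_rate and streaming_bucket:

# Hypothetical streaming settings, for illustration only.
sample_rate = 16000       # Hz
streaming_bucket = 0.5    # seconds of audio per streaming chunk
chunk_size = int(sample_rate * streaming_bucket)  # 8000 samples, i.e. 0.5 s of audio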
Example #6
    def __init__(self,
                 encoder,
                 classes1,
                 classes2,
                 classes3,
                 config,
                 training,
                 enable_tflite_convertible=False,
                 speech_config=dict,
                 **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.speech_config = speech_config
        self.mel_layer = None
        if speech_config['use_mel_layer']:
            if speech_config['mel_layer_type'] == 'Melspectrogram':
                self.mel_layer = Melspectrogram(
                    sr=speech_config['sample_rate'],
                    n_mels=speech_config['num_feature_bins'],
                    n_hop=int(speech_config['stride_ms'] *
                              speech_config['sample_rate'] // 1000),
                    n_dft=1024,
                    trainable_fb=speech_config['trainable_kernel'])
            else:
                self.mel_layer = Spectrogram(
                    n_hop=int(speech_config['stride_ms'] *
                              speech_config['sample_rate'] // 1000),
                    n_dft=1024,
                    trainable_kernel=speech_config['trainable_kernel'])
        self.fc1 = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=classes1, activation="linear", use_bias=True),
            name="fully_connected1")

        self.fc2 = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=classes2, activation="linear", use_bias=True),
            name="fully_connected2")

        self.fc3 = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=classes3, activation="linear", use_bias=True),
            name="fully_connected3")
        self.fc_final = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(units=config.n_classes,
                                  activation="linear",
                                  use_bias=True),
            name="fully_connected4")
        self.decoder_cell = DecoderCell(
            config,
            training=training,
            name="decoder_cell",
            enable_tflite_convertible=enable_tflite_convertible)
        self.decoder = LASDecoder(
            self.decoder_cell,
            TrainingSampler(config)
            if training is True else TestingSampler(config),
            enable_tflite_convertible=enable_tflite_convertible)
        self.decoder_project = tf.keras.layers.Dense(config.decoder_lstm_units)
        self.token_project = tf.keras.Sequential([
            ConformerBlock(config.decoder_lstm_units,
                           dropout=config.dropout,
                           fc_factor=config.fc_factor,
                           head_size=config.head_size,
                           num_heads=config.num_heads,
                           kernel_size=config.kernel_size,
                           name='block%d' % i)
            for i in range(config.n_lstm_decoder + 1)
        ])
        self.config = config
        self.use_window_mask = False
        self.maximum_iterations = 1000 if training else 50
        self.enable_tflite_convertible = enable_tflite_convertible