def __init__(self, encoder: tf.keras.Model, num_classes: int, speech_config, name="ctc_model", **kwargs):
    """CTC model: encoder backbone + time-distributed projection to class logits.

    Args:
        encoder: Keras model producing per-frame feature vectors.
        num_classes: size of the CTC output vocabulary (logit dimension).
        speech_config: dict of feature-extraction options; keys read here:
            'use_mel_layer', 'mel_layer_type', 'sample_rate',
            'num_feature_bins', 'stride_ms', 'trainable_kernel',
            'add_wav_info'.
        name: Keras model name.
    """
    super(CtcModel, self).__init__(name=name, **kwargs)
    self.encoder = encoder
    self.speech_config = speech_config

    # Optional in-graph feature extraction from raw waveform.
    self.mel_layer = None
    if speech_config['use_mel_layer']:
        # Hop length in samples, derived once from stride (ms) and sample rate.
        hop = int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000)
        if speech_config['mel_layer_type'] == 'Melspectrogram':
            self.mel_layer = Melspectrogram(
                sr=speech_config['sample_rate'],
                n_mels=speech_config['num_feature_bins'],
                n_hop=hop,
                n_dft=1024,
                trainable_fb=speech_config['trainable_kernel'],
            )
        else:
            self.mel_layer = Spectrogram(
                n_hop=hop,
                n_dft=1024,
                trainable_kernel=speech_config['trainable_kernel'],
            )
        self.mel_layer.trainable = speech_config['trainable_kernel']

    self.wav_info = speech_config['add_wav_info']
    if self.wav_info:
        # add_wav_info only makes sense with the in-graph feature layer.
        # Fixed typo in the original message ("shold ... is True").
        assert speech_config['use_mel_layer'], 'should set use_mel_layer to True'

    # Fully connected projection applied per time step to get class logits.
    self.fc = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=num_classes, activation="linear", use_bias=True),
        name="fully_connected",
    )
    self.recognize_pb = None
def __init__(self, encoder, config, training, enable_tflite_convertible=False, speech_config=None, **kwargs):
    """Encoder + LAS attention decoder.

    Args:
        encoder: backbone model producing encoder features.
        config: decoder configuration object.
        training: True selects the teacher-forcing sampler, False the test-time sampler.
        enable_tflite_convertible: build decoder ops compatible with TFLite conversion.
        speech_config: dict of feature-extraction options. The original default
            was the `dict` *type object* (a bug); it now defaults to an empty dict.
    """
    # Fix: original called super().__init__(self, **kwargs), passing `self`
    # a second time to the already-bound initializer.
    super().__init__(**kwargs)
    self.encoder = encoder
    self.decoder_cell = DecoderCell(
        config,
        training=training,
        name="decoder_cell",
        enable_tflite_convertible=enable_tflite_convertible,
    )
    self.decoder = LASDecoder(
        self.decoder_cell,
        TrainingSampler(config) if training is True else TestingSampler(config),
        enable_tflite_convertible=enable_tflite_convertible,
    )
    self.config = config
    speech_config = speech_config or {}
    self.speech_config = speech_config

    # Optional in-graph feature extraction from raw waveform.
    self.mel_layer = None
    if speech_config.get('use_mel_layer'):
        # Hop length in samples, derived once from stride (ms) and sample rate.
        hop = int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000)
        if speech_config['mel_layer_type'] == 'Melspectrogram':
            self.mel_layer = Melspectrogram(
                sr=speech_config['sample_rate'],
                n_mels=speech_config['num_feature_bins'],
                n_hop=hop,
                n_dft=1024,
                trainable_fb=speech_config['trainable_kernel'],
            )
        else:
            self.mel_layer = Spectrogram(
                n_hop=hop,
                n_dft=1024,
                trainable_kernel=speech_config['trainable_kernel'],
            )

    self.use_window_mask = False
    # Cap decoding length: generous while training, short for inference.
    self.maximum_iterations = 1000 if training else 50
    self.enable_tflite_convertible = enable_tflite_convertible
def __init__(self, encoder1, encoder2, encoder3, classes1, classes2, classes3, dmodel, speech_config=None, **kwargs):
    """Multi-task model: three encoders with per-task dense heads plus
    projection layers back into a shared `dmodel`-sized space and a final
    Conv1D classifier.

    Args:
        encoder1/encoder2/encoder3: backbone models for the three tasks.
        classes1/classes2/classes3: output vocabulary sizes of the three heads.
        dmodel: shared projection width for the per-head probability projectors.
        speech_config: dict of feature-extraction options. The original default
            was the `dict` *type object* (a bug); it now defaults to an empty dict.
    """
    # Fix: original called super().__init__(self, **kwargs), passing `self`
    # a second time to the already-bound initializer.
    super().__init__(**kwargs)
    self.encoder1 = encoder1
    self.encoder2 = encoder2
    self.encoder3 = encoder3
    speech_config = speech_config or {}
    self.speech_config = speech_config

    # Optional in-graph feature extraction from raw waveform.
    self.mel_layer = None
    if speech_config.get('use_mel_layer'):
        # Hop length in samples, derived once from stride (ms) and sample rate.
        hop = int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000)
        if speech_config['mel_layer_type'] == 'Melspectrogram':
            self.mel_layer = Melspectrogram(
                sr=speech_config['sample_rate'],
                n_mels=speech_config['num_feature_bins'],
                n_hop=hop,
                n_dft=1024,
                trainable_fb=speech_config['trainable_kernel'],
            )
        else:
            self.mel_layer = Spectrogram(
                n_hop=hop,
                n_dft=1024,
                trainable_kernel=speech_config['trainable_kernel'],
            )
        self.mel_layer.trainable = speech_config['trainable_kernel']

    self.wav_info = speech_config.get('add_wav_info', False)
    if self.wav_info:
        # add_wav_info only makes sense with the in-graph feature layer.
        # Fixed typo in the original message ("shold ... is True").
        assert speech_config.get('use_mel_layer'), 'should set use_mel_layer to True'

    # One time-distributed linear head per task.
    self.fc1 = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=classes1, activation="linear", use_bias=True),
        name="fully_connected1",
    )
    self.fc2 = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=classes2, activation="linear", use_bias=True),
        name="fully_connected2",
    )
    self.fc3 = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=classes3, activation="linear", use_bias=True),
        name="fully_connected3",
    )
    # Project each head's probabilities back into the shared dmodel space.
    self.fc_to_project_1 = tf.keras.layers.Dense(dmodel, name='word_prob_projector')
    self.fc_to_project_2 = tf.keras.layers.Dense(dmodel, name='phone_prob_projector')
    self.fc_to_project_3 = tf.keras.layers.Dense(dmodel, name='py_prob_projector')
    # Final classifier fusing the projected streams.
    self.fc_final_class = tf.keras.layers.Conv1D(classes3, 32, padding='same', name="cnn_final_class")
def __init__(self, encoder: tf.keras.Model, vocabulary_size: int, embed_dim: int = 512, embed_dropout: float = 0, num_lstms: int = 1, lstm_units: int = 320, joint_dim: int = 1024, name="transducer", speech_config=None, **kwargs):
    """RNN-Transducer: encoder + prediction network + joint network, with an
    auxiliary dense CTC head on the encoder output.

    Args:
        encoder: Keras model producing per-frame encoder features.
        vocabulary_size: output vocabulary size (prediction/joint/CTC heads).
        embed_dim: prediction-net embedding dimension.
        embed_dropout: dropout applied to the prediction-net embedding.
        num_lstms: number of LSTM layers in the prediction net.
        lstm_units: units per prediction-net LSTM.
        joint_dim: hidden width of the joint network.
        name: Keras model name.
        speech_config: dict of feature-extraction options. The original default
            was the `dict` *type object* (a bug); it now defaults to an empty dict.
    """
    super(Transducer, self).__init__(name=name, **kwargs)
    self.encoder = encoder
    self.predict_net = TransducerPrediction(
        vocabulary_size=vocabulary_size,
        embed_dim=embed_dim,
        embed_dropout=embed_dropout,
        num_lstms=num_lstms,
        lstm_units=lstm_units,
        name=f"{name}_prediction",
    )
    self.joint_net = TransducerJoint(
        vocabulary_size=vocabulary_size,
        joint_dim=joint_dim,
        name=f"{name}_joint",
    )
    speech_config = speech_config or {}
    self.speech_config = speech_config

    # Optional in-graph feature extraction from raw waveform.
    self.mel_layer = None
    if speech_config.get('use_mel_layer'):
        # Hop length in samples, derived once from stride (ms) and sample rate.
        hop = int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000)
        if speech_config['mel_layer_type'] == 'Melspectrogram':
            self.mel_layer = Melspectrogram(
                sr=speech_config['sample_rate'],
                n_mels=speech_config['num_feature_bins'],
                n_hop=hop,
                n_dft=1024,
                trainable_fb=speech_config['trainable_kernel'],
            )
        else:
            self.mel_layer = Spectrogram(
                n_hop=hop,
                n_dft=1024,
                trainable_kernel=speech_config['trainable_kernel'],
            )
        self.mel_layer.trainable = speech_config['trainable_kernel']

    # Auxiliary CTC head over encoder features.
    self.ctc_classes = tf.keras.layers.Dense(vocabulary_size, name='ctc_classes')

    self.wav_info = speech_config.get('add_wav_info', False)
    if self.wav_info:
        # add_wav_info only makes sense with the in-graph feature layer.
        # Fixed typo in the original message ("shold ... is True").
        assert speech_config.get('use_mel_layer'), 'should set use_mel_layer to True'

    # Greedy-decoding state and token conventions.
    self.kept_decode = None
    self.startid = 0
    self.endid = 1
    self.max_iter = 10
def __init__(self, encoder: tf.keras.Model, vocabulary_size: int, embed_dim: int = 512, embed_dropout: float = 0, num_lstms: int = 1, lstm_units: int = 320, joint_dim: int = 1024, name="transducer", speech_config=None, **kwargs):
    """Streaming RNN-Transducer: encoder + prediction net + joint net, with an
    auxiliary CTC head, a Conformer decode block, and chunked (streaming) input.

    Args:
        encoder: Keras model producing per-frame encoder features; must expose
            `dmodel`, `dropout`, `fc_factor`, `head_size`, `num_heads` and
            `add_chunk_size(...)`.
        vocabulary_size: output vocabulary size (prediction/joint/CTC heads).
        embed_dim: prediction-net embedding dimension.
        embed_dropout: dropout applied to the prediction-net embedding.
        num_lstms: number of LSTM layers in the prediction net.
        lstm_units: units per prediction-net LSTM.
        joint_dim: hidden width of the joint network.
        name: Keras model name.
        speech_config: dict of feature-extraction/streaming options. The
            original default was the `dict` *type object* (a bug); it now
            defaults to an empty dict. Keys 'sample_rate', 'stride_ms',
            'num_feature_bins', 'streaming_bucket' and 'streaming' are required.
    """
    super(Transducer, self).__init__(name=name, **kwargs)
    self.encoder = encoder
    self.num_lstms = num_lstms
    self.predict_net = TransducerPrediction(
        vocabulary_size=vocabulary_size,
        embed_dim=embed_dim,
        embed_dropout=embed_dropout,
        num_lstms=num_lstms,
        lstm_units=lstm_units,
        name=f"{name}_prediction",
    )
    self.joint_net = TransducerJoint(
        vocabulary_size=vocabulary_size,
        joint_dim=joint_dim,
        name=f"{name}_joint",
    )
    speech_config = speech_config or {}
    self.speech_config = speech_config

    # Hop length in samples, derived once from stride (ms) and sample rate.
    # Used by the feature layer AND by the encoder's chunking below (the
    # original recomputed it three times).
    hop = int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000)

    # Optional in-graph feature extraction from raw waveform.
    self.mel_layer = None
    if speech_config.get('use_mel_layer'):
        if speech_config['mel_layer_type'] == 'Melspectrogram':
            self.mel_layer = Melspectrogram(
                sr=speech_config['sample_rate'],
                n_mels=speech_config['num_feature_bins'],
                n_hop=hop,
                n_dft=1024,
                trainable_fb=speech_config['trainable_kernel'],
            )
        else:
            self.mel_layer = Spectrogram(
                n_hop=hop,
                n_dft=1024,
                trainable_kernel=speech_config['trainable_kernel'],
            )
        self.mel_layer.trainable = speech_config['trainable_kernel']

    # Auxiliary CTC head over encoder features.
    self.ctc_classes = tf.keras.layers.Dense(vocabulary_size, name='ctc_classes')

    self.wav_info = speech_config.get('add_wav_info', False)
    if self.wav_info:
        # add_wav_info only makes sense with the in-graph feature layer.
        # Fixed typo in the original message ("shold ... is True").
        assert speech_config.get('use_mel_layer'), 'should set use_mel_layer to True'

    # Streaming setup: chunk size in raw samples per streaming bucket.
    self.dmodel = encoder.dmodel
    self.chunk_size = int(speech_config['sample_rate'] * speech_config['streaming_bucket'])
    self.decode_layer = ConformerBlock(
        self.dmodel,
        self.encoder.dropout,
        self.encoder.fc_factor,
        self.encoder.head_size,
        self.encoder.num_heads,
        name='decode_conformer_block',
    )
    self.recognize_pb = None
    self.encoder.add_chunk_size(self.chunk_size, speech_config['num_feature_bins'], hop)
    self.streaming = speech_config['streaming']
def __init__(self, encoder, classes1, classes2, classes3, config, training, enable_tflite_convertible=False, speech_config=None, **kwargs):
    """Multi-task encoder with three time-distributed heads plus a LAS decoder,
    a final fused classifier head, and a Conformer-block token projector.

    Args:
        encoder: backbone model producing encoder features.
        classes1/classes2/classes3: output vocabulary sizes of the three heads.
        config: decoder configuration object (reads `n_classes`,
            `decoder_lstm_units`, `dropout`, `fc_factor`, `head_size`,
            `num_heads`, `kernel_size`, `n_lstm_decoder`).
        training: True selects the teacher-forcing sampler, False the test-time sampler.
        enable_tflite_convertible: build decoder ops compatible with TFLite conversion.
        speech_config: dict of feature-extraction options. The original default
            was the `dict` *type object* (a bug); it now defaults to an empty dict.
    """
    # Fix: original called super().__init__(self, **kwargs), passing `self`
    # a second time to the already-bound initializer.
    super().__init__(**kwargs)
    self.encoder = encoder
    speech_config = speech_config or {}
    self.speech_config = speech_config

    # Optional in-graph feature extraction from raw waveform.
    self.mel_layer = None
    if speech_config.get('use_mel_layer'):
        # Hop length in samples, derived once from stride (ms) and sample rate.
        hop = int(speech_config['stride_ms'] * speech_config['sample_rate'] // 1000)
        if speech_config['mel_layer_type'] == 'Melspectrogram':
            self.mel_layer = Melspectrogram(
                sr=speech_config['sample_rate'],
                n_mels=speech_config['num_feature_bins'],
                n_hop=hop,
                n_dft=1024,
                trainable_fb=speech_config['trainable_kernel'],
            )
        else:
            self.mel_layer = Spectrogram(
                n_hop=hop,
                n_dft=1024,
                trainable_kernel=speech_config['trainable_kernel'],
            )

    # One time-distributed linear head per task, plus a fused final head.
    self.fc1 = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=classes1, activation="linear", use_bias=True),
        name="fully_connected1",
    )
    self.fc2 = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=classes2, activation="linear", use_bias=True),
        name="fully_connected2",
    )
    self.fc3 = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=classes3, activation="linear", use_bias=True),
        name="fully_connected3",
    )
    self.fc_final = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=config.n_classes, activation="linear", use_bias=True),
        name="fully_connected4",
    )

    # LAS decoder stack.
    self.decoder_cell = DecoderCell(
        config,
        training=training,
        name="decoder_cell",
        enable_tflite_convertible=enable_tflite_convertible,
    )
    self.decoder = LASDecoder(
        self.decoder_cell,
        TrainingSampler(config) if training is True else TestingSampler(config),
        enable_tflite_convertible=enable_tflite_convertible,
    )
    self.decoder_project = tf.keras.layers.Dense(config.decoder_lstm_units)
    # Stack of Conformer blocks projecting token representations.
    self.token_project = tf.keras.Sequential([
        ConformerBlock(
            config.decoder_lstm_units,
            dropout=config.dropout,
            fc_factor=config.fc_factor,
            head_size=config.head_size,
            num_heads=config.num_heads,
            kernel_size=config.kernel_size,
            name='block%d' % i,
        )
        for i in range(config.n_lstm_decoder + 1)
    ])
    self.config = config
    self.use_window_mask = False
    # Cap decoding length: generous while training, short for inference.
    self.maximum_iterations = 1000 if training else 50
    self.enable_tflite_convertible = enable_tflite_convertible