# Shared imports for the models below (assumption: TensorFlow 2.x Keras and
# kapre 0.1.x, where Melspectrogram and Normalization2D are available).
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras import layers as L
from tensorflow.keras.layers import (Input, Reshape, Permute, Conv2D,
                                     BatchNormalization, Lambda, Bidirectional,
                                     LSTM, Dense, Dot, Softmax)
from tensorflow.keras.models import Model
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D


def AttRNNSpeechModel(nCategories, samplingrate=16000,
                      inputLength=16000, rnn_func=L.LSTM):
    # simple LSTM
    sr = samplingrate
    iLen = inputLength

    inputs = L.Input((inputLength,), name='input')

    x = L.Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, iLen),
                       padding='same', sr=sr, n_mels=80,
                       fmin=40.0, fmax=sr / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='mel_stft')
    m.trainable = False

    x = m(x)

    x = Normalization2D(int_axis=0, name='mel_stft_norm')(x)

    # note that Melspectrogram puts the sequence in shape
    # (batch_size, melDim, timeSteps, 1); we would rather have it
    # the other way around for LSTMs
    x = L.Permute((2, 1, 3))(x)

    x = L.Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)

    # x = Reshape((125, 80))(x)
    # keras.backend.squeeze(x, axis)
    x = L.Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = L.Bidirectional(rnn_func(64, return_sequences=True))(x)  # [b_s, seq_len, vec_dim]
    x = L.Bidirectional(rnn_func(64, return_sequences=True))(x)  # [b_s, seq_len, vec_dim]

    xFirst = L.Lambda(lambda q: q[:, -1])(x)  # [b_s, vec_dim]
    query = L.Dense(128)(xFirst)

    # dot product attention
    attScores = L.Dot(axes=[1, 2])([query, x])
    attScores = L.Softmax(name='attSoftmax')(attScores)  # [b_s, seq_len]

    # rescale sequence
    attVector = L.Dot(axes=[1, 1])([attScores, x])  # [b_s, vec_dim]

    x = L.Dense(64, activation='relu')(attVector)
    x = L.Dense(32)(x)

    output = L.Dense(nCategories, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])

    return model
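# Quick shape check for AttRNNSpeechModel -- a minimal sketch, assuming
# kapre 0.1.x and TensorFlow 2.x are installed. The helper name and the
# nCategories=36 value are illustrative, not part of the original code.
def _att_rnn_shape_check():
    import numpy as np
    model = AttRNNSpeechModel(nCategories=36)
    dummy = np.zeros((2, 16000), dtype='float32')  # two 1 s clips at 16 kHz
    probs = model.predict(dummy)
    assert probs.shape == (2, 36)  # one softmax distribution per clip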
def attention_speech_model(num_category, sampling_rate=16000, input_length=16000):
    inputs = layers.Input((input_length,), name='input')

    x = layers.Reshape((1, -1))(inputs)

    m = Melspectrogram(input_shape=(1, input_length), n_dft=1024, n_hop=128,
                       padding='same', sr=sampling_rate, n_mels=80,
                       fmin=40.0, fmax=sampling_rate / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='mel_stft')
    m.trainable = False

    x = m(x)
    x = Normalization2D(int_axis=0, name='norm')(x)
    x = layers.Permute((2, 1, 3))(x)

    # no built-in activation here: a LeakyReLU applied after a ReLU would be
    # the identity, so the convolution is left linear and LeakyReLU follows
    x = layers.Conv2D(10, (5, 1), padding='same')(x)
    x = layers.LeakyReLU()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Lambda(lambda t: K.squeeze(t, -1), name='squeeze_last_dim')(x)

    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)

    # this variant queries the attention with the middle timestep
    x_first = layers.Lambda(lambda t: t[:, t.shape[1] // 2])(x)
    query = layers.Dense(128)(x_first)

    attention_scores = layers.Dot(axes=[1, 2])([query, x])
    attention_scores = layers.Softmax(name='attention_softmax')(attention_scores)
    attention_vector = layers.Dot(axes=[1, 1])([attention_scores, x])

    x = layers.Dense(64)(attention_vector)
    x = layers.LeakyReLU()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(32)(x)
    x = layers.Dropout(0.5)(x)

    out = layers.Dense(num_category, activation='softmax', name='output')(x)

    model = Model(inputs=inputs, outputs=out)
    return model
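# Plain-NumPy sketch of the dot-product attention used by the models in this
# file, to make the Dot(axes=...) semantics concrete. All names and shapes
# here are illustrative; vec_dim=128 matches the bidirectional LSTM(64) output.
def _dot_attention_demo(b_s=1, seq_len=125, vec_dim=128):
    import numpy as np
    x = np.random.randn(b_s, seq_len, vec_dim)  # BiLSTM output [b_s, seq_len, vec_dim]
    query = np.random.randn(b_s, vec_dim)       # Dense(128) query [b_s, vec_dim]
    scores = np.einsum('bd,btd->bt', query, x)  # Dot(axes=[1, 2]) -> [b_s, seq_len]
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    weights = e / e.sum(axis=1, keepdims=True)  # Softmax over seq_len
    return np.einsum('bt,btd->bd', weights, x)  # Dot(axes=[1, 1]) -> [b_s, vec_dim]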
def Att_RNN_Speech(x_train, y_train, classes, sampling_rate=16000,
                   input_length=16000, batch_size=32, epochs=3):
    inputs = Input((input_length,))
    x = Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, input_length),
                       padding='same', sr=sampling_rate, n_mels=80,
                       fmin=40.0, fmax=sampling_rate / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False)
    m.trainable = False

    x = m(x)
    x = Normalization2D(int_axis=0)(x)
    x = Permute((2, 1, 3))(x)

    x = Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)

    x = Lambda(lambda q: K.squeeze(q, -1))(x)

    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)

    xFirst = Lambda(lambda q: q[:, -1])(x)
    query = Dense(128)(xFirst)

    attScores = Dot(axes=[1, 2])([query, x])
    attScores = Softmax()(attScores)
    attVector = Dot(axes=[1, 1])([attScores, x])

    x = Dense(64, activation='relu')(attVector)
    x = Dense(32)(x)
    output = Dense(classes, activation='softmax')(x)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])
    model.summary()

    # fit expects the labels as the second positional argument, not as
    # validation_data
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
              use_multiprocessing=False, workers=4, verbose=2)
    model.save('Att_RNN_Speech.model')
    return model
def Build_MelSpectrogram(Parametres_layer, input_length):
    mel_layer = Melspectrogram(n_dft=Parametres_layer["n_dft"],
                               n_hop=Parametres_layer["n_hop"],
                               input_shape=(1, input_length),
                               padding=Parametres_layer["padding"],
                               sr=Parametres_layer["sr"],
                               n_mels=Parametres_layer["n_mels"],
                               fmin=40.0,
                               fmax=Parametres_layer["sr"] / 2,
                               power_melgram=1.0,
                               return_decibel_melgram=True,
                               trainable_fb=False,
                               trainable_kernel=False,
                               name='mel_stft')
    mel_layer.trainable = False
    return mel_layer
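# Example parameter dictionary for Build_MelSpectrogram. The keys are the ones
# the function reads; the values are illustrative and mirror the settings
# hard-coded in the other models in this file.
EXAMPLE_MEL_PARAMS = {
    'n_dft': 1024,
    'n_hop': 128,
    'padding': 'same',
    'sr': 16000,
    'n_mels': 80,
}
# mel_layer = Build_MelSpectrogram(EXAMPLE_MEL_PARAMS, input_length=16000)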
def attRNN():
    sr = 8000
    inputs = Input((8000, 1), name='input')

    x = Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, 8000),
                       padding='same', sr=sr, n_mels=80,
                       fmin=40.0, fmax=sr / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='mel_stft')
    m.trainable = False

    x = m(x)
    x = Normalization2D(int_axis=0, name='mel_stft_norm')(x)

    # note that Melspectrogram puts the sequence in shape
    # (batch_size, melDim, timeSteps, 1); we would rather have it
    # the other way around for LSTMs
    x = Permute((2, 1, 3))(x)

    x = Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)

    # x = Reshape((125, 80))(x)
    # keras.backend.squeeze(x, axis)
    x = Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = Bidirectional(LSTM(64, return_sequences=True))(x)  # [b_s, seq_len, vec_dim]
    x = Bidirectional(LSTM(64, return_sequences=True))(x)  # [b_s, seq_len, vec_dim]

    xFirst = Lambda(lambda q: q[:, -1])(x)  # [b_s, vec_dim]
    query = Dense(128)(xFirst)

    # dot product attention
    attScores = Dot(axes=[1, 2])([query, x])
    attScores = Softmax(name='attSoftmax')(attScores)  # [b_s, seq_len]

    # rescale sequence
    attVector = Dot(axes=[1, 1])([attScores, x])  # [b_s, vec_dim]

    x = Dense(64, activation='relu')(attVector)
    x = Dense(32)(x)

    output = Dense(9, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])
    model.summary()
    return model
def RNN_model(N_CLASSES=2, SR=16000, DT=2.0):
    rnn_func = L.LSTM

    inputs = L.Input(shape=(1, int(SR * DT)), name='input')
    x = L.Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024, n_hop=128, padding='same', sr=SR,
                       n_mels=80, fmin=40, fmax=SR / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='mel_stft')
    m.trainable = False

    x = m(x)
    x = Normalization2D(int_axis=0, name='mel_stft_norm')(x)
    x = L.Permute((2, 1, 3))(x)

    x = L.Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)

    x = L.Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = L.Bidirectional(rnn_func(64, return_sequences=True))(x)
    x = L.Bidirectional(rnn_func(64, return_sequences=True))(x)

    xFirst = L.Lambda(lambda q: q[:, -1])(x)
    query = L.Dense(128)(xFirst)

    attScores = L.Dot(axes=[1, 2])([query, x])
    attScores = L.Softmax(name='attSoftmax')(attScores)
    attVector = L.Dot(axes=[1, 1])([attScores, x])

    x = L.Dense(64, activation='relu')(attVector)
    x = L.Dense(32)(x)
    output = L.Dense(N_CLASSES, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    # model.fit(x_train, y_train, batch_size=32, epochs=5,
    #           callbacks=[earlystopper, checkpointer, lrate])
    # from keras.models import load_model
    # model.save('test_model_RNN.h5')

    ##### OLD Version RNN Model #####
    # i = L.Input(shape=(1, int(SR*DT)), name='input')
    # x = Melspectrogram(n_dft=512, n_hop=160, padding='same', sr=SR, n_mels=128,
    #                    fmin=0.0, fmax=SR/2, power_melgram=1.0,
    #                    return_decibel_melgram=True, trainable_fb=False,
    #                    trainable_kernel=False, name='melbands')(i)
    # x = Normalization2D(str_axis='batch', name='batch_norm')(x)
    # x = L.Permute((2, 1, 3), name='permute')(x)
    # x = TimeDistributed(L.Reshape((-1,)), name='reshape')(x)
    # s = TimeDistributed(L.Dense(64, activation='tanh'), name='td_dense_tanh')(x)
    # x = L.Bidirectional(L.LSTM(32, return_sequences=True), name='bidirectional_lstm')(s)
    # x = L.concatenate([s, x], axis=2, name='skip_connection')
    # x = L.Dense(64, activation='relu', name='dense_1_relu')(x)
    # x = L.MaxPooling1D(name='max_pool_1d')(x)
    # x = L.Dense(32, activation='relu', name='dense_2_relu')(x)
    # x = L.Flatten(name='flatten')(x)
    # x = L.Dropout(rate=0.2, name='dropout')(x)
    # x = L.Dense(32, activation='relu', activity_regularizer=l2(0.001), name='dense_3_relu')(x)
    # o = L.Dense(N_CLASSES, activation='softmax', name='softmax')(x)
    # model = Model(inputs=i, outputs=o, name='long_short_term_memory')
    # model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model
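# Smoke test for RNN_model -- a sketch using random arrays in place of a real
# dataset. The shapes follow the (1, SR*DT) input the model declares, and the
# labels are one-hot to match its categorical_crossentropy loss.
if __name__ == '__main__':
    import numpy as np
    smoke_model = RNN_model(N_CLASSES=2, SR=16000, DT=2.0)
    x_dummy = np.random.randn(4, 1, 32000).astype('float32')  # four 2 s clips
    y_dummy = np.eye(2)[np.random.randint(0, 2, size=4)]      # one-hot labels
    smoke_model.fit(x_dummy, y_dummy, batch_size=2, epochs=1, verbose=2)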