def model_mfcc_layer(x_train, num_labels):
    """Build a 2-D CNN classifier fed by an in-graph Melspectrogram layer.

    Despite the function name, the front end is a decibel-scaled
    ``Melspectrogram`` layer; no MFCC transform is applied here.

    Parameters
    ----------
    x_train : sequence
        Training examples. Only ``x_train[0].shape`` is read, to size the
        model input.
    num_labels : int
        Number of filters of the final 1x1 convolution. That layer uses a
        softmax activation when ``num_labels > 1`` and relu otherwise.

    Returns
    -------
    model : Model
        The assembled model. ``model.summary()`` is called as a side effect.
    """
    sample_rate = 22050

    model_input = Input(shape=x_train[0].shape)

    # NOTE: the layer keeps the name 'trainable_stft' although both trainable
    # flags are False; the name is left untouched because it is part of the
    # model definition.
    net = Melspectrogram(
        n_dft=512,
        n_hop=sample_rate // 128 + 1,
        padding='same',
        sr=sample_rate,
        n_mels=128,
        fmin=0.0,
        fmax=sample_rate / 2,
        power_melgram=2.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='trainable_stft',
    )(model_input)

    # Four identical stages that differ only in width, each halving the
    # spatial resolution with a 2x2 max-pool.
    for n_filters in (16, 32, 64, 128):
        net = Conv2D(filters=n_filters, kernel_size=filter_size,
                     padding='same')(net)
        net = BatchNormalization()(net)
        net = activation()(net)
        net = MaxPooling2D(pool_size=2)(net)

    # Last stage: same conv/BN/activation stack, but no max-pool.
    net = Conv2D(filters=256, kernel_size=filter_size, padding='same')(net)
    net = BatchNormalization()(net)
    net = activation()(net)

    # Average over the whole remaining extent of axes 1 and 2.
    pool_rows = int(net.get_shape()[1])
    pool_cols = int(net.get_shape()[2])
    net = AveragePooling2D(pool_size=(pool_rows, pool_cols))(net)

    if num_labels > 1:
        head_activation = 'softmax'
    else:
        head_activation = 'relu'
    net = Conv2D(filters=num_labels, kernel_size=1, padding='valid',
                 activation=head_activation)(net)

    model = Model(inputs=[model_input], outputs=[net])
    model.summary()
    return model
def build_model_vggish(classes, dropout_final=0.2, shape=(None, 320000),
                       sr=16000, rnn_type='gru', rnn_units=256,
                       focal_alpha=0.95, rnn_layers=1, rnn_dropout=0.2,
                       activation='elu', random_noise=0.2,
                       weights='soundnet'):
    """Build a CRNN: Melspectrogram front end, VGGish backbone, RNN head.

    Parameters
    ----------
    classes : sized collection
        Target classes; only ``len(classes)`` is used, as the number of
        outputs requested from ``rnn_classifier_branch``.
    dropout_final, rnn_type, rnn_units, rnn_layers, rnn_dropout
        Forwarded to ``rnn_classifier_branch``.
    shape : tuple
        Input shape including a leading entry that is dropped
        (``shape[1:]`` is what is passed to ``keras.Input``).
    sr : int
        Sample rate handed to the ``Melspectrogram`` layer.
    focal_alpha, activation
        Accepted for interface compatibility; not used in this function.
    random_noise : float
        If truthy, an ``AdditiveNoise`` layer with this power (and random
        gain) is inserted after the mel-spectrogram.
    weights : str or None
        Passed to ``VGGish`` as ``load_weights``. When not ``None`` every
        layer of the VGGish sub-model is frozen.

    Returns
    -------
    model : keras.Model
        The full model named ``'crnn'``; its summary is printed.
    vggish : model
        The VGGish sub-model embedded in ``model``.
    """
    inputs = keras.Input(shape=shape[1:])

    # Insert a leading axis of size 1 in front of the sample axis.
    waveform = keras.layers.Reshape(target_shape=(1, -1))(inputs)

    features = Melspectrogram(
        n_dft=512,
        n_hop=256,
        padding='same',
        sr=sr,
        n_mels=64,
        fmin=125,
        fmax=7500,
        power_melgram=1.0,
        return_decibel_melgram=True,
        name='trainable_stft',
    )(waveform)

    if random_noise:
        features = AdditiveNoise(power=random_noise,
                                 random_gain=True)(features)

    features = Normalization2D(str_axis='freq')(features)

    # Swap axes 1 and 2 before handing the tensor to VGGish.
    features = Lambda(
        lambda t: K.permute_dimensions(x=t, pattern=(0, 2, 1, 3)),
        name="transpose",
    )(features)

    backbone_input_shape = features.get_shape().as_list()[1:]
    vggish = VGGish(include_top=False, load_weights=weights,
                    input_shape=backbone_input_shape, pooling=None)

    # Only freeze the backbone when it carries pretrained weights.
    if weights is not None:
        for backbone_layer in vggish.layers:
            backbone_layer.trainable = False

    features = vggish(features)
    features = keras.layers.AveragePooling2D(pool_size=(1, 4))(features)
    sequence = keras.layers.Reshape(target_shape=(-1, 512))(features)

    outputs = rnn_classifier_branch(
        sequence,
        name='rnn',
        dropout=rnn_dropout,
        dropout_final=dropout_final,
        rnn_units=rnn_units,
        rnn_type=rnn_type,
        n_classes=len(classes),
        rnn_layers=rnn_layers,
    )

    model = keras.Model(inputs=inputs, outputs=outputs, name='crnn')
    model.summary()
    return model, vggish
def _construct_sparsified_audio_network(**kwargs):
    """
    Construct the audio network with a Melspectrogram input
    (256 frequency bins). No weights are loaded here.

    Any keyword arguments are accepted and ignored.

    Returns
    -------
    model : keras.models.Model
        Model mapping a ``(1, 48000)`` float32 input to the flattened,
        max-pooled output of the ``'audio_embedding_layer'`` convolution.
    """
    l2_strength = 1e-5
    sample_rate = 48000
    window_seconds = 1

    def conv3x3(tensor, n_filters, **layer_kwargs):
        # 3x3 'same' convolution, he_normal init, fresh L2 regulariser.
        return Conv2D(n_filters, (3, 3), padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=regularizers.l2(l2_strength),
                      **layer_kwargs)(tensor)

    def bn_relu(tensor):
        # Batch normalisation followed by ReLU.
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    # INPUT
    x_a = Input(shape=(1, sample_rate * window_seconds), dtype='float32')

    # MELSPECTROGRAM PREPROCESSING
    y_a = Melspectrogram(n_dft=2048, n_hop=242, n_mels=256,
                         sr=sample_rate, power_melgram=1.0, htk=True,
                         return_decibel_melgram=True,
                         padding='same')(x_a)
    y_a = BatchNormalization()(y_a)

    # CONV BLOCKS 1-3: two conv/BN/ReLU units, then a 2x2 max-pool.
    for n_filters in (64, 128, 256):
        y_a = bn_relu(conv3x3(y_a, n_filters))
        y_a = bn_relu(conv3x3(y_a, n_filters))
        y_a = MaxPooling2D(pool_size=(2, 2), strides=2)(y_a)

    # CONV BLOCK 4: the second convolution is the named embedding layer and
    # is not followed by BN/ReLU.
    y_a = bn_relu(conv3x3(y_a, 512))
    y_a = conv3x3(y_a, 512, name='audio_embedding_layer')

    # Pool over the full extent of axes 1 and 2 (the original comment gives
    # (32, 24) for this configuration).
    full_extent = tuple(y_a.get_shape().as_list()[1:3])
    y_a = MaxPooling2D(pool_size=full_extent)(y_a)
    y_a = Flatten()(y_a)

    return Model(inputs=x_a, outputs=y_a)
def _construct_ust_specialized_audio_network(emb_dim=128, **kwargs):
    """
    Construct the UST specialized audio network with a Melspectrogram
    input (64 frequency bins). No weights are loaded here.

    Parameters
    ----------
    emb_dim : int
        Selects how much each conv block is narrowed. Must be one of
        512, 256, 128 or 64; any other value raises ``KeyError``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    model : keras.models.Model
        Model mapping a ``(1, 8000)`` float32 input to the flattened,
        max-pooled output of the last conv block.
    """
    l2_strength = 1e-5
    fft_size = 1024      # original L3 has 2048
    mel_bands = 64       # original L3 has 256
    hop_length = 160     # original L3 has 242
    sample_rate = 8000   # original L3 has 48000
    window_seconds = 1

    # Per-block divisors applied to the base filter counts
    # (64, 128, 256, 512), keyed by the requested emb_dim.
    filter_divisors = {
        512: [1, 1, 1, 1],
        256: [2, 2, 2, 2],
        128: [2, 2, 2, 4],
        64: [2, 2, 2, 8],
    }

    def conv3x3(tensor, n_filters, **layer_kwargs):
        # 3x3 'same' convolution, he_normal init, fresh L2 regulariser.
        return Conv2D(n_filters, (3, 3), padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=regularizers.l2(l2_strength),
                      **layer_kwargs)(tensor)

    def bn_relu(tensor):
        # Batch normalisation followed by ReLU.
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    # INPUT
    x_a = Input(shape=(1, sample_rate * window_seconds), dtype='float32')

    # MELSPECTROGRAM PREPROCESSING
    y_a = Melspectrogram(n_dft=fft_size, n_hop=hop_length, n_mels=mel_bands,
                         sr=sample_rate, power_melgram=1.0, htk=True,
                         return_decibel_melgram=True,
                         padding='same')(x_a)
    y_a = BatchNormalization()(y_a)

    # Looked up only now so an unsupported emb_dim fails at the same point
    # as before (after the front-end layers have been created).
    divisors = filter_divisors[emb_dim]

    # CONV BLOCKS 1-3: two conv/BN/ReLU units, then a 2x2 max-pool.
    for base_filters, divisor in zip((64, 128, 256), divisors):
        n_filters = base_filters // divisor
        y_a = bn_relu(conv3x3(y_a, n_filters))
        y_a = bn_relu(conv3x3(y_a, n_filters))
        y_a = MaxPooling2D(pool_size=(2, 2), strides=2)(y_a)

    # CONV BLOCK 4: the second convolution is the named embedding layer;
    # here it is followed by BN/ReLU before pooling.
    n_filters = 512 // divisors[3]
    y_a = bn_relu(conv3x3(y_a, n_filters))
    y_a = bn_relu(conv3x3(y_a, n_filters, name='audio_embedding_layer'))

    # Pool over the full extent of axes 1 and 2.
    full_extent = tuple(y_a.get_shape().as_list()[1:3])
    y_a = MaxPooling2D(pool_size=full_extent)(y_a)
    y_a = Flatten()(y_a)

    return Model(inputs=x_a, outputs=y_a)