Esempio n. 1
0
def model_mfcc_layer(x_train, num_labels):
    """Build a 2-D CNN classifier on top of a Melspectrogram front-end.

    The network is: input -> ``Melspectrogram`` (128 mel bands, decibel
    output) -> five Conv2D / BatchNormalization / activation stages with
    16, 32, 64, 128 and 256 filters (the first four each followed by
    ``MaxPooling2D(pool_size=2)``) -> average pooling over the whole
    remaining feature map -> 1x1 convolution with ``num_labels`` filters.

    ``filter_size`` and ``activation`` are not defined here; they are read
    from the enclosing scope (presumably module-level settings -- verify).
    ``activation`` is called with no arguments to obtain a layer.

    Parameters
    ----------
    x_train : indexable
        Only ``x_train[0].shape`` is used, as the shape of the input layer.
    num_labels : int
        Number of filters of the final 1x1 convolution. Its activation is
        ``'softmax'`` when ``num_labels > 1`` and ``'relu'`` otherwise.

    Returns
    -------
    model
        The assembled ``Model``; its summary is printed before returning.

    """
    sample_rate = 22050

    def conv_stage(tensor, n_filters):
        # One Conv2D -> BatchNormalization -> activation stage.
        tensor = Conv2D(filters=n_filters,
                        kernel_size=filter_size,
                        padding='same')(tensor)
        tensor = BatchNormalization()(tensor)
        return activation()(tensor)

    model_input = Input(shape=x_train[0].shape)

    # NOTE: the layer is named 'trainable_stft' although both trainable
    # flags below are False.
    features = Melspectrogram(n_dft=512,
                              n_hop=sample_rate // 128 + 1,
                              padding='same',
                              sr=sample_rate,
                              n_mels=128,
                              fmin=0.0,
                              fmax=sample_rate / 2,
                              power_melgram=2.0,
                              return_decibel_melgram=True,
                              trainable_fb=False,
                              trainable_kernel=False,
                              name='trainable_stft')(model_input)

    # Four stages that each end with max pooling ...
    for n_filters in (16, 32, 64, 128):
        features = conv_stage(features, n_filters)
        features = MaxPooling2D(pool_size=2)(features)

    # ... and a last stage without max pooling.
    features = conv_stage(features, 256)

    # Average over the full extent of axes 1 and 2 of the feature map.
    feature_shape = features.get_shape()
    full_extent = (int(feature_shape[1]), int(feature_shape[2]))
    features = AveragePooling2D(pool_size=full_extent)(features)

    if num_labels > 1:
        head_activation = 'softmax'
    else:
        head_activation = 'relu'
    predictions = Conv2D(filters=num_labels,
                         kernel_size=1,
                         padding='valid',
                         activation=head_activation)(features)

    model = Model(inputs=[model_input], outputs=[predictions])
    model.summary()
    return model
Esempio n. 2
0
def build_model_vggish(classes,
                       dropout_final=0.2,
                       shape=(None, 320000),
                       sr=16000,
                       rnn_type='gru',
                       rnn_units=256,
                       focal_alpha=0.95,
                       rnn_layers=1,
                       rnn_dropout=0.2,
                       activation='elu',
                       random_noise=0.2,
                       weights='soundnet'):
    """Build a model named 'crnn': mel front-end -> VGGish -> RNN branch.

    Parameters
    ----------
    classes : sized collection
        Only ``len(classes)`` is used, as ``n_classes`` of the RNN branch.
    dropout_final, rnn_type, rnn_units, rnn_layers, rnn_dropout
        Forwarded to ``rnn_classifier_branch`` (``rnn_dropout`` is passed
        as its ``dropout`` argument).
    shape : tuple
        The first entry is dropped; the rest is the input layer shape.
    sr : int
        Sampling rate handed to the ``Melspectrogram`` layer.
    focal_alpha, activation
        Accepted but not used inside this function.
    random_noise : float
        When truthy, an ``AdditiveNoise`` layer with this power (and
        ``random_gain=True``) is inserted after the mel-spectrogram.
    weights
        Passed to ``VGGish`` as ``load_weights``. When it is not ``None``
        every layer of the VGGish sub-model is set non-trainable.

    Returns
    -------
    (model, vggish)
        The full ``keras.Model`` and the VGGish sub-model it wraps. The
        model summary is printed before returning.

    """
    waveform = keras.Input(shape=shape[1:])

    # Front-end: reshape to (1, -1), then compute a 64-band mel-spectrogram
    # in decibels.
    feats = keras.layers.Reshape(target_shape=(1, -1))(waveform)
    feats = Melspectrogram(n_dft=512,
                           n_hop=256,
                           padding='same',
                           sr=sr,
                           n_mels=64,
                           fmin=125,
                           fmax=7500,
                           power_melgram=1.0,
                           return_decibel_melgram=True,
                           name='trainable_stft')(feats)
    if random_noise:
        feats = AdditiveNoise(power=random_noise, random_gain=True)(feats)
    feats = Normalization2D(str_axis='freq')(feats)

    # Swap axes 1 and 2 before handing the tensor to VGGish.
    swap_axes = Lambda(
        lambda t: K.permute_dimensions(x=t, pattern=(0, 2, 1, 3)),
        name="transpose")
    feats = swap_axes(feats)

    vggish_input_shape = feats.get_shape().as_list()[1:]
    vggish = VGGish(include_top=False,
                    load_weights=weights,
                    input_shape=vggish_input_shape,
                    pooling=None)
    # Freeze the sub-model only when weights were requested; with
    # ``weights=None`` its layers stay trainable.
    if weights is not None:
        for layer in vggish.layers:
            layer.trainable = False

    feats = vggish(feats)
    feats = keras.layers.AveragePooling2D(pool_size=(1, 4))(feats)
    feats = keras.layers.Reshape(target_shape=(-1, 512))(feats)

    outputs = rnn_classifier_branch(feats,
                                    name='rnn',
                                    dropout=rnn_dropout,
                                    dropout_final=dropout_final,
                                    rnn_units=rnn_units,
                                    rnn_type=rnn_type,
                                    n_classes=len(classes),
                                    rnn_layers=rnn_layers)

    model = keras.Model(inputs=waveform, outputs=outputs, name='crnn')
    model.summary()

    return model, vggish
Esempio n. 3
0
def _construct_sparsified_audio_network(**kwargs):
    """
    Build the sparsified audio network with a Melspectrogram front-end
    (256 mel bins). No weights are loaded here.

    The model takes an input of shape ``(1, 48000)``, computes a decibel
    mel-spectrogram, and applies four blocks of two 3x3 convolutions with
    64, 128, 256 and 512 filters. The first three blocks end with 2x2 max
    pooling. In the last block the second convolution (named
    ``'audio_embedding_layer'``) has no batch normalization or ReLU after
    it; its output is max-pooled over the full extent of axes 1 and 2 and
    flattened.

    ``**kwargs`` is accepted but not used.

    Returns
    -------
    model : keras.models.Model
        Model object.

    """
    weight_decay = 1e-5
    sample_rate = 48000
    window_seconds = 1

    def conv3x3(tensor, n_filters, **extra):
        # 3x3 'same' convolution with he_normal init and L2 weight decay.
        return Conv2D(n_filters,
                      (3, 3),
                      padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=regularizers.l2(weight_decay),
                      **extra)(tensor)

    def bn_relu(tensor):
        # Batch normalization followed by ReLU.
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    # INPUT
    x_a = Input(shape=(1, sample_rate * window_seconds), dtype='float32')

    # MELSPECTROGRAM PREPROCESSING
    y_a = Melspectrogram(n_dft=2048,
                         n_hop=242,
                         n_mels=256,
                         sr=sample_rate,
                         power_melgram=1.0,
                         htk=True,
                         return_decibel_melgram=True,
                         padding='same')(x_a)
    y_a = BatchNormalization()(y_a)

    # CONV BLOCKS 1-3: two conv/BN/ReLU units, then 2x2 max pooling.
    for n_filters in (64, 128, 256):
        for _ in range(2):
            y_a = bn_relu(conv3x3(y_a, n_filters))
        y_a = MaxPooling2D(pool_size=(2, 2), strides=2)(y_a)

    # CONV BLOCK 4: the embedding convolution is left without BN/ReLU.
    y_a = bn_relu(conv3x3(y_a, 512))
    y_a = conv3x3(y_a, 512, name='audio_embedding_layer')

    # Pool over the whole extent of axes 1 and 2, read from the tensor
    # itself rather than hard-coded.
    full_extent = tuple(y_a.get_shape().as_list()[1:3])
    y_a = MaxPooling2D(pool_size=full_extent)(y_a)
    y_a = Flatten()(y_a)

    return Model(inputs=x_a, outputs=y_a)
Esempio n. 4
0
def _construct_ust_specialized_audio_network(emb_dim=128, **kwargs):
    """
    Build the UST specialized audio network with a Melspectrogram
    front-end (64 mel bins). No weights are loaded here.

    The model takes an input of shape ``(1, 8000)``, computes a decibel
    mel-spectrogram, and applies four blocks of two 3x3 convolutions, each
    convolution followed by batch normalization and ReLU. The first three
    blocks end with 2x2 max pooling. The output of the last block is
    max-pooled over the full extent of axes 1 and 2 and flattened.

    Parameters
    ----------
    emb_dim : int
        Selects how much the base filter counts (64, 128, 256, 512) are
        reduced per block. Must be one of 512, 256, 128 or 64; any other
        value raises ``KeyError``. For each supported value the last
        block ends up with exactly ``emb_dim`` filters.
    **kwargs
        Accepted but not used.

    Returns
    -------
    model : keras.models.Model
        Model object.

    """
    weight_decay = 1e-5
    sample_rate = 8000  # original L3 has 48000
    window_seconds = 1

    # reduce the number of conv filters in each conv block according to
    # the emb_dim given (one divisor per conv block)
    reduction_factor = {
        512: [1, 1, 1, 1],
        256: [2, 2, 2, 2],
        128: [2, 2, 2, 4],
        64: [2, 2, 2, 8]
    }

    def conv3x3(tensor, n_filters, **extra):
        # 3x3 'same' convolution with he_normal init and L2 weight decay.
        return Conv2D(n_filters,
                      (3, 3),
                      padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=regularizers.l2(weight_decay),
                      **extra)(tensor)

    def bn_relu(tensor):
        # Batch normalization followed by ReLU.
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    # INPUT
    x_a = Input(shape=(1, sample_rate * window_seconds), dtype='float32')

    # MELSPECTROGRAM PREPROCESSING
    y_a = Melspectrogram(n_dft=1024,  # original L3 has 2048
                         n_hop=160,  # original L3 has 242
                         n_mels=64,  # original L3 has 256
                         sr=sample_rate,
                         power_melgram=1.0,
                         htk=True,
                         return_decibel_melgram=True,
                         padding='same')(x_a)
    y_a = BatchNormalization()(y_a)

    # Per-block filter counts: base width divided by the block's factor.
    divisors = reduction_factor[emb_dim]
    widths = [base // div
              for base, div in zip((64, 128, 256, 512), divisors)]

    # CONV BLOCKS 1-3: two conv/BN/ReLU units, then 2x2 max pooling.
    for n_filters in widths[:3]:
        for _ in range(2):
            y_a = bn_relu(conv3x3(y_a, n_filters))
        y_a = MaxPooling2D(pool_size=(2, 2), strides=2)(y_a)

    # CONV BLOCK 4: the second convolution is the named embedding layer.
    y_a = bn_relu(conv3x3(y_a, widths[3]))
    y_a = bn_relu(conv3x3(y_a, widths[3], name='audio_embedding_layer'))

    # Pool over the whole extent of axes 1 and 2, read from the tensor
    # itself rather than hard-coded.
    full_extent = tuple(y_a.get_shape().as_list()[1:3])
    y_a = MaxPooling2D(pool_size=full_extent)(y_a)
    y_a = Flatten()(y_a)

    return Model(inputs=x_a, outputs=y_a)