Example 1
def test_save_load():
    """test saving/loading of models that has stft, melspectorgrma, and log frequency."""

    src_mono, batch_src, input_shape = get_audio(data_format='channels_last',
                                                 n_ch=1)
    # test STFT save/load
    save_load_compare(STFT(input_shape=input_shape, pad_begin=True), batch_src,
                      allclose_complex_numbers)
    # test melspectrogram save/load
    save_load_compare(
        get_melspectrogram_layer(input_shape=input_shape, return_decibel=True),
        batch_src,
        np.testing.assert_allclose,
    )
    # test log frequency spectrogram save/load
    save_load_compare(
        get_log_frequency_spectrogram_layer(input_shape=input_shape,
                                            return_decibel=True),
        batch_src,
        np.testing.assert_allclose,
    )
    # test stft_mag_phase
    save_load_compare(
        get_stft_mag_phase(input_shape=input_shape, return_decibel=True),
        batch_src,
        np.testing.assert_allclose,
    )
    # test stft mag
    save_load_compare(get_stft_magnitude_layer(input_shape=input_shape),
                      batch_src, np.testing.assert_allclose)
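The save_load_compare and allclose_complex_numbers helpers come from kapre's test utilities and are not shown here. A plausible minimal version of the complex-number comparator, assuming the STFT layer returns complex tensors:

import numpy as np

def allclose_complex_numbers(a, b, atol=1e-4):
    # compare complex outputs part by part rather than as raw complex arrays
    np.testing.assert_allclose(np.abs(a), np.abs(b), atol=atol)
    np.testing.assert_allclose(np.real(a), np.real(b), atol=atol)
    np.testing.assert_allclose(np.imag(a), np.imag(b), atol=atol)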
Example 2
def LSTM(N_CLASSES=10, SR=16000, DT=1.0):
    input_shape = (int(SR*DT), 1)
    i = get_melspectrogram_layer(input_shape=input_shape,
                                 n_mels=128,
                                 pad_end=True,
                                 n_fft=512,
                                 win_length=400,
                                 hop_length=160,
                                 sample_rate=SR,
                                 return_decibel=True,
                                 input_data_format='channels_last',
                                 output_data_format='channels_last',
                                 name='lstm')
    x = LayerNormalization(axis=2, name='batch_norm')(i.output)  # layer normalization, despite the 'batch_norm' name
    x = TimeDistributed(layers.Reshape((-1,)), name='reshape')(x)
    s = TimeDistributed(layers.Dense(64, activation='tanh'),
                        name='td_dense_tanh')(x)
    x = layers.Bidirectional(layers.LSTM(32, return_sequences=True),
                             name='bidirectional_lstm')(s)
    x = layers.concatenate([s, x], axis=2, name='skip_connection')
    x = layers.Dense(64, activation='relu', name='dense_1_relu')(x)
    x = layers.MaxPooling1D(name='max_pool_1d')(x)
    x = layers.Dense(32, activation='relu', name='dense_2_relu')(x)
    x = layers.Flatten(name='flatten')(x)
    x = layers.Dropout(rate=0.2, name='dropout')(x)
    x = layers.Dense(32, activation='relu',
                     activity_regularizer=l2(0.001),
                     name='dense_3_relu')(x)
    o = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(x)
    model = Model(inputs=i.input, outputs=o, name='long_short_term_memory')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
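A quick usage sketch (an illustration, not part of the source project), assuming kapre and TensorFlow are installed and the defaults above:

import numpy as np

model = LSTM(N_CLASSES=10, SR=16000, DT=1.0)
dummy = np.random.randn(4, 16000, 1).astype(np.float32)  # (batch, samples, channels)
probs = model.predict(dummy)
print(probs.shape)  # expected: (4, 10), one softmax row per clip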
Example 3
def Conv2D(N_CLASSES=10, SR=16000, DT=1.0):
    input_shape = (int(SR*DT), 1)
    i = get_melspectrogram_layer(input_shape=input_shape,
                                 n_mels=128,
                                 pad_end=True,
                                 n_fft=512,
                                 win_length=400,
                                 hop_length=160,
                                 sample_rate=SR,
                                 return_decibel=True,
                                 input_data_format='channels_last',
                                 output_data_format='channels_last')
    x = LayerNormalization(axis=2, name='batch_norm')(i.output)
    x = layers.Conv2D(8, kernel_size=(7,7), activation='tanh', padding='same', name='conv2d_tanh')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_1')(x)
    x = layers.Conv2D(16, kernel_size=(5,5), activation='relu', padding='same', name='conv2d_relu_1')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_2')(x)
    x = layers.Conv2D(16, kernel_size=(3,3), activation='relu', padding='same', name='conv2d_relu_2')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_3')(x)
    x = layers.Conv2D(32, kernel_size=(3,3), activation='relu', padding='same', name='conv2d_relu_3')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_4')(x)
    x = layers.Conv2D(32, kernel_size=(3,3), activation='relu', padding='same', name='conv2d_relu_4')(x)
    x = layers.Flatten(name='flatten')(x)
    x = layers.Dropout(rate=0.2, name='dropout')(x)
    x = layers.Dense(64, activation='relu', activity_regularizer=l2(0.001), name='dense')(x)
    o = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(x)
    model = Model(inputs=i.input, outputs=o, name='2d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
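With hop_length=160 and pad_end=True, one second of 16 kHz audio yields roughly 16000/160 = 100 time frames, so the convolutional stack operates on a (time, n_mels, 1) image. A quick sanity check, assuming the function above is importable:

model = Conv2D(N_CLASSES=10, SR=16000, DT=1.0)
model.summary()  # the mel front-end should report an output near (None, 100, 128, 1)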
Example 4
def Transformer(N_CLASSES=10, SR=16000, DT=1.0):

    num_heads = 2  # Number of attention heads
    ff_dim = 32  # Hidden layer size in feed forward network inside transformer
    
    # 128 originally
    n_mels = 64
    input_shape = (int(SR*DT), 1)
    i = get_melspectrogram_layer(input_shape=input_shape,
                                 n_mels=n_mels,
                                 pad_end=True,
                                 n_fft=512,
                                 win_length=400,
                                 hop_length=160,
                                 sample_rate=SR,
                                 return_decibel=True,
                                 input_data_format='channels_last',
                                 output_data_format='channels_last')
    x = LayerNormalization(axis=2, name='batch_norm')(i.output)
    # drop the trailing channel axis: (batch, time, n_mels, 1) -> (batch, time, n_mels)
    x = tf.keras.layers.Reshape(x.shape[1:-1])(x)

    encoder = Encoder(num_layers=2, d_model=n_mels, num_heads=num_heads,
                      dff=ff_dim, maximum_position_encoding=41000)
    t = encoder(x, training=False, mask=None)

    t = layers.Flatten()(t)
    outputs = layers.Dense(N_CLASSES, activation="softmax")(t)

    model = tf.keras.Model(inputs=i.input, outputs=outputs)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
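Encoder is assumed to be a custom Transformer encoder (for example, one adapted from the TensorFlow Transformer tutorial); it is not part of kapre. The Reshape above merely drops the trailing channel axis so each spectrogram frame becomes one token; a minimal sketch of that step:

import tensorflow as tf

spec = tf.zeros((2, 100, 64, 1))                  # (batch, time, n_mels, 1)
tokens = tf.keras.layers.Reshape((-1, 64))(spec)  # -> (2, 100, 64)
print(tokens.shape)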
Example 5
def test_save_load(save_format):
    """test saving/loading of models that has stft, melspectorgrma, and log frequency."""

    src_mono, batch_src, input_shape = get_audio(data_format='channels_last',
                                                 n_ch=1)
    # test STFT save/load
    save_load_compare(
        STFT(input_shape=input_shape, pad_begin=True),
        batch_src,
        allclose_complex_numbers,
        save_format,
        STFT,
    )

    # test ConcatenateFrequencyMap
    specs_batch = np.random.randn(2, 3, 5, 4).astype(np.float32)
    save_load_compare(
        ConcatenateFrequencyMap(input_shape=specs_batch.shape[1:]),
        specs_batch,
        np.testing.assert_allclose,
        save_format,
        ConcatenateFrequencyMap,
    )

    if save_format == 'tf':
        # test melspectrogram save/load
        save_load_compare(
            get_melspectrogram_layer(input_shape=input_shape,
                                     return_decibel=True),
            batch_src,
            np.testing.assert_allclose,
            save_format,
        )
        # test log frequency spectrogram save/load
        save_load_compare(
            get_log_frequency_spectrogram_layer(input_shape=input_shape,
                                                return_decibel=True),
            batch_src,
            np.testing.assert_allclose,
            save_format,
        )
        # test stft_mag_phase
        save_load_compare(
            get_stft_mag_phase(input_shape=input_shape, return_decibel=True),
            batch_src,
            np.testing.assert_allclose,
            save_format,
        )
        # test stft mag
        save_load_compare(
            get_stft_magnitude_layer(input_shape=input_shape),
            batch_src,
            np.testing.assert_allclose,
            save_format,
        )
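The save_format argument suggests this test is parametrized over Keras's two serialization formats. A plausible pytest decorator (an assumption; it is not shown in the snippet):

import pytest

@pytest.mark.parametrize('save_format', ['tf', 'h5'])
def test_save_load(save_format):
    ...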
Example 6
    def __init__(self, input_shape, sr):
        self.layer = get_melspectrogram_layer(
            input_shape=input_shape,
            n_mels=128,
            pad_end=True,
            n_fft=512,
            win_length=400,
            hop_length=160,
            sample_rate=sr,
            return_decibel=True,
            input_data_format='channels_last',
            output_data_format='channels_last')
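A hypothetical instantiation of the (unnamed) wrapper class this method belongs to, assuming one-second mono audio at 16 kHz; AudioFrontend is a placeholder name:

import numpy as np

frontend = AudioFrontend(input_shape=(16000, 1), sr=16000)
mel = frontend.layer(np.zeros((1, 16000, 1), dtype=np.float32))
print(mel.shape)  # (1, time_frames, 128, 1)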
Example 7
    def get_conv1D_model(self, N_CLASSES=6, SR=16000, DT=1.0):
        # input shape (n, feat, channel)
        input_shape = (int(SR * DT), 1)
        i = get_melspectrogram_layer(input_shape=input_shape,
                                     n_mels=128,
                                     pad_end=True,
                                     n_fft=512,
                                     win_length=400,
                                     hop_length=160,
                                     sample_rate=SR,
                                     return_decibel=True,
                                     input_data_format='channels_last',
                                     output_data_format='channels_last')
        x = layers.LayerNormalization(axis=2, name='batch_norm')(i.output)
        x = layers.TimeDistributed(layers.Conv1D(8,
                                                 kernel_size=4,
                                                 activation='tanh'),
                                   name='td_conv_1d_tanh')(x)
        x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_1')(x)
        x = layers.TimeDistributed(layers.Conv1D(16,
                                                 kernel_size=4,
                                                 activation='relu'),
                                   name='td_conv_1d_relu_1')(x)
        x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_2')(x)
        x = layers.TimeDistributed(layers.Conv1D(32,
                                                 kernel_size=4,
                                                 activation='relu'),
                                   name='td_conv_1d_relu_2')(x)
        x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_3')(x)
        x = layers.TimeDistributed(layers.Conv1D(64,
                                                 kernel_size=4,
                                                 activation='relu'),
                                   name='td_conv_1d_relu_3')(x)

        x = layers.TimeDistributed(layers.Conv1D(128,
                                                 kernel_size=4,
                                                 activation='relu'),
                                   name='td_conv_1d_relu_4')(x)
        x = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(x)
        x = layers.Dropout(rate=0.1, name='dropout')(x)
        x = layers.Dense(64,
                         activation='relu',
                         activity_regularizer=l2(0.001),
                         name='dense')(x)
        o = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(x)

        model = Model(inputs=i.input, outputs=o, name='1d_convolution')
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        return model
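Because the mel output is 4-D, TimeDistributed(Conv1D) convolves along the mel axis of each time frame independently. A minimal shape sketch of that pattern:

import tensorflow as tf

mel = tf.zeros((2, 100, 128, 1))  # (batch, time, n_mels, ch)
y = tf.keras.layers.TimeDistributed(
    tf.keras.layers.Conv1D(8, kernel_size=4, activation='tanh'))(mel)
print(y.shape)  # (2, 100, 125, 8): 'valid' convolution over the 128 mel bins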
Example 8
def spectrogram(audio):
    layer = get_melspectrogram_layer(input_shape=AUDIO_SHAPE,
                                     n_mels=128,
                                     pad_end=True,
                                     n_fft=512,
                                     win_length=400,
                                     hop_length=160,
                                     sample_rate=SR,
                                     return_decibel=True,
                                     input_data_format='channels_last',
                                     output_data_format='channels_last')

    # add batch and channel axes: (n_samples,) -> (1, n_samples, 1)
    audio = np.reshape(audio, (1, -1, 1))
    spec = layer(audio)
    return spec
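A hypothetical call, assuming the module-level globals SR = 16000 and AUDIO_SHAPE = (16000, 1):

import numpy as np

audio = np.random.randn(16000).astype(np.float32)
spec = spectrogram(audio)
print(spec.shape)  # (1, time_frames, 128, 1)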
Example 9
def _get_melgram_model(return_decibel, amin, dynamic_range, input_shape=None):
    # compute with kapre
    melgram_model = get_melspectrogram_layer(
        n_fft=n_fft,
        sample_rate=sr,
        n_mels=n_mels,
        mel_f_min=mel_f_min,
        mel_f_max=mel_f_max,
        win_length=win_length,
        hop_length=hop_length,
        input_data_format=data_format,
        output_data_format=data_format,
        return_decibel=return_decibel,
        input_shape=input_shape,
        db_amin=amin,
        db_dynamic_range=dynamic_range,
    )
    return melgram_model
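The free variables (n_fft, sr, n_mels, and so on) are closed over from the enclosing test scope. Plausible values, consistent with the other examples on this page, would be:

n_fft, win_length, hop_length = 512, 400, 160
sr, n_mels = 16000, 128
mel_f_min, mel_f_max = 0.0, sr / 2
data_format = 'channels_last'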
Example 10
def test_save_load():
    """test saving/loading of models that has stft, melspectorgrma, and log frequency."""

    def _test(layer, input_batch, allclose_func, atol=1e-4):
        """test a model with `layer` with the given `input_batch`.
        The model prediction result is compared using `allclose_func` which may depend on the
        data type of the model output (e.g., float or complex).
        """
        model = tf.keras.models.Sequential()
        model.add(layer)

        result_ref = model(input_batch)

        os_temp_dir = tempfile.gettempdir()
        model_temp_dir = tempfile.TemporaryDirectory(dir=os_temp_dir)
        model.save(filepath=model_temp_dir.name)

        new_model = tf.keras.models.load_model(model_temp_dir.name)
        result_new = new_model(input_batch)
        allclose_func(result_ref, result_new, atol=atol)  # pass atol by keyword: assert_allclose's third positional argument is rtol

        model_temp_dir.cleanup()

        return model

    src_mono, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)
    # test STFT save/load
    _test(STFT(input_shape=input_shape), batch_src, allclose_complex_numbers)
    # test melspectrogram save/load
    _test(
        get_melspectrogram_layer(input_shape=input_shape, return_decibel=True),
        batch_src,
        np.testing.assert_allclose,
    )
    # test log frequency spectrogram save/load
    _test(
        get_log_frequency_spectrogram_layer(input_shape=input_shape, return_decibel=True),
        batch_src,
        np.testing.assert_allclose,
    )
Example 11
D = librosa.amplitude_to_db(np.abs(librosa.stft(src, hop_length=hop_length)),
                            ref=np.max)
librosa.display.specshow(D, y_axis='linear', x_axis='time', sr=sr)

_src = src[:int(sr * dt)]
src = np.expand_dims(_src, axis=1)
input_shape = src.shape
print(input_shape)
# -

melgram = get_melspectrogram_layer(input_shape=input_shape,
                                   n_mels=128,
                                   mel_norm='slaney',
                                   pad_end=True,
                                   n_fft=512,
                                   win_length=400,
                                   hop_length=160,
                                   sample_rate=sr,
                                   db_ref_value=1.0,
                                   return_decibel=True,
                                   input_data_format='channels_last',
                                   output_data_format='channels_last')
norm = LayerNormalization(axis=2)
model = Sequential()
model.add(melgram)
model.add(norm)
model.summary()

# +
batch = np.expand_dims(src, axis=0)
X = model.predict(batch).squeeze().T
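A plausible follow-up that mirrors the librosa plot at the top of the cell: after the squeeze and transpose, X has shape (n_mels, time) and can be displayed directly.

librosa.display.specshow(X, y_axis='mel', x_axis='time', sr=sr, hop_length=160)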
Example 12
def build_vit(hp):

    dropout = hp.Float('dropout', 0.05, 0.4, sampling='log')
    image_size = 128  # We'll resize input images to this size
    patch_size = hp.Int(
        'patch_size', 4, 8,
        step=2)  # Size of the patches to be extracted from the input images
    num_patches = (image_size // patch_size)**2
    projection_dim = hp.Int('projection_dim', 8, 40, step=8)
    num_heads = hp.Int('attention_heads', 2, 8, step=2)
    transformer_units = [
        projection_dim * 2,
        projection_dim,
    ]  # Size of the transformer layers
    transformer_layers = hp.Int('transformer_layers', 2, 6, step=1)
    mlp_head_units = hp.Int('mlp_head_units', 10, 20, step=2)

    input_shape = (int(sr * dt), 1)

    i = get_melspectrogram_layer(input_shape=input_shape,
                                 n_mels=128,
                                 pad_end=True,
                                 n_fft=512,
                                 win_length=400,
                                 hop_length=160,
                                 sample_rate=sr,
                                 return_decibel=True,
                                 input_data_format='channels_last',
                                 output_data_format='channels_last')
    x = LayerNormalization(axis=2, name='batch_norm')(i.output)

    # Augment data.
    # augmented = data_augmentation(inputs)

    resize = layers.experimental.preprocessing.Resizing(
        image_size, image_size)(x)

    # Create patches.
    patches = Patches(patch_size)(resize)

    # Encode patches.
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(num_heads=num_heads,
                                                     key_dim=projection_dim,
                                                     dropout=dropout)(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, encoded_patches])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=dropout)
        # Skip connection 2.
        encoded_patches = layers.Add()([x3, x2])

    # Create a [batch_size, projection_dim] tensor.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    # Add pooling for dimensionality reduction: append a channel axis so MaxPool2D applies
    representation = tf.keras.layers.Reshape(target_shape=tf.expand_dims(
        representation, axis=-1).shape[1:])(representation)
    pooling = layers.MaxPool2D()(representation)

    representation = layers.Flatten()(pooling)

    # Add MLP.
    features = mlp(representation,
                   hidden_units=[mlp_head_units],
                   dropout_rate=dropout)

    # Classify outputs.
    logits = layers.Dense(N_CLASSES)(features)
    # Create the Keras model.
    model = keras.Model(inputs=i.input, outputs=logits)
    # the final Dense has no softmax, so the loss must be computed from logits
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    model.summary()
    return model
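Patches, PatchEncoder, and mlp are assumed to follow the Keras "Image classification with Vision Transformers" example. A minimal mlp consistent with how it is called above:

def mlp(x, hidden_units, dropout_rate):
    # a stack of Dense + Dropout blocks, one per entry in hidden_units
    for units in hidden_units:
        x = layers.Dense(units, activation=tf.nn.gelu)(x)
        x = layers.Dropout(dropout_rate)(x)
    return x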