def test_save_load():
    """Test that models containing STFT, melspectrogram, log-frequency,
    magnitude/phase, and magnitude layers survive a save/load round trip."""
    _, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)

    # STFT output is complex-valued, so it needs the complex-aware comparison.
    stft_layer = STFT(input_shape=input_shape, pad_begin=True)
    save_load_compare(stft_layer, batch_src, allclose_complex_numbers)

    # All remaining layers produce real-valued output and share one comparer.
    compare_real = np.testing.assert_allclose

    mel_layer = get_melspectrogram_layer(input_shape=input_shape, return_decibel=True)
    save_load_compare(mel_layer, batch_src, compare_real)

    log_freq_layer = get_log_frequency_spectrogram_layer(input_shape=input_shape, return_decibel=True)
    save_load_compare(log_freq_layer, batch_src, compare_real)

    mag_phase_layer = get_stft_mag_phase(input_shape=input_shape, return_decibel=True)
    save_load_compare(mag_phase_layer, batch_src, compare_real)

    mag_layer = get_stft_magnitude_layer(input_shape=input_shape)
    save_load_compare(mag_layer, batch_src, compare_real)
def LSTM(N_CLASSES=10, SR=16000, DT=1.0):
    """Build and compile a bidirectional-LSTM audio classifier.

    A trainable mel-spectrogram front end feeds a time-distributed dense
    layer, a bidirectional LSTM with a skip connection, and a small MLP head.

    Args:
        N_CLASSES: number of output classes.
        SR: audio sample rate in Hz.
        DT: clip duration in seconds.

    Returns:
        A compiled `Model` mapping raw audio to class probabilities.
    """
    input_shape = (int(SR * DT), 1)
    mel = get_melspectrogram_layer(
        input_shape=input_shape,
        n_mels=128,
        pad_end=True,
        n_fft=512,
        win_length=400,
        hop_length=160,
        sample_rate=SR,
        return_decibel=True,
        input_data_format='channels_last',
        output_data_format='channels_last',
        name='lstm',
    )
    net = LayerNormalization(axis=2, name='batch_norm')(mel.output)
    net = TimeDistributed(layers.Reshape((-1,)), name='reshape')(net)
    dense_td = TimeDistributed(layers.Dense(64, activation='tanh'), name='td_dense_tanh')(net)
    rnn = layers.Bidirectional(layers.LSTM(32, return_sequences=True), name='bidirectional_lstm')(dense_td)
    # Skip connection: concatenate the pre-LSTM features with the LSTM output.
    net = layers.concatenate([dense_td, rnn], axis=2, name='skip_connection')
    net = layers.Dense(64, activation='relu', name='dense_1_relu')(net)
    net = layers.MaxPooling1D(name='max_pool_1d')(net)
    net = layers.Dense(32, activation='relu', name='dense_2_relu')(net)
    net = layers.Flatten(name='flatten')(net)
    net = layers.Dropout(rate=0.2, name='dropout')(net)
    net = layers.Dense(32, activation='relu', activity_regularizer=l2(0.001), name='dense_3_relu')(net)
    softmax_out = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=mel.input, outputs=softmax_out, name='long_short_term_memory')
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def Conv2D(N_CLASSES=10, SR=16000, DT=1.0):
    """Build and compile a 2-D CNN audio classifier.

    A trainable mel-spectrogram front end feeds a stack of Conv2D/MaxPool
    blocks followed by a dense classification head.

    Args:
        N_CLASSES: number of output classes.
        SR: audio sample rate in Hz.
        DT: clip duration in seconds.

    Returns:
        A compiled `Model` mapping raw audio to class probabilities.
    """
    input_shape = (int(SR * DT), 1)
    mel = get_melspectrogram_layer(
        input_shape=input_shape, n_mels=128, pad_end=True, n_fft=512,
        win_length=400, hop_length=160, sample_rate=SR, return_decibel=True,
        input_data_format='channels_last', output_data_format='channels_last')
    net = LayerNormalization(axis=2, name='batch_norm')(mel.output)

    # (filters, kernel, activation, conv name, pool name or None) per block;
    # the final conv block has no pooling layer.
    conv_specs = [
        (8, (7, 7), 'tanh', 'conv2d_tanh', 'max_pool_2d_1'),
        (16, (5, 5), 'relu', 'conv2d_relu_1', 'max_pool_2d_2'),
        (16, (3, 3), 'relu', 'conv2d_relu_2', 'max_pool_2d_3'),
        (32, (3, 3), 'relu', 'conv2d_relu_3', 'max_pool_2d_4'),
        (32, (3, 3), 'relu', 'conv2d_relu_4', None),
    ]
    for filters, kernel, act, conv_name, pool_name in conv_specs:
        net = layers.Conv2D(filters, kernel_size=kernel, activation=act,
                            padding='same', name=conv_name)(net)
        if pool_name is not None:
            net = layers.MaxPooling2D(pool_size=(2, 2), padding='same', name=pool_name)(net)

    net = layers.Flatten(name='flatten')(net)
    net = layers.Dropout(rate=0.2, name='dropout')(net)
    net = layers.Dense(64, activation='relu', activity_regularizer=l2(0.001), name='dense')(net)
    out = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=mel.input, outputs=out, name='2d_convolution')
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def Transformer(N_CLASSES=10, SR=16000, DT=1.0):
    """Build and compile a Transformer-encoder audio classifier.

    A trainable mel-spectrogram front end feeds a 2-layer Transformer encoder
    whose flattened output goes through a softmax head.

    Args:
        N_CLASSES: number of output classes.
        SR: audio sample rate in Hz.
        DT: clip duration in seconds.

    Returns:
        A compiled `tf.keras.Model` mapping raw audio to class probabilities.
    """
    num_heads = 2  # Number of attention heads
    ff_dim = 32  # Hidden layer size in feed forward network inside transformer
    n_mels = 64  # 128 originally
    input_shape = (int(SR * DT), 1)

    mel = get_melspectrogram_layer(
        input_shape=input_shape, n_mels=n_mels, pad_end=True, n_fft=512,
        win_length=400, hop_length=160, sample_rate=SR, return_decibel=True,
        input_data_format='channels_last', output_data_format='channels_last')
    net = LayerNormalization(axis=2, name='batch_norm')(mel.output)
    # Drop the trailing channel axis: (time, n_mels, 1) -> (time, n_mels).
    net = tf.keras.layers.Reshape(net.shape[1:-1], input_shape=net.shape[1:])(net)

    encoder = Encoder(num_layers=2, d_model=n_mels, num_heads=num_heads,
                      dff=ff_dim, maximum_position_encoding=41000)
    encoded = encoder(net, training=False, mask=None)
    flat = layers.Flatten()(encoded)
    outputs = layers.Dense(N_CLASSES, activation="softmax")(flat)

    model = tf.keras.Model(inputs=mel.input, outputs=outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def test_save_load(save_format):
    """Test save/load round trips for STFT, ConcatenateFrequencyMap,
    melspectrogram, log-frequency, magnitude/phase, and magnitude layers.

    Args:
        save_format: Keras save format; the composed-layer cases only run
            for the 'tf' format.
    """
    _, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)
    compare_real = np.testing.assert_allclose

    # STFT output is complex-valued: use the complex-aware comparison and
    # pass the custom class so the model can be reloaded.
    save_load_compare(
        STFT(input_shape=input_shape, pad_begin=True),
        batch_src,
        allclose_complex_numbers,
        save_format,
        STFT,
    )

    # ConcatenateFrequencyMap operates on spectrogram-shaped input.
    specs_batch = np.random.randn(2, 3, 5, 4).astype(np.float32)
    save_load_compare(
        ConcatenateFrequencyMap(input_shape=specs_batch.shape[1:]),
        specs_batch,
        compare_real,
        save_format,
        ConcatenateFrequencyMap,
    )

    if save_format == 'tf':
        mel_layer = get_melspectrogram_layer(input_shape=input_shape, return_decibel=True)
        save_load_compare(mel_layer, batch_src, compare_real, save_format)

        log_freq_layer = get_log_frequency_spectrogram_layer(input_shape=input_shape, return_decibel=True)
        save_load_compare(log_freq_layer, batch_src, compare_real, save_format)

        mag_phase_layer = get_stft_mag_phase(input_shape=input_shape, return_decibel=True)
        save_load_compare(mag_phase_layer, batch_src, compare_real, save_format)

        mag_layer = get_stft_magnitude_layer(input_shape=input_shape)
        save_load_compare(mag_layer, batch_src, compare_real, save_format)
def __init__(self, input_shape, sr):
    """Create the mel-spectrogram front-end layer.

    Args:
        input_shape: input audio shape; presumably (samples, channels) given
            the channels_last data format — TODO confirm against callers.
        sr: audio sample rate in Hz.
    """
    # Trainable kapre mel-spectrogram layer in decibel scale.
    self.layer = get_melspectrogram_layer(
        input_shape=input_shape, n_mels=128, pad_end=True, n_fft=512,
        win_length=400, hop_length=160, sample_rate=sr,
        return_decibel=True, input_data_format='channels_last',
        output_data_format='channels_last')
def get_conv1D_model(self, N_CLASSES=6, SR=16000, DT=1.0):
    """Build and compile a time-distributed 1-D CNN audio classifier.

    Input shape is (n, feat, channel): a mel-spectrogram front end feeds
    time-distributed Conv1D blocks interleaved with 2-D max pooling, then a
    global-max-pool + dense head.

    Args:
        N_CLASSES: number of output classes.
        SR: audio sample rate in Hz.
        DT: clip duration in seconds.

    Returns:
        A compiled `Model` mapping raw audio to class probabilities.
    """
    input_shape = (int(SR * DT), 1)
    mel = get_melspectrogram_layer(
        input_shape=input_shape, n_mels=128, pad_end=True, n_fft=512,
        win_length=400, hop_length=160, sample_rate=SR, return_decibel=True,
        input_data_format='channels_last', output_data_format='channels_last')
    net = layers.LayerNormalization(axis=2, name='batch_norm')(mel.output)

    # (filters, activation, conv name, pool name or None) per block; the last
    # two conv blocks have no pooling layer between/after them.
    conv_specs = [
        (8, 'tanh', 'td_conv_1d_tanh', 'max_pool_2d_1'),
        (16, 'relu', 'td_conv_1d_relu_1', 'max_pool_2d_2'),
        (32, 'relu', 'td_conv_1d_relu_2', 'max_pool_2d_3'),
        (64, 'relu', 'td_conv_1d_relu_3', None),
        (128, 'relu', 'td_conv_1d_relu_4', None),
    ]
    for filters, act, conv_name, pool_name in conv_specs:
        net = layers.TimeDistributed(layers.Conv1D(filters, kernel_size=(4), activation=act),
                                     name=conv_name)(net)
        if pool_name is not None:
            net = layers.MaxPooling2D(pool_size=(2, 2), name=pool_name)(net)

    net = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(net)
    net = layers.Dropout(rate=0.1, name='dropout')(net)
    net = layers.Dense(64, activation='relu', activity_regularizer=l2(0.001), name='dense')(net)
    out = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=mel.input, outputs=out, name='1d_convolution')
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def spectrogram(audio):
    """Compute a dB-scaled mel spectrogram of a mono audio signal.

    Reads module-level ``AUDIO_SHAPE`` and ``SR`` for the layer configuration.

    Args:
        audio: 1-D array of audio samples (a trailing singleton channel axis
            is also accepted).

    Returns:
        The kapre layer output for a batch of one clip — presumably shaped
        (1, time, n_mels, 1); confirm against the layer's output format.
    """
    layer = get_melspectrogram_layer(
        input_shape=AUDIO_SHAPE, n_mels=128, pad_end=True, n_fft=512,
        win_length=400, hop_length=160, sample_rate=SR, return_decibel=True,
        input_data_format='channels_last', output_data_format='channels_last')
    # Previously this did expand_dims(axis=1) immediately followed by a
    # reshape; a single reshape to (batch=1, samples, channels=1) produces
    # the identical array (and the identical error for multi-channel input).
    batch = np.reshape(audio, (1, audio.shape[0], 1))
    spec = layer(batch)
    return spec
def _get_melgram_model(return_decibel, amin, dynamic_range, input_shape=None):
    """Build a kapre melspectrogram model with the enclosing scope's settings.

    Reads ``n_fft``, ``sr``, ``n_mels``, ``mel_f_min``, ``mel_f_max``,
    ``win_length``, ``hop_length``, and ``data_format`` as free variables
    from the enclosing scope.

    Args:
        return_decibel: whether the layer converts magnitudes to decibels.
        amin: noise floor, forwarded as ``db_amin``.
        dynamic_range: forwarded as ``db_dynamic_range``.
        input_shape: optional input shape for the model.

    Returns:
        The model produced by ``get_melspectrogram_layer``.
    """
    # compute with kapre
    melgram_model = get_melspectrogram_layer(
        n_fft=n_fft,
        sample_rate=sr,
        n_mels=n_mels,
        mel_f_min=mel_f_min,
        mel_f_max=mel_f_max,
        win_length=win_length,
        hop_length=hop_length,
        input_data_format=data_format,
        output_data_format=data_format,
        return_decibel=return_decibel,
        input_shape=input_shape,
        db_amin=amin,
        db_dynamic_range=dynamic_range,
    )
    return melgram_model
def test_save_load():
    """Test saving/loading of models that have STFT, melspectrogram, and
    log-frequency layers."""

    def _test(layer, input_batch, allclose_func, atol=1e-4):
        """Round-trip a one-layer model through save/load and compare outputs.

        `allclose_func` compares the reference and reloaded predictions; it
        may depend on the model's output dtype (e.g., float or complex).
        """
        # NOTE: unified on the `tf` alias (it was already used for
        # load_model below; the long `tensorflow.` alias was inconsistent).
        model = tf.keras.models.Sequential()
        model.add(layer)
        result_ref = model(input_batch)

        # Context manager guarantees cleanup even if save/load/compare raises.
        with tempfile.TemporaryDirectory(dir=tempfile.gettempdir()) as model_dir:
            model.save(filepath=model_dir)
            new_model = tf.keras.models.load_model(model_dir)
            result_new = new_model(input_batch)
            # BUG FIX: `atol` was passed positionally; for
            # np.testing.assert_allclose the third positional parameter is
            # `rtol`, so the tolerance was silently applied to the wrong
            # argument. Pass it by keyword (allclose_complex_numbers also
            # takes an `atol` keyword — confirm if its signature differs).
            allclose_func(result_ref, result_new, atol=atol)
        return model

    src_mono, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)

    # test STFT save/load (complex-valued output)
    _test(STFT(input_shape=input_shape), batch_src, allclose_complex_numbers)
    # test melspectrogram save/load
    _test(
        get_melspectrogram_layer(input_shape=input_shape, return_decibel=True),
        batch_src,
        np.testing.assert_allclose,
    )
    # test log frequency spectrogram save/load
    _test(
        get_log_frequency_spectrogram_layer(input_shape=input_shape, return_decibel=True),
        batch_src,
        np.testing.assert_allclose,
    )
# Plot a reference dB-scaled linear spectrogram with librosa.
D = librosa.amplitude_to_db(np.abs(librosa.stft(src, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D, y_axis='linear', x_axis='time', sr=sr)

# Trim the source to `dt` seconds and append a channel axis -> (samples, 1).
_src = src[:int(sr * dt)]
src = np.expand_dims(_src, axis=1)
input_shape = src.shape
print(input_shape)
# -
# Trainable kapre mel-spectrogram front end in decibel scale.
melgram = get_melspectrogram_layer(input_shape=input_shape, n_mels=128,
                                   mel_norm='slaney', pad_end=True, n_fft=512,
                                   win_length=400, hop_length=160,
                                   sample_rate=sr, db_ref_value=1.0,
                                   return_decibel=True,
                                   input_data_format='channels_last',
                                   output_data_format='channels_last')
norm = LayerNormalization(axis=2)
# NOTE(review): assigning `.shape` on a Keras model looks like a leftover
# no-op attribute set (it does not reshape anything) — confirm whether this
# line can be removed.
melgram.shape = (16000, 1)
model = Sequential()
model.add(melgram)
model.add(norm)
model.summary()
# +
# Run one clip through the model; squeeze/transpose for (mel, time) display.
batch = np.expand_dims(src, axis=0)
X = model.predict(batch).squeeze().T
def build_vit(hp):
    """Build a ViT-style audio classifier for a Keras Tuner search.

    Reads ``sr``, ``dt``, and ``N_CLASSES`` plus the helpers
    ``get_melspectrogram_layer``, ``Patches``, ``PatchEncoder``, and ``mlp``
    from the enclosing module.

    Args:
        hp: Keras Tuner `HyperParameters` instance supplied by the tuner.

    Returns:
        A compiled `keras.Model` producing raw class logits.
    """
    dropout = hp.Float('dropout', 0.05, 0.4, sampling='log')
    image_size = 128  # We'll resize input spectrograms to this size
    # BUG FIX: this hyperparameter was registered under the name
    # 'transformer_layers', colliding with the real transformer_layers
    # hyperparameter below — KerasTuner returns the first-registered
    # definition for a repeated name, so patch size and layer count were
    # coupled and the 2-6 layer range was never searched.
    patch_size = hp.Int('patch_size', 4, 8, step=2)  # patch side length
    num_patches = (image_size // patch_size) ** 2
    projection_dim = hp.Int('projection_dim', 8, 40, step=8)
    num_heads = hp.Int('attention_heads', 2, 8, step=2)
    transformer_units = [
        projection_dim * 2,
        projection_dim,
    ]  # Size of the transformer layers
    transformer_layers = hp.Int('transformer_layers', 2, 6, step=1)
    mlp_head_units = hp.Int('mlp_head_units', 10, 20, step=2)

    input_shape = (int(sr * dt), 1)
    i = get_melspectrogram_layer(input_shape=input_shape, n_mels=128, pad_end=True,
                                 n_fft=512, win_length=400, hop_length=160,
                                 sample_rate=sr, return_decibel=True,
                                 input_data_format='channels_last',
                                 output_data_format='channels_last')
    x = LayerNormalization(axis=2, name='batch_norm')(i.output)
    # Resize the (time, mel) spectrogram to a square "image" for patching.
    resize = layers.experimental.preprocessing.Resizing(image_size, image_size)(x)
    # Create and encode patches.
    patches = Patches(patch_size)(resize)
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Stack of pre-norm Transformer blocks with two skip connections each.
    for _ in range(transformer_layers):
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = layers.MultiHeadAttention(num_heads=num_heads,
                                                     key_dim=projection_dim,
                                                     dropout=dropout)(x1, x1)
        x2 = layers.Add()([attention_output, encoded_patches])
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=dropout)
        encoded_patches = layers.Add()([x3, x2])

    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    # Add a trailing channel axis so MaxPool2D can reduce dimensionality.
    representation = tf.keras.layers.Reshape(target_shape=tf.expand_dims(
        representation, axis=-1).shape[1:])(representation)
    pooling = layers.MaxPool2D()(representation)
    representation = layers.Flatten()(pooling)
    features = mlp(representation, hidden_units=[mlp_head_units], dropout_rate=dropout)
    # Classify outputs (raw logits, no softmax).
    logits = layers.Dense(N_CLASSES)(features)

    model = keras.Model(inputs=i.input, outputs=logits)
    # BUG FIX: the head emits raw logits, but the string loss
    # 'categorical_crossentropy' defaults to from_logits=False and would
    # treat them as probabilities; compute the loss from logits instead.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    model.summary()
    return model