Esempio n. 1
0
def test_mfcc_correctness(data_format, n_mfccs):
    """Check `LogmelToMFCC` against `librosa.feature.mfcc` on the same log-melspectrogram.

    Args:
        data_format: data format handed to `LogmelToMFCC`; it also decides where the
            channel axis is inserted into the input batch.
        n_mfccs: number of MFCC coefficients to compute and compare.
    """
    # Only the mono waveform is used; the layer input is built from librosa's output.
    src_mono, _, _ = get_audio(data_format='channels_last', n_ch=1)
    # `y` is passed by keyword because librosa >= 0.10 no longer accepts it positionally.
    melgram = librosa.power_to_db(librosa.feature.melspectrogram(y=src_mono))  # mel, time

    mfcc_ref = librosa.feature.mfcc(
        S=melgram, n_mfcc=n_mfccs, norm='ortho'
    )  # 'ortho' -> 5% mismatch but..
    # Insert a batch axis at 0 and a channel axis at 3 (channels last) or 1 (channels first).
    expand_dim = (0, 3) if data_format in (_CH_LAST_STR, _CH_DEFAULT_STR) else (0, 1)

    melgram_batch = np.expand_dims(melgram.T, expand_dim)

    model = tf.keras.Sequential()
    model.add(
        LogmelToMFCC(n_mfccs=n_mfccs, data_format=data_format, input_shape=melgram_batch.shape[1:])
    )

    mfcc_kapre = model.predict(melgram_batch)
    ch_axis = 1 if data_format == _CH_FIRST_STR else 3
    mfcc_kapre = np.squeeze(mfcc_kapre, axis=ch_axis)
    # Drop the batch axis and transpose so the layout matches `mfcc_ref`.
    mfcc_kapre = mfcc_kapre[0].T

    if n_mfccs > 1:
        np.testing.assert_allclose(mfcc_ref[1:], mfcc_kapre[1:], atol=1e-4)

    # The 0-th coefficient is compared after dividing the layer output by sqrt(2).
    np.testing.assert_allclose(mfcc_ref[0], mfcc_kapre[0] / np.sqrt(2.0), atol=1e-4)
Esempio n. 2
0
def test_frame_correctness(frame_length, data_format):
    """Check that the `Frame` layer produces the same frames as `librosa.util.frame`.

    Args:
        frame_length: length of each frame; the hop is half of it.
        data_format: data format used for both the audio batch and the layer.
    """
    hop_length = frame_length // 2
    n_ch = 1
    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch, length=1000)

    model = tf.keras.Sequential()
    model.add(
        Frame(
            frame_length=frame_length,
            hop_length=hop_length,
            pad_end=False,
            data_format=data_format,
            input_shape=input_shape,
        )
    )

    # Keyword arguments: librosa >= 0.10 makes `frame_length`/`hop_length` keyword-only.
    frames_ref = librosa.util.frame(
        src_mono, frame_length=frame_length, hop_length=hop_length
    ).T  # (time, frame_length)

    # Add the channel axis where the layer output has it.
    if data_format in (_CH_DEFAULT_STR, _CH_LAST_STR):
        frames_ref = np.expand_dims(frames_ref, axis=2)
    else:
        frames_ref = np.expand_dims(frames_ref, axis=0)

    frames_kapre = model.predict(batch_src)[0]

    np.testing.assert_equal(frames_kapre, frames_ref)
Esempio n. 3
0
def test_energy_correctness(data_format):
    """Check the `Energy` layer against a frame-wise energy computed with numpy.

    The reference is the sum of squared samples of every frame, scaled by
    `ref_duration / (frame_length / sr)`.

    Args:
        data_format: data format used for both the audio batch and the layer.
    """
    frame_length = 4
    hop_length = frame_length // 2
    n_ch = 1
    src_mono, batch_src, input_shape = get_audio(
        data_format=data_format, n_ch=n_ch, length=frame_length * 2
    )

    sr = 22050
    ref_duration = 0.1
    model = tf.keras.Sequential()
    model.add(
        Energy(
            sample_rate=sr,
            ref_duration=ref_duration,
            frame_length=frame_length,
            hop_length=hop_length,
            pad_end=False,
            data_format=data_format,
            input_shape=input_shape,
        )
    )

    energies_kapre = model.predict(batch_src)[0]

    # Keyword arguments: librosa >= 0.10 makes `frame_length`/`hop_length` keyword-only.
    frames_ref = librosa.util.frame(
        src_mono, frame_length=frame_length, hop_length=hop_length
    ).T  # (time, frame_length)
    nor_coeff = ref_duration / (frame_length / sr)
    energies_ref = nor_coeff * np.sum(frames_ref ** 2, axis=1)  # (time, )

    # Add the channel axis where the layer output has it.
    if data_format in (_CH_DEFAULT_STR, _CH_LAST_STR):
        energies_ref = np.expand_dims(energies_ref, axis=1)
    else:
        energies_ref = np.expand_dims(energies_ref, axis=0)

    np.testing.assert_allclose(energies_kapre, energies_ref, atol=1e-5)
Esempio n. 4
0
def test_save_load():
    """Round-trip several spectrogram layers through `save_load_compare`.

    Covered, in order: STFT, melspectrogram, log-frequency spectrogram,
    STFT magnitude+phase and STFT magnitude.
    """
    _, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)
    assert_allclose = np.testing.assert_allclose

    # Each entry is (layer builder, comparison function). Builders are lazy so that
    # every layer is created right before it is checked.
    cases = (
        # STFT, compared with the complex-number helper
        (lambda: STFT(input_shape=input_shape, pad_begin=True),
         allclose_complex_numbers),
        # melspectrogram
        (lambda: get_melspectrogram_layer(input_shape=input_shape, return_decibel=True),
         assert_allclose),
        # log frequency spectrogram
        (lambda: get_log_frequency_spectrogram_layer(input_shape=input_shape,
                                                     return_decibel=True),
         assert_allclose),
        # stft_mag_phase
        (lambda: get_stft_mag_phase(input_shape=input_shape, return_decibel=True),
         assert_allclose),
        # stft magnitude
        (lambda: get_stft_magnitude_layer(input_shape=input_shape),
         assert_allclose),
    )
    for build_layer, compare in cases:
        save_load_compare(build_layer(), batch_src, compare)
0
def test_save_load(data_format):
    """Round-trip Frame, Energy, mu-law and MFCC layers through `save_load_compare`.

    Args:
        data_format: only used to decide where the channel axis is inserted into
            the log-melspectrogram batch fed to `LogmelToMFCC`.
    """
    src_mono, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)
    # test Frame save/load
    save_load_compare(
        Frame(frame_length=128, hop_length=64, input_shape=input_shape),
        batch_src,
        np.testing.assert_allclose,
    )
    # test Energy save/load
    save_load_compare(
        Energy(frame_length=128, hop_length=64, input_shape=input_shape),
        batch_src,
        np.testing.assert_allclose,
    )
    # test mu law layers
    save_load_compare(
        MuLawEncoding(quantization_channels=128),
        batch_src,
        np.testing.assert_allclose,
    )
    # The decoder is fed integer values 0..255 shaped (1, 256, 1).
    save_load_compare(
        MuLawDecoding(quantization_channels=128),
        np.arange(0, 256, 1).reshape((1, 256, 1)),
        np.testing.assert_allclose,
    )
    # test mfcc layer
    # NOTE(review): the layer is built without `data_format` although the input
    # layout depends on it - confirm this is intended.
    expand_dim = (0, 3) if data_format in (_CH_LAST_STR, _CH_DEFAULT_STR) else (0, 1)
    # `y` is passed by keyword because librosa >= 0.10 no longer accepts it positionally.
    log_melgram = librosa.power_to_db(librosa.feature.melspectrogram(y=src_mono).T)
    save_load_compare(
        LogmelToMFCC(n_mfccs=10),
        np.expand_dims(log_melgram, expand_dim),
        np.testing.assert_allclose,
    )
Esempio n. 6
0
def test_log_spectrogram_fail():
    """Build a decibel log-frequency spectrogram layer with `log_n_bins=200`.

    NOTE(review): the name suggests this construction is expected to fail, but no
    failure check is visible in this function - presumably an expected-failure
    marker lives outside this block; confirm.
    """
    _, _, input_shape = get_audio(data_format='channels_last', n_ch=1)
    get_log_frequency_spectrogram_layer(
        input_shape,
        return_decibel=True,
        log_n_bins=200,
    )
Esempio n. 7
0
def test_mag_phase(data_format):
    """Compare the output of `get_stft_mag_phase` with librosa's `magphase`.

    The shapes of both results must match; only the first entry along the
    channel axis (the magnitude) is compared numerically.

    Args:
        data_format: data format used for the audio batch and for both the input
            and the output of the layer.
    """
    n_fft, hop_length, win_length = 512, 256, 512
    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=1)

    layer = get_stft_mag_phase(
        input_shape=input_shape,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        input_data_format=data_format,
        output_data_format=data_format,
    )
    model = tensorflow.keras.models.Sequential()
    model.add(layer)
    mag_phase_kapre = model(batch_src)[0]  # batch axis removed

    # Channel axis of the non-batch result.
    if data_format == 'channels_first':
        ch_axis = 0
    else:
        ch_axis = 2

    stft_ref = librosa.stft(
        src_mono, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False,
    ).T
    mag_phase_ref = np.stack(librosa.magphase(stft_ref), axis=ch_axis)

    np.testing.assert_equal(mag_phase_kapre.shape, mag_phase_ref.shape)

    # magnitude test
    magnitude_index = [0]
    magnitude_kapre = np.take(mag_phase_kapre, magnitude_index, axis=ch_axis)
    magnitude_ref = np.take(mag_phase_ref, magnitude_index, axis=ch_axis)
    np.testing.assert_allclose(magnitude_kapre, magnitude_ref, atol=2e-4)
Esempio n. 8
0
def placeholder(*args, **kwargs):
    """Prompt for search words, fetch matching voices and attach them to the root menu.

    Words are read from standard input until an empty line, Ctrl-C or end of input.
    Every result returned by `get_audio` gets its own submenu with a "Play" and a
    "Save" entry, and the collected submenu is appended to the global `root` menu.

    Args:
        *args: ignored.
        **kwargs: ignored.
    """
    global root, voices_menu
    words = []
    while True:
        try:
            word = input('type text to search:\n')
        except (KeyboardInterrupt, EOFError):
            # A closed stdin ends the prompt just like Ctrl-C instead of crashing.
            break
        if not word:
            break
        words.append(word)
    print('fetching...')
    voices = SelectionMenu([])
    found = 0  # stays 0 when nothing is returned
    for found, (text, audio_id) in enumerate(get_audio(words), start=1):
        sel = SelectionMenu([], 'Choose phrase')
        si = SubmenuItem(text[:60], sel, voices)
        sel.append_item(FunctionItem(f"Play \"{text[:80]}\"",
                                     play_wrapper(audio_id)))
        sel.append_item(FunctionItem(f"Save \"{text[:80]}\"",
                                     save_wrapper(audio_id)))
        voices.append_item(si)
    voices.title = 'Tip for long outputs: to exit press `1` and arrow up'
    string = '+'.join(words)
    submenu_item = SubmenuItem(f'Found {string} voices: {found}', voices, root)

    root.append_item(submenu_item)
Esempio n. 9
0
def test_perfectly_reconstructing_stft_istft(waveform_data_format,
                                             stft_data_format, hop_ratio):
    """Check that the STFT/ISTFT pair reconstructs its input in both orders.

    First a waveform goes through STFT -> ISTFT and is compared with the source.
    Then a librosa STFT goes through ISTFT -> STFT and is compared with itself,
    after trimming edge frames from both sides.

    Args:
        waveform_data_format: data format of the waveform batch.
        stft_data_format: data format of the STFT representation.
        hop_ratio: hop length as a fraction of `n_fft`.
    """
    n_ch = 1
    src_mono, batch_src, input_shape = get_audio(
        data_format=waveform_data_format, n_ch=n_ch)
    time_axis = 1 if waveform_data_format == 'channels_first' else 0  # non-batch!
    len_src = input_shape[time_axis]

    n_fft = 2048
    hop_length = int(n_fft * hop_ratio)
    n_added_frames = int(1 / hop_ratio) - 1

    stft, istft = get_perfectly_reconstructing_stft_istft(
        stft_input_shape=input_shape,
        n_fft=n_fft,
        hop_length=hop_length,
        waveform_data_format=waveform_data_format,
        stft_data_format=stft_data_format,
    )
    # Test - [STFT -> ISTFT]
    model = tf.keras.models.Sequential([stft, istft])

    recon_waveform = model(batch_src)

    # trim off the pad_begin part
    len_pad_begin = n_fft - hop_length
    if waveform_data_format == 'channels_first':
        recon_waveform = recon_waveform[:, :,
                                        len_pad_begin:len_pad_begin + len_src]
    else:
        recon_waveform = recon_waveform[:, len_pad_begin:len_pad_begin +
                                        len_src, :]

    np.testing.assert_allclose(batch_src, recon_waveform, atol=1e-5)

    # Test - [ISTFT -> STFT]
    S = librosa.stft(src_mono, n_fft=n_fft, hop_length=hop_length).T.astype(
        np.complex64)  # (time, freq)

    ch_axis = 1 if stft_data_format == 'channels_first' else 3  # batch shape
    S = np.expand_dims(S, (0, ch_axis))
    model = tf.keras.models.Sequential([istft, stft])
    recon_S = model(S)

    # trim off the frames coming from zero-pad result: `n_added_frames` on each
    # side of the reference and twice as many on each side of the reconstruction.
    n_trim_ref = n_added_frames
    n_trim_recon = 2 * n_added_frames
    # A stop of `-0` would select nothing, so use an open-ended slice when there
    # is nothing to trim. This keeps both arrays intact for `n_added_frames == 0`.
    ref_stop = -n_trim_ref or None
    recon_stop = -n_trim_recon or None
    if stft_data_format == 'channels_first':
        S = S[:, :, n_trim_ref:ref_stop, :]
        recon_S = recon_S[:, :, n_trim_recon:recon_stop, :]
    else:
        S = S[:, n_trim_ref:ref_stop, :, :]
        recon_S = recon_S[:, n_trim_recon:recon_stop, :, :]

    np.testing.assert_equal(S.shape, recon_S.shape)
    allclose_complex_numbers(S, recon_S)
Esempio n. 10
0
def test_spectrogram_correctness_more(data_format, window_name):
    """Check `STFT`, `Magnitude` and `Phase` for a given window against librosa.

    Args:
        data_format: data format used for the audio batch and for both the input
            and the output of the STFT layer.
        window_name: window name given to the STFT layer. For librosa the
            '_window' part is removed, and 'hann' is used when the name is empty.
    """
    n_fft = 512
    hop_length = 256
    n_ch = 2
    win_length = n_fft

    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch)

    def _get_stft_model(following_layer=None):
        """Return a Sequential model: the STFT layer plus an optional extra layer."""
        net = tensorflow.keras.models.Sequential()
        stft_layer = STFT(
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            window_name=window_name,
            pad_end=False,
            input_data_format=data_format,
            output_data_format=data_format,
            input_shape=input_shape,
            name='stft',
        )
        net.add(stft_layer)
        if following_layer is not None:
            net.add(following_layer)
        return net

    # reference computed with librosa
    if window_name:
        librosa_window = window_name.replace('_window', '')
    else:
        librosa_window = 'hann'
    S_ref = librosa.core.stft(
        src_mono,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=False,
        window=librosa_window,
    ).T  # (time, freq)

    # Repeat the mono reference over a new last (channel) axis.
    S_ref = np.tile(np.expand_dims(S_ref, axis=2), [1, 1, n_ch])  # time, freq, ch=n_ch
    if data_format == 'channels_first':
        S_ref = np.transpose(S_ref, (2, 0, 1))  # ch, time, freq

    # complex STFT
    S_complex = _get_stft_model().predict(batch_src)[0]  # 3d representation
    allclose_complex_numbers(S_ref, S_complex)

    # Magnitude()
    magnitude = _get_stft_model(Magnitude()).predict(batch_src)[0]  # 3d representation
    np.testing.assert_allclose(np.abs(S_ref), magnitude, atol=2e-4)

    # Phase(), compared with the angle of the layer's own complex output
    phase = _get_stft_model(Phase()).predict(batch_src)[0]  # 3d representation
    allclose_phase(np.angle(S_complex), phase)
Esempio n. 11
0
def test_save_load(save_format):
    """Round-trip spectrogram-related layers through `save_load_compare`.

    `STFT` and `ConcatenateFrequencyMap` are checked for every `save_format`.
    The layers returned by the `get_*` helpers are only checked when
    `save_format` is 'tf'.

    Args:
        save_format: format passed on to `save_load_compare`.
    """
    _, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)
    assert_allclose = np.testing.assert_allclose

    # STFT save/load
    stft_layer = STFT(input_shape=input_shape, pad_begin=True)
    save_load_compare(stft_layer, batch_src, allclose_complex_numbers, save_format, STFT)

    # ConcatenateFrequencyMap save/load, on a random float32 batch
    specs_batch = np.random.randn(2, 3, 5, 4).astype(np.float32)
    concat_layer = ConcatenateFrequencyMap(input_shape=specs_batch.shape[1:])
    save_load_compare(
        concat_layer, specs_batch, assert_allclose, save_format, ConcatenateFrequencyMap
    )

    if save_format != 'tf':
        return

    # Builders are lazy so that every layer is created right before it is checked.
    layer_builders = (
        # melspectrogram
        lambda: get_melspectrogram_layer(input_shape=input_shape, return_decibel=True),
        # log frequency spectrogram
        lambda: get_log_frequency_spectrogram_layer(input_shape=input_shape,
                                                    return_decibel=True),
        # stft_mag_phase
        lambda: get_stft_mag_phase(input_shape=input_shape, return_decibel=True),
        # stft magnitude
        lambda: get_stft_magnitude_layer(input_shape=input_shape),
    )
    for build_layer in layer_builders:
        save_load_compare(build_layer(), batch_src, assert_allclose, save_format)
Esempio n. 12
0
def test_save_load_channel_swap(data_format, save_format):
    """Round-trip a `ChannelSwap` layer through `save_load_compare`.

    Args:
        data_format: not used in this function; the audio is always requested
            as 'channels_last'.
        save_format: format passed on to `save_load_compare`.
    """
    _, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)
    layer = ChannelSwap(input_shape=input_shape)

    save_load_compare(
        layer,
        batch_src,
        np.testing.assert_allclose,
        save_format=save_format,
        layer_class=ChannelSwap,
        training=None,
    )
Esempio n. 13
0
def test_channel_swap_correctness(n_ch, data_format, data_type):
    """Check that `ChannelSwap` gives identical predictions on repeated calls.

    Args:
        n_ch: number of channels of the generated audio.
        data_format: data format of the generated audio.
        data_type: not used in the visible part of this function.
    """
    _, batch_src, input_shape = get_audio(
        data_format=data_format, n_ch=n_ch, length=256
    )

    model = tf.keras.Sequential()
    model.add(ChannelSwap(input_shape=input_shape))

    # consistent during inference: every later prediction must equal the first one
    kapre_ref = model.predict(batch_src)
    n_repeats = 100
    for _ in range(n_repeats):
        np.testing.assert_equal(kapre_ref, model.predict(batch_src))

    # Not used in the visible part of this function.
    if data_format == _CH_FIRST_STR:
        ch_axis = 1
    else:
        ch_axis = 2  # to be changed for 2d data type
Esempio n. 14
0
def test_log_spectrogram_runnable(data_format):
    """Build the log-frequency spectrogram layer with and without decibel output.

    Args:
        data_format: data format of the generated audio whose input shape is used.
    """
    _, _, input_shape = get_audio(data_format=data_format, n_ch=1)
    for return_decibel in (True, False):
        get_log_frequency_spectrogram_layer(input_shape, return_decibel=return_decibel)
Esempio n. 15
0
def test_melspectrogram_correctness(n_fft, sr, hop_length, n_ch, data_format,
                                    amin, dynamic_range, n_mels, mel_f_min,
                                    mel_f_max):
    """Test the correctness of melspectrogram.

    The layer returned by `get_melspectrogram_layer` is compared with
    `librosa.feature.melspectrogram`, first in linear scale and then in decibel.

    Note that mel filterbank is tested separately.

    Args:
        n_fft: FFT size; also used as the window length.
        sr: sample rate.
        hop_length: hop length.
        n_ch: number of audio channels; the mono reference is tiled to match.
        data_format: data format of the audio and of the layer input/output.
        amin: `amin` used for the decibel conversion.
        dynamic_range: dynamic range (librosa `top_db`) of the decibel conversion.
        n_mels: number of mel bins.
        mel_f_min: lowest mel filterbank frequency.
        mel_f_max: highest mel filterbank frequency.
    """
    def _get_melgram_model(return_decibel,
                           amin,
                           dynamic_range,
                           input_shape=None):
        # compute with kapre
        melgram_model = get_melspectrogram_layer(
            n_fft=n_fft,
            sample_rate=sr,
            n_mels=n_mels,
            mel_f_min=mel_f_min,
            mel_f_max=mel_f_max,
            win_length=win_length,
            hop_length=hop_length,
            input_data_format=data_format,
            output_data_format=data_format,
            return_decibel=return_decibel,
            input_shape=input_shape,
            db_amin=amin,
            db_dynamic_range=dynamic_range,
        )
        return melgram_model

    src_mono, batch_src, input_shape = get_audio(data_format=data_format,
                                                 n_ch=n_ch)

    win_length = n_fft
    # compute with librosa
    # `y` is passed by keyword because librosa >= 0.10 no longer accepts it positionally.
    S_ref = librosa.feature.melspectrogram(
        y=src_mono,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=False,
        power=1.0,
        n_mels=n_mels,
        fmin=mel_f_min,
        fmax=mel_f_max,
    ).T

    S_ref = np.expand_dims(S_ref, axis=2)  # time, freq, ch=1
    S_ref = np.tile(S_ref, [1, 1, n_ch])  # time, freq, ch=n_ch

    if data_format == 'channels_first':
        S_ref = np.transpose(S_ref, (2, 0, 1))  # ch, time, freq

    # melgram
    melgram_model = _get_melgram_model(return_decibel=False,
                                       input_shape=input_shape,
                                       amin=None,
                                       dynamic_range=120.0)
    S = melgram_model.predict(batch_src)[0]  # 3d representation
    np.testing.assert_allclose(S_ref, S, atol=1e-4)

    # log melgram
    melgram_model = _get_melgram_model(return_decibel=True,
                                       input_shape=input_shape,
                                       amin=amin,
                                       dynamic_range=dynamic_range)
    S = melgram_model.predict(batch_src)[0]  # 3d representation
    S_ref_db = librosa.power_to_db(S_ref,
                                   ref=1.0,
                                   amin=amin,
                                   top_db=dynamic_range)

    np.testing.assert_allclose(
        S_ref_db, S, rtol=3e-3)  # decibel is evaluated with relative tolerance
Esempio n. 16
0
def test_spectrogram_tflite_correctness(
    n_fft, hop_length, n_ch, data_format, batch_size, win_length, pad_end
):
    """Check that the tflite-compatible STFT layers agree with the regular ones.

    Compared in order: complex STFT, magnitude, approximate phase (tflite vs.
    regular) and approximate tflite phase vs. the exact regular phase.

    Args:
        n_fft: FFT size.
        hop_length: hop length.
        n_ch: number of audio channels.
        data_format: data format of the audio and of the layer input/output.
        batch_size: batch size requested from `get_audio`; afterwards replaced by
            the actual first dimension of the generated batch.
        win_length: window length.
        pad_end: `pad_end` option of the STFT layers.
    """
    def _get_stft_model(following_layer=None, tflite_compatible=False):
        """Return a Sequential model: an STFT layer plus an optional extra layer."""
        # Both variants take exactly the same arguments; only the class differs.
        stft_cls = STFTTflite if tflite_compatible else STFT
        net = tensorflow.keras.models.Sequential()
        net.add(
            stft_cls(
                n_fft=n_fft,
                win_length=win_length,
                hop_length=hop_length,
                window_name=None,
                pad_end=pad_end,
                input_data_format=data_format,
                output_data_format=data_format,
                input_shape=input_shape,
                name='stft',
            )
        )
        if following_layer is not None:
            net.add(following_layer)
        return net

    src_mono, batch_src, input_shape = get_audio(
        data_format=data_format, n_ch=n_ch, batch_size=batch_size
    )
    # tflite requires a known batch size
    batch_size = batch_src.shape[0]

    # --- STFT() ---
    lite_stft_model = _get_stft_model(tflite_compatible=True)
    plain_stft_model = _get_stft_model(tflite_compatible=False)

    lite_stft = predict_using_tflite(lite_stft_model, batch_src)
    # (batch, time, freq, chan, re/imag) -> complex (batch, time, freq, chan)
    lite_real = lite_stft[..., 0]
    lite_imag = lite_stft[..., 1]
    lite_stft_complex = tf.complex(lite_real, lite_imag)
    plain_stft_complex = plain_stft_model.predict(batch_src)
    allclose_complex_numbers(plain_stft_complex, lite_stft_complex)

    # --- Magnitude() ---
    lite_mag_model = _get_stft_model(MagnitudeTflite(), tflite_compatible=True)
    plain_mag_model = _get_stft_model(Magnitude(), tflite_compatible=False)
    lite_mag = predict_using_tflite(lite_mag_model, batch_src)
    plain_mag = plain_mag_model.predict(batch_src)
    np.testing.assert_allclose(plain_mag, lite_mag, atol=1e-4)

    # --- approximate Phase(): tflite and regular layers must agree ---
    lite_approx_phase_model = _get_stft_model(
        PhaseTflite(approx_atan_accuracy=500), tflite_compatible=True
    )
    plain_approx_phase_model = _get_stft_model(
        Phase(approx_atan_accuracy=500), tflite_compatible=False
    )
    lite_approx_phase = predict_using_tflite(lite_approx_phase_model, batch_src)
    plain_approx_phase = plain_approx_phase_model.predict(batch_src, batch_size=batch_size)
    assert_approx_phase(
        lite_approx_phase, plain_approx_phase, atol=1e-2, acceptable_fail_ratio=0.01
    )

    # --- accuracy of the approximate phase against the exact Phase() ---
    exact_phase_model = _get_stft_model(Phase(), tflite_compatible=False)
    exact_phase = exact_phase_model.predict(batch_src, batch_size=batch_size)
    assert_approx_phase(
        lite_approx_phase, exact_phase, atol=1e-2, acceptable_fail_ratio=0.01
    )