    def test_shapes_stereo(self):
        # Could do this for all models
        sounds = [torch.rand(48000, 2) for _ in range(3)]
        # Add silence to change the embeddings
        sounds[0][0:16000, :] = torch.zeros(16000, 2)
        sounds[1][16000:32000, :] = torch.zeros(16000, 2)
        sounds[2][32000:48000, :] = torch.zeros(16000, 2)
        emb0, ts0 = torchopenl3.get_audio_embedding(sounds,
                                                    48000,
                                                    batch_size=32,
                                                    sampler="resampy")

        emb0 = torch.stack(emb0)
        ts0 = np.vstack(ts0)
        assert emb0.shape == (3, 6, 6144)
        # assert ts0.shape == (3, 6, 1)

        emb1, ts1 = torchopenl3.get_audio_embedding(torch.stack(sounds),
                                                    48000,
                                                    batch_size=32,
                                                    sampler="resampy")
        assert emb1.shape == (3, 6, 6144)
        # assert ts1.shape == (3, 6, 1)

        assert torch.mean(torch.abs(emb1 - emb0)) <= 1e-6
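
The snippets in this listing are test methods excerpted from a larger file; they rely on imports and two helpers, `to_numpy` and `T`, that the excerpt never shows. A minimal sketch of what they presumably look like (the helper bodies are assumptions inferred from how they are used, not the project's actual definitions):

import numpy as np
import soundfile as sf
import torch

import openl3
import torchopenl3


def to_numpy(x):
    # Convert a tensor (or a list of tensors/arrays) to numpy.
    if isinstance(x, torch.Tensor):
        return x.detach().cpu().numpy()
    if isinstance(x, (list, tuple)):
        return [to_numpy(e) for e in x]
    return np.asarray(x)


def T(x):
    # Coerce numpy arrays to torch tensors; pass tensors through unchanged.
    if isinstance(x, torch.Tensor):
        return x
    return torch.as_tensor(np.asarray(x))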
Example #2

    def check_model_for_regression(self, modelparams, filenames):
        audios = []
        srs = []
        for filename in filenames:
            audio, sr = sf.read(filename)
            audios.append(audio)
            srs.append(sr)
        n = len(filenames)
        embeddings0, ts0 = openl3.get_audio_embedding(audios,
                                                      srs,
                                                      batch_size=32,
                                                      **modelparams)
        embeddings1, ts1 = openl3.get_audio_embedding(audios,
                                                      srs,
                                                      batch_size=32,
                                                      **modelparams)
        # This is just a sanity check that openl3
        # gives consistent results; we can remove
        # it later.
        for i in range(n):
            assert np.mean(np.abs(embeddings1[i] - embeddings0[i])) <= 1e-6
            assert np.mean(np.abs(ts1[i] - ts0[i])) <= 1e-6
        embeddings2, ts2 = torchopenl3.get_audio_embedding(audios,
                                                           srs,
                                                           batch_size=32,
                                                           **modelparams)
        # The tolerance is loose here because kapre (in openl3) and
        # nnAudio (in torchopenl3) compute spectrograms differently,
        # which increases the mean error. We expect much closer agreement
        # once the pretrained weights are ported.
        for i in range(n):
            assert np.mean(np.abs(embeddings1[i] - embeddings2[i])) <= 2
            assert np.mean(np.abs(ts1[i] - ts2[i])) <= 2
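
check_model_for_regression is parameterized by modelparams and filenames, which the excerpt never constructs. A hedged sketch of a driver (the fixture paths and the test_regression name are hypothetical, not the project's actual code):

class TestRegression:
    # ... check_model_for_regression as defined above ...

    def test_regression(self):
        # Hypothetical fixture paths; substitute real test audio.
        filenames = ["tests/data/chirp_mono.wav", "tests/data/chirp_44k.wav"]
        for modelparams in (
            dict(input_repr="mel128", content_type="music", embedding_size=512),
            dict(input_repr="mel256", content_type="env", embedding_size=6144),
        ):
            self.check_model_for_regression(modelparams, filenames)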
Example #3
    def test_timestamps_stereo(self):
        # Could do this for all models
        sounds = [torch.rand(48000, 2) for _ in range(3)]
        # Add silence to change the embeddings
        sounds[0][0:16000, :] = torch.zeros(16000, 2)
        sounds[1][16000:32000, :] = torch.zeros(16000, 2)
        sounds[2][32000:48000, :] = torch.zeros(16000, 2)

        for params in paramlist:
            emb0, ts0 = torchopenl3.get_audio_embedding(sounds,
                                                        48000,
                                                        batch_size=32,
                                                        sampler="resampy",
                                                        **params)
            # Stack per-clip timestamps and convert to a tensor so they
            # can be compared against the batched (tensor) output below.
            ts0 = torch.as_tensor(np.vstack(ts0))
            emb1, ts1 = torchopenl3.get_audio_embedding(torch.stack(sounds),
                                                        48000,
                                                        batch_size=32,
                                                        sampler="resampy",
                                                        **params)
            assert torch.mean(torch.abs(ts1 - ts0)) <= 1e-6
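
`paramlist` is not defined anywhere in the excerpt. Given the model options exercised in test_get_audio_embedding below, a plausible definition is the full option grid (this is an assumption, not the project's actual list):

import itertools

# Assumed: every (input_repr, content_type, embedding_size) combination.
paramlist = [
    dict(input_repr=i, content_type=c, embedding_size=s)
    for i, c, s in itertools.product(
        ("linear", "mel128", "mel256"), ("music", "env"), (512, 6144))
]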
Example #4
    def check_model_for_regression(self, modelparams, filenames):
        audios = []
        srs = []
        for filename in filenames:
            audio, sr = sf.read(filename)
            audios.append(audio)
            srs.append(sr)
        n = len(filenames)
        embeddings0, ts0 = openl3.get_audio_embedding(audios,
                                                      srs,
                                                      batch_size=32,
                                                      **modelparams)
        embeddings1, ts1 = openl3.get_audio_embedding(audios,
                                                      srs,
                                                      batch_size=32,
                                                      **modelparams)

        # This is just a sanity check that openl3
        # gives consistent results; we can remove
        # it later.
        for i in range(n):
            assert embeddings1[i].shape == embeddings0[i].shape
            assert torch.mean(
                torch.abs(T(embeddings1[i]) - T(embeddings0[i]))) <= 1e-6
            assert torch.mean(torch.abs(T(ts1[i]) - T(ts0[i]))) <= 1e-6
        embeddings2, ts2 = torchopenl3.get_audio_embedding(audios,
                                                           srs,
                                                           batch_size=32,
                                                           sampler="resampy",
                                                           **modelparams)
        # The embedding tolerance is loose here because kapre (in openl3)
        # and nnAudio (in torchopenl3) compute spectrograms differently,
        # which increases the mean error. We expect much closer agreement
        # once the pretrained weights are ported.
        for i in range(n):
            print(embeddings1[i].shape, embeddings2[i].shape)
            print(torch.mean(torch.abs(T(embeddings1[i]) - T(embeddings2[i]))))
            print(torch.mean(torch.abs(T(ts1[i]) - T(ts2[i]))))
            assert embeddings1[i].shape == embeddings2[i].shape
            assert torch.mean(
                torch.abs(T(embeddings1[i]) - T(embeddings2[i]))) <= 1e-2
            assert torch.mean(torch.abs(T(ts1[i]) - T(ts2[i]))) <= 1e-6
Example #5
def test_get_audio_embedding():
    hop_size = 0.1
    tol = 1e-5

    # Make sure every embedding variant works: all combinations of
    # input representation, content type, and embedding size.
    audio, sr = sf.read(CHIRP_MONO_PATH)
    for input_repr in ("linear", "mel128", "mel256"):
        for content_type in ("music", "env"):
            for embedding_size in (512, 6144):
                emb1, ts1 = torchopenl3.get_audio_embedding(
                    audio,
                    sr,
                    input_repr=input_repr,
                    content_type=content_type,
                    embedding_size=embedding_size,
                    center=True,
                    hop_size=hop_size,
                    verbose=True,
                )
                emb1, ts1 = to_numpy(emb1), to_numpy(ts1)
                assert np.all(np.abs(np.diff(ts1) - hop_size) < tol)
                assert emb1.shape[-1] == embedding_size
                assert not np.any(np.isnan(emb1))

    # Recompute the "linear"/"env"/512 embedding explicitly; it is the
    # reference for the model-loading check below.
    emb1, ts1 = torchopenl3.get_audio_embedding(
        audio,
        sr,
        input_repr="linear",
        content_type="env",
        embedding_size=512,
        center=True,
        hop_size=hop_size,
        verbose=True,
    )
    emb1, ts1 = to_numpy(emb1), to_numpy(ts1)

    # Make sure we can load a model and pass it in
    model = torchopenl3.models.load_audio_embedding_model("linear", "env", 512)
    emb1load, ts1load = torchopenl3.get_audio_embedding(audio,
                                                        sr,
                                                        model=model,
                                                        center=True,
                                                        hop_size=hop_size,
                                                        verbose=True)
    emb1load, ts1load = to_numpy(emb1load), to_numpy(ts1load)
    assert np.all(np.abs(emb1load - emb1) < tol)
    assert np.all(np.abs(ts1load - ts1) < tol)
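
The block above also illustrates the intended usage pattern: load the model once and pass it to every call instead of re-specifying input_repr/content_type/embedding_size. A minimal sketch (the file paths are hypothetical):

# Load once, reuse across files; avoids re-instantiating weights per call.
model = torchopenl3.models.load_audio_embedding_model("linear", "env", 512)
for path in ("a.wav", "b.wav"):  # hypothetical paths
    audio, sr = sf.read(path)
    emb, ts = torchopenl3.get_audio_embedding(audio, sr, model=model)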

    # Make sure that the embeddings are approximately the same with mono and stereo
    audio, sr = sf.read(CHIRP_STEREO_PATH)
    emb2, ts2 = torchopenl3.get_audio_embedding(audio,
                                                sr,
                                                model=model,
                                                center=True,
                                                hop_size=0.1,
                                                verbose=True)
    emb2, ts2 = to_numpy(emb2), to_numpy(ts2)
    # assert np.all(np.abs(emb1 - emb2) < tol)
    # assert np.all(np.abs(ts1 - ts2) < tol)
    assert not np.any(np.isnan(emb2))

    # Make sure that the embeddings are approximately the same if we resample the audio
    audio, sr = sf.read(CHIRP_44K_PATH)
    emb3, ts3 = torchopenl3.get_audio_embedding(audio,
                                                sr,
                                                model=model,
                                                center=True,
                                                hop_size=0.1,
                                                verbose=True)
    emb3, ts3 = to_numpy(emb3), to_numpy(ts3)
    # assert np.all(np.abs(emb1 - emb3) < tol)
    # assert np.all(np.abs(ts1 - ts3) < tol)
    assert not np.any(np.isnan(emb3))

    # Check for centering
    audio, sr = sf.read(CHIRP_1S_PATH)
    emb6, _ = torchopenl3.get_audio_embedding(audio,
                                              sr,
                                              model=model,
                                              center=True,
                                              hop_size=hop_size,
                                              verbose=True)
    n_frames = 1 + int(
        (audio.shape[0] + sr // 2 - sr) / float(int(hop_size * sr)))
    assert emb6.shape[1] == n_frames

    emb7, _ = torchopenl3.get_audio_embedding(
        audio,
        sr,
        model=model,
        center=False,
        hop_size=hop_size,
        verbose=True,
    )
    n_frames = 1 + int((audio.shape[0] - sr) / float(int(hop_size * sr)))
    assert emb7.shape[1] == n_frames

    # Check for hop size
    hop_size = 0.2
    emb8, _ = torchopenl3.get_audio_embedding(
        audio,
        sr,
        model=model,
        center=False,
        hop_size=hop_size,
        verbose=True,
    )
    n_frames = 1 + int((audio.shape[0] - sr) / float(int(hop_size * sr)))
    assert emb8.shape[1] == n_frames
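
The expected frame count is computed inline with the same formula throughout this test. A small helper makes the arithmetic explicit (the helper name is ours, not part of either library):

def expected_n_frames(n_samples, sr, hop_size, center):
    # torchopenl3 analyzes 1 s windows every hop_size seconds; with
    # center=True the signal is padded by half a window (sr // 2 samples).
    if center:
        n_samples = n_samples + sr // 2
    return 1 + int((n_samples - sr) / float(int(hop_size * sr)))

# Worked example: 48000 samples at 48 kHz, hop_size=0.1, center=True:
# 1 + int((48000 + 24000 - 48000) / 4800) = 6 frames, matching the
# (3, 6, 6144) embedding shapes asserted in the stereo tests above.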

    # Check batch processing with multiple files with a single sample rate
    audio, sr = sf.read(CHIRP_MONO_PATH)
    hop_size = 0.1
    emb_list, ts_list = torchopenl3.get_audio_embedding(
        [audio, audio],
        sr,
        model=model,
        center=True,
        hop_size=hop_size,
        batch_size=4,
    )
    n_frames = 1 + int(
        (audio.shape[0] + sr // 2 - sr) / float(int(hop_size * sr)))
    assert len(emb_list) == 2
    assert len(ts_list) == 2
    assert emb_list[0].shape[0] == n_frames
    assert np.allclose(to_numpy(emb_list[0]), to_numpy(emb_list[1]))
    assert np.allclose(ts_list[0], ts_list[1])

    # Check batch processing with multiple files with individually given sample rates
    emb_list, ts_list = torchopenl3.get_audio_embedding(
        [audio, audio],
        [sr, sr],
        model=model,
        center=True,
        hop_size=hop_size,
        batch_size=4,
    )
    n_frames = 1 + int(
        (audio.shape[0] + sr // 2 - sr) / float(int(hop_size * sr)))
    assert type(emb_list) == list
    assert type(ts_list) == list
    assert len(emb_list) == 2
    assert len(ts_list) == 2
    assert emb_list[0].shape[0] == n_frames
    assert np.allclose(to_numpy(emb_list[0]), to_numpy(emb_list[1]))
    assert np.allclose(ts_list[0], ts_list[1])

    # Check batch processing with multiple files with different sample rates
    emb_list, ts_list = torchopenl3.get_audio_embedding(
        [audio, audio],
        [sr, sr / 2],
        model=model,
        center=True,
        hop_size=hop_size,
        batch_size=4,
    )
    n_frames = 1 + int(
        (audio.shape[0] + sr // 2 - sr) / float(int(hop_size * sr)))
    n_frames_2 = 1 + int(
        (audio.shape[0] + sr // 4 - sr / 2) / float(int(hop_size * sr / 2)))
    assert type(emb_list) == list
    assert type(ts_list) == list
    assert len(emb_list) == 2
    assert len(ts_list) == 2
    assert emb_list[0].shape[0] == n_frames
    assert emb_list[1].shape[0] == n_frames_2
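
The CHIRP_* fixture paths are defined outside the excerpt. For completeness, a sketch of how comparable fixtures could be synthesized (paths and chirp parameters are assumptions, not the project's actual test data):

import numpy as np
import soundfile as sf
from scipy.signal import chirp

def write_chirp(path, sr, duration, stereo=False):
    # Linear sine sweep from 20 Hz up toward (but below) Nyquist.
    t = np.linspace(0.0, duration, int(sr * duration), endpoint=False)
    x = chirp(t, f0=20.0, f1=0.4 * sr, t1=duration, method="linear")
    if stereo:
        x = np.stack([x, x], axis=-1)
    sf.write(path, x, sr)

write_chirp("chirp_mono.wav", 48000, 5.0)
write_chirp("chirp_stereo.wav", 48000, 5.0, stereo=True)
write_chirp("chirp_44k.wav", 44100, 5.0)
write_chirp("chirp_1s.wav", 48000, 1.0)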