Beispiel #1
0
def extract_openl3_features(paths, path2gt):
    """Extract OpenL3 embeddings with matching ground truth and identifiers.

    Every audio file is embedded with a pre-loaded OpenL3 model
    (``input_repr="mel128"``, ``content_type="music"``,
    ``embedding_size=512``) using ``hop_size=1``, i.e. one embedding per
    second of audio. The ground truth and the path of each file are repeated
    once per embedding row so the three returned arrays line up row by row.

    Args:
        paths: Iterable of audio file paths; each one is appended to
            ``config['audio_folder']`` to locate the file.
        path2gt: Mapping from a path in ``paths`` to its ground truth.

    Returns:
        A list ``[features, ground_truth, identifiers]`` where ``features``
        stacks the embeddings of all files along axis 0, and
        ``ground_truth`` / ``identifiers`` hold the repeated ground truth and
        path for every embedding row.

    Raises:
        ValueError: If ``paths`` is empty (there is nothing to stack).
    """
    # Materialize once so generators are supported and emptiness can be
    # detected before paying for the model load.
    paths = list(paths)
    if not paths:
        raise ValueError("paths must contain at least one audio file")

    model = openl3.models.load_embedding_model(input_repr="mel128",
                                               content_type="music",
                                               embedding_size=512)

    # Collect per-file chunks and concatenate once at the end; concatenating
    # inside the loop would re-copy everything accumulated so far per file.
    feature_chunks = []
    ground_truth_chunks = []
    identifier_chunks = []
    for p in paths:
        wave, sr = wavefile_to_waveform(config['audio_folder'] + p, 'openl3')
        emb, _ = openl3.get_embedding(wave,
                                      sr,
                                      hop_size=1,
                                      model=model,
                                      verbose=False)
        n_rows = emb.shape[0]
        feature_chunks.append(emb)
        ground_truth_chunks.append(np.repeat(path2gt[p], n_rows, axis=0))
        identifier_chunks.append(np.repeat(p, n_rows, axis=0))

    features = np.concatenate(feature_chunks, axis=0)
    ground_truth = np.concatenate(ground_truth_chunks, axis=0)
    identifiers = np.concatenate(identifier_chunks, axis=0)

    return [features, ground_truth, identifiers]
def extract_other_features(paths, path2gt, model_type):
    """Extract MusiCNN or OpenL3 features with ground truth and identifiers.

    For ``model_type == 'openl3'`` every file is embedded with a pre-loaded
    OpenL3 model (``input_repr="mel128"``, ``content_type="music"``,
    ``embedding_size=512``) using ``hop_size=1``.

    For ``model_type == 'musicnn'`` every file is passed to ``extractor``
    with ``model='MSD_musicnn'``, ``extract_features=True`` and
    ``input_overlap=1``; the ``'max_pool'`` entry of the extracted features
    is used as the embedding.

    The ground truth and the path of each file are repeated once per feature
    row so the three returned arrays line up row by row.

    Args:
        paths: Iterable of audio file paths; each one is appended to
            ``config['audio_folder']`` to locate the file.
        path2gt: Mapping from a path in ``paths`` to its ground truth.
        model_type: Either ``'musicnn'`` or ``'openl3'``.

    Returns:
        A list ``[features, ground_truth, identifiers]`` where ``features``
        stacks the features of all files along axis 0, and ``ground_truth`` /
        ``identifiers`` hold the repeated ground truth and path for every
        feature row.

    Raises:
        ValueError: If ``model_type`` is not supported or ``paths`` is empty.
    """
    # Fail early with a clear message instead of an UnboundLocalError deep
    # inside the loop when no branch assigns the embedding.
    if model_type not in ('musicnn', 'openl3'):
        raise ValueError(
            "Unsupported model_type: {!r} "
            "(expected 'musicnn' or 'openl3')".format(model_type))

    # Materialize once so generators are supported and emptiness can be
    # detected before paying for any model load.
    paths = list(paths)
    if not paths:
        raise ValueError("paths must contain at least one audio file")

    model = None
    if model_type == 'openl3':
        # Load the OpenL3 model once and reuse it for every file.
        model = openl3.models.load_embedding_model(input_repr="mel128",
                                                   content_type="music",
                                                   embedding_size=512)

    # Collect per-file chunks and concatenate once at the end; concatenating
    # inside the loop would re-copy everything accumulated so far per file.
    feature_chunks = []
    ground_truth_chunks = []
    identifier_chunks = []
    for p in paths:
        if model_type == 'musicnn':
            taggram, _, extracted_features = extractor(
                config['audio_folder'] + p,
                model='MSD_musicnn',
                extract_features=True,
                input_overlap=1)
            # Or choose any other layer, for example: emb = taggram
            # Documentation: https://github.com/jordipons/musicnn/blob/master/DOCUMENTATION.md
            emb = extracted_features['max_pool']
        else:  # model_type == 'openl3' (validated above)
            wave, sr = wavefile_to_waveform(config['audio_folder'] + p,
                                            'openl3')
            emb, _ = openl3.get_embedding(wave,
                                          sr,
                                          hop_size=1,
                                          model=model,
                                          verbose=False)

        n_rows = emb.shape[0]
        feature_chunks.append(emb)
        ground_truth_chunks.append(np.repeat(path2gt[p], n_rows, axis=0))
        identifier_chunks.append(np.repeat(p, n_rows, axis=0))

    features = np.concatenate(feature_chunks, axis=0)
    ground_truth = np.concatenate(ground_truth_chunks, axis=0)
    identifiers = np.concatenate(identifier_chunks, axis=0)

    return [features, ground_truth, identifiers]
Beispiel #3
0
def test_get_embedding():
    """Exercise ``openl3.get_embedding`` across configurations and edge cases.

    Covered, in order:

    * every (input_repr, content_type, embedding_size) combination on the
      mono chirp fixture;
    * the stereo and 44K chirp fixtures;
    * empty, too-short and silent audio;
    * the number of frames produced with/without centering and with a
      different hop size;
    * the ``verbose=0`` code path;
    * rejection of invalid arguments.
    """
    tol = 1e-5
    hop_size = 0.1

    # Configuration used by a call unless it overrides specific keywords.
    defaults = {
        "input_repr": "mel256",
        "content_type": "music",
        "embedding_size": 6144,
        "center": True,
        "hop_size": 0.1,
        "verbose": 1,
    }

    def embed(audio, sr, **overrides):
        """Call ``openl3.get_embedding`` with ``defaults`` plus overrides."""
        kwargs = dict(defaults)
        kwargs.update(overrides)
        return openl3.get_embedding(audio, sr, **kwargs)

    def expected_frames(n_samples, sr, hop, center):
        """Return the frame count this test expects for the given setup."""
        # Centering contributes half a second of extra samples to the count.
        padding = 0.5 * sr if center else 0
        return 1 + int((n_samples + padding - sr) / int(hop * sr))

    # Make sure all embedding types work fine
    configurations = [
        ("mel256", "music", 512),
        ("mel128", "music", 512),
        ("mel128", "music", 6144),
        ("linear", "music", 512),
        ("linear", "music", 6144),
        ("mel256", "env", 512),
        ("mel256", "env", 6144),
        ("mel128", "env", 512),
        ("mel128", "env", 6144),
        ("linear", "env", 512),
        ("linear", "env", 6144),
        ("mel256", "music", 6144),
    ]
    audio, sr = sf.read(CHIRP_MONO_PATH)
    for input_repr, content_type, embedding_size in configurations:
        emb, ts = embed(audio,
                        sr,
                        input_repr=input_repr,
                        content_type=content_type,
                        embedding_size=embedding_size,
                        hop_size=hop_size)
        # Consecutive timestamps must be spaced by the hop size.
        assert np.all(np.abs(np.diff(ts) - hop_size) < tol)
        assert emb.shape[1] == embedding_size
        assert not np.any(np.isnan(emb))

    # Stereo and 44K inputs must produce NaN-free embeddings.
    # TODO: the comparisons of these embeddings/timestamps against the mono
    # mel256/music/6144 result (within ``tol``) were disabled in this test;
    # re-enable them once they are expected to hold.
    for fixture_path in (CHIRP_STEREO_PATH, CHIRP_44K_PATH):
        audio, sr = sf.read(fixture_path)
        emb, _ = embed(audio, sr)
        assert not np.any(np.isnan(emb))

    # Make sure empty audio is handled
    audio, sr = sf.read(EMPTY_PATH)
    with pytest.raises(OpenL3Error):
        embed(audio, sr)

    # Make sure user is warned when audio is too short, and that the short
    # audio still yields exactly one frame starting at time zero
    audio, sr = sf.read(SHORT_PATH)
    with pytest.warns(OpenL3Warning):
        emb_short, ts_short = embed(audio, sr, center=False)
    assert emb_short.shape[0] == 1
    assert emb_short.shape[1] == 6144
    assert len(ts_short) == 1
    assert ts_short[0] == 0
    assert not np.any(np.isnan(emb_short))

    # Make sure silence is handled (warns, but still returns an embedding)
    audio, sr = sf.read(SILENCE_PATH)
    with pytest.warns(OpenL3Warning):
        emb_silence, _ = embed(audio, sr)
    assert emb_silence.shape[1] == 6144
    assert not np.any(np.isnan(emb_silence))

    # Check for centering
    audio, sr = sf.read(CHIRP_1S_PATH)
    n_samples = audio.shape[0]
    for center in (True, False):
        emb, _ = embed(audio, sr, center=center, hop_size=hop_size)
        assert emb.shape[0] == expected_frames(n_samples, sr, hop_size,
                                               center)

    # Check for hop size
    hop_size = 0.2
    emb, _ = embed(audio, sr, hop_size=hop_size)
    assert emb.shape[0] == expected_frames(n_samples, sr, hop_size, True)

    # Make sure changing verbosity doesn't break
    embed(audio, sr, hop_size=hop_size, verbose=0)

    # Make sure invalid arguments don't work
    invalid_overrides = [
        {"input_repr": "invalid"},
        {"content_type": "invalid"},
        {"content_type": "invalid", "embedding_size": 42},
        # Unsupported integer size on its own, so the check is not masked by
        # the invalid content type in the case above.
        {"embedding_size": 42},
        {"embedding_size": "invalid"},
        {"hop_size": 0},
        {"hop_size": -1},
        {"verbose": -1},
        {"center": "invalid"},
    ]
    for overrides in invalid_overrides:
        with pytest.raises(OpenL3Error):
            embed(audio, sr, **overrides)

    # Audio arrays with more than two dimensions must be rejected
    with pytest.raises(OpenL3Error):
        embed(np.ones((10, 10, 10)), sr)