def extract_openl3_features(paths, path2gt):
    """Extract OpenL3 features with their ground truth and identifiers (the path).

    OpenL3 features are extracted from non-overlapping audio patches of 1 second,
    where each audio patch covers 128 mel bands. Ground truth and identifiers are
    repeated to fit the number of extracted OpenL3 features, so all three outputs
    stay row-aligned.

    Args:
        paths: iterable of audio file paths, relative to config['audio_folder'].
        path2gt: mapping from each path to its ground-truth label (array-like;
            np.repeat with axis=0 is applied to it — TODO confirm shape).

    Returns:
        [features, ground_truth, identifiers] — numpy arrays with matching
        first dimensions.
    """
    model = openl3.models.load_embedding_model(
        input_repr="mel128", content_type="music", embedding_size=512)
    # Accumulate per-file chunks and concatenate ONCE at the end.
    # The previous version concatenated inside the loop, copying the whole
    # accumulated array on every iteration (O(n^2) total work).
    feature_chunks = []
    gt_chunks = []
    id_chunks = []
    for p in paths:
        wave, sr = wavefile_to_waveform(config['audio_folder'] + p, 'openl3')
        emb, _ = openl3.get_embedding(wave, sr, hop_size=1, model=model,
                                      verbose=False)
        feature_chunks.append(emb)
        # Repeat the label and the path once per extracted embedding frame.
        gt_chunks.append(np.repeat(path2gt[p], emb.shape[0], axis=0))
        id_chunks.append(np.repeat(p, emb.shape[0], axis=0))
    features = np.concatenate(feature_chunks, axis=0)
    ground_truth = np.concatenate(gt_chunks, axis=0)
    identifiers = np.concatenate(id_chunks, axis=0)
    return [features, ground_truth, identifiers]
def extract_other_features(paths, path2gt, model_type):
    """Extract MusiCNN or OpenL3 features with ground truth and identifiers.

    OpenL3 features are extracted from non-overlapping audio patches of 1 second,
    where each audio patch covers 128 mel bands. MusiCNN features are extracted
    from non-overlapping audio patches of 1 second, where each audio patch covers
    96 mel bands. Ground truth and identifiers are repeated to fit the number of
    extracted features, so all three outputs stay row-aligned.

    Args:
        paths: iterable of audio file paths, relative to config['audio_folder'].
        path2gt: mapping from each path to its ground-truth label (array-like;
            np.repeat with axis=0 is applied to it — TODO confirm shape).
        model_type: 'musicnn' or 'openl3'.

    Returns:
        [features, ground_truth, identifiers] — numpy arrays with matching
        first dimensions.

    Raises:
        ValueError: if model_type is not 'musicnn' or 'openl3'. (Previously an
            unknown model_type surfaced as an UnboundLocalError on `emb`.)
    """
    if model_type == 'openl3':
        model = openl3.models.load_embedding_model(
            input_repr="mel128", content_type="music", embedding_size=512)
    elif model_type != 'musicnn':
        raise ValueError(
            "model_type must be 'musicnn' or 'openl3', got %r" % (model_type,))
    # Accumulate per-file chunks and concatenate ONCE at the end.
    # The previous version concatenated inside the loop, copying the whole
    # accumulated array on every iteration (O(n^2) total work).
    feature_chunks = []
    gt_chunks = []
    id_chunks = []
    for p in paths:
        if model_type == 'musicnn':
            taggram, tags, extracted_features = extractor(
                config['audio_folder'] + p, model='MSD_musicnn',
                extract_features=True, input_overlap=1)
            # or choose any other layer, for example: emb = taggram
            # Documentation: https://github.com/jordipons/musicnn/blob/master/DOCUMENTATION.md
            emb = extracted_features['max_pool']
        else:  # model_type == 'openl3'
            wave, sr = wavefile_to_waveform(config['audio_folder'] + p, 'openl3')
            emb, _ = openl3.get_embedding(wave, sr, hop_size=1, model=model,
                                          verbose=False)
        feature_chunks.append(emb)
        # Repeat the label and the path once per extracted embedding frame.
        gt_chunks.append(np.repeat(path2gt[p], emb.shape[0], axis=0))
        id_chunks.append(np.repeat(p, emb.shape[0], axis=0))
    features = np.concatenate(feature_chunks, axis=0)
    ground_truth = np.concatenate(gt_chunks, axis=0)
    identifiers = np.concatenate(id_chunks, axis=0)
    return [features, ground_truth, identifiers]
def test_get_embedding():
    """End-to-end checks for openl3.get_embedding.

    Covers: every (input_repr, content_type, embedding_size) combination,
    mono/stereo/resampled input, empty/short/silent audio, centering, hop size,
    verbosity, and argument validation.
    """
    hop_size = 0.1
    tol = 1e-5

    def _check_embedding(audio, sr, input_repr, content_type, embedding_size):
        # Run get_embedding with centering at the default hop and sanity-check
        # the output: evenly spaced timestamps, expected width, no NaNs.
        emb, ts = openl3.get_embedding(audio, sr,
                                       input_repr=input_repr,
                                       content_type=content_type,
                                       embedding_size=embedding_size,
                                       center=True, hop_size=hop_size,
                                       verbose=1)
        assert np.all(np.abs(np.diff(ts) - hop_size) < tol)
        assert emb.shape[1] == embedding_size
        assert not np.any(np.isnan(emb))
        return emb, ts

    # Make sure all embedding types work fine: all 12 combinations of
    # input representation x content type x embedding size (the original
    # copy-pasted stanzas covered exactly this product).
    audio, sr = sf.read(CHIRP_MONO_PATH)
    combos = [(ir, ct, sz)
              for ir in ("mel256", "mel128", "linear")
              for ct in ("music", "env")
              for sz in (512, 6144)]
    # Evaluate mel256/music/6144 last so emb1/ts1 refer to it in the
    # (currently commented-out) comparisons below.
    combos.remove(("mel256", "music", 6144))
    combos.append(("mel256", "music", 6144))
    for input_repr, content_type, embedding_size in combos:
        emb1, ts1 = _check_embedding(audio, sr, input_repr, content_type,
                                     embedding_size)

    # Make sure that the embeddings are approximately the same with mono and stereo
    audio, sr = sf.read(CHIRP_STEREO_PATH)
    emb2, ts2 = openl3.get_embedding(audio, sr,
                                     input_repr="mel256", content_type="music",
                                     embedding_size=6144, center=True,
                                     hop_size=0.1, verbose=1)
    # assert np.all(np.abs(emb1 - emb2) < tol)
    # assert np.all(np.abs(ts1 - ts2) < tol)
    assert not np.any(np.isnan(emb2))

    # Make sure that the embeddings are approximately the same if we resample the audio
    audio, sr = sf.read(CHIRP_44K_PATH)
    emb3, ts3 = openl3.get_embedding(audio, sr,
                                     input_repr="mel256", content_type="music",
                                     embedding_size=6144, center=True,
                                     hop_size=0.1, verbose=1)
    # assert np.all(np.abs(emb1 - emb3) < tol)
    # assert np.all(np.abs(ts1 - ts3) < tol)
    assert not np.any(np.isnan(emb3))

    # Make sure empty audio is handled
    audio, sr = sf.read(EMPTY_PATH)
    pytest.raises(OpenL3Error, openl3.get_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=6144, center=True, hop_size=0.1, verbose=1)

    # Make sure user is warned when audio is too short
    audio, sr = sf.read(SHORT_PATH)
    pytest.warns(OpenL3Warning, openl3.get_embedding, audio, sr,
                 input_repr="mel256", content_type="music",
                 embedding_size=6144, center=False, hop_size=0.1, verbose=1)

    # Make sure short audio can be handled
    emb4, ts4 = openl3.get_embedding(audio, sr,
                                     input_repr="mel256", content_type="music",
                                     embedding_size=6144, center=False,
                                     hop_size=0.1, verbose=1)
    assert emb4.shape[0] == 1
    assert emb4.shape[1] == 6144
    assert len(ts4) == 1
    assert ts4[0] == 0
    assert not np.any(np.isnan(emb4))

    # Make sure silence is handled
    audio, sr = sf.read(SILENCE_PATH)
    pytest.warns(OpenL3Warning, openl3.get_embedding, audio, sr,
                 input_repr="mel256", content_type="music",
                 embedding_size=6144, center=True, hop_size=0.1, verbose=1)
    emb5, ts5 = openl3.get_embedding(audio, sr,
                                     input_repr="mel256", content_type="music",
                                     embedding_size=6144, center=True,
                                     hop_size=0.1, verbose=1)
    assert emb5.shape[1] == 6144
    assert not np.any(np.isnan(emb5))

    # Check for centering
    audio, sr = sf.read(CHIRP_1S_PATH)
    emb6, ts6 = openl3.get_embedding(audio, sr,
                                     input_repr="mel256", content_type="music",
                                     embedding_size=6144, center=True,
                                     hop_size=hop_size, verbose=1)
    # With centering the signal is padded by half a window (0.5 * sr samples).
    n_frames = 1 + int((audio.shape[0] + 0.5 * sr - sr) / int(hop_size * sr))
    assert emb6.shape[0] == n_frames

    emb7, ts7 = openl3.get_embedding(audio, sr,
                                     input_repr="mel256", content_type="music",
                                     embedding_size=6144, center=False,
                                     hop_size=hop_size, verbose=1)
    n_frames = 1 + int((audio.shape[0] - sr) / int(hop_size * sr))
    assert emb7.shape[0] == n_frames

    # Check for hop size
    hop_size = 0.2
    emb8, ts8 = openl3.get_embedding(audio, sr,
                                     input_repr="mel256", content_type="music",
                                     embedding_size=6144, center=True,
                                     hop_size=hop_size, verbose=1)
    n_frames = 1 + int((audio.shape[0] + 0.5 * sr - sr) / int(hop_size * sr))
    assert emb8.shape[0] == n_frames

    # Make sure changing verbosity doesn't break
    openl3.get_embedding(audio, sr,
                         input_repr="mel256", content_type="music",
                         embedding_size=6144, center=True,
                         hop_size=hop_size, verbose=0)

    # Make sure invalid arguments don't work
    pytest.raises(OpenL3Error, openl3.get_embedding, audio, sr,
                  input_repr="invalid", content_type="music",
                  embedding_size=6144, center=True, hop_size=0.1, verbose=1)
    pytest.raises(OpenL3Error, openl3.get_embedding, audio, sr,
                  input_repr="mel256", content_type="invalid",
                  embedding_size=6144, center=True, hop_size=0.1, verbose=1)
    # NOTE(review): the original used content_type="invalid" here too, which
    # duplicated the previous case and never exercised the invalid
    # embedding-size path with a valid content type — fixed to "music".
    pytest.raises(OpenL3Error, openl3.get_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=42, center=True, hop_size=0.1, verbose=1)
    pytest.raises(OpenL3Error, openl3.get_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size="invalid", center=True, hop_size=0.1,
                  verbose=1)
    pytest.raises(OpenL3Error, openl3.get_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=6144, center=True, hop_size=0, verbose=1)
    pytest.raises(OpenL3Error, openl3.get_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=6144, center=True, hop_size=-1, verbose=1)
    pytest.raises(OpenL3Error, openl3.get_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=6144, center=True, hop_size=0.1, verbose=-1)
    pytest.raises(OpenL3Error, openl3.get_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=6144, center='invalid', hop_size=0.1,
                  verbose=1)
    pytest.raises(OpenL3Error, openl3.get_embedding, np.ones((10, 10, 10)), sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=6144, center=True, hop_size=0.1, verbose=1)